diff --git a/3rdparty/clapack/include/lapack.h b/3rdparty/clapack/include/lapack.h
index 3f89d223f1..5f38a997c6 100644
--- a/3rdparty/clapack/include/lapack.h
+++ b/3rdparty/clapack/include/lapack.h
@@ -86,6 +86,11 @@ int dhseqr_(char *job, char *compz, int *n, int *ilo, int *
 
 int disnan_(double *din);
 
+// "small" is a macro defined in Windows headers: https://stackoverflow.com/a/27794577
+#ifdef small
+#undef small
+#endif
+
 int dlabad_(double *small, double *large);
 
 int dlabrd_(int *m, int *n, int *nb, double *a, int *lda,
diff --git a/modules/3d/CMakeLists.txt b/modules/3d/CMakeLists.txt
index 9bed964dd7..fa524a3194 100644
--- a/modules/3d/CMakeLists.txt
+++ b/modules/3d/CMakeLists.txt
@@ -10,3 +10,7 @@ ocv_define_module(3d opencv_imgproc opencv_features2d opencv_flann ${debug_modul
     WRAP java objc python js
 )
 ocv_target_link_libraries(${the_module} ${LAPACK_LIBRARIES})
+
+if(NOT HAVE_EIGEN)
+  message(STATUS "3d: Eigen support is disabled. Eigen is Required for Posegraph optimization")
+endif()
diff --git a/modules/3d/doc/3d.bib b/modules/3d/doc/3d.bib
index 86f2277b16..34200f2f65 100644
--- a/modules/3d/doc/3d.bib
+++ b/modules/3d/doc/3d.bib
@@ -60,3 +60,22 @@
   url={https://elib.dlr.de/71888/1/strobl_2011iccv.pdf},
   doi={10.1109/ICCVW.2011.6130369}
 }
+
+@inproceedings{kinectfusion,
+author = {Izadi, Shahram and Kim, David and Hilliges, Otmar and Molyneaux, David and Newcombe, Richard and Kohli, Pushmeet and Shotton, Jamie and Hodges, Steve and Freeman, Dustin and Davison, Andrew and Fitzgibbon, Andrew},
+title = {KinectFusion: Real-time 3D Reconstruction and Interaction Using a Moving Depth Camera},
+booktitle = {},
+year = {2011},
+month = {October},
+abstract = {
+KinectFusion enables a user holding and moving a standard Kinect camera to rapidly create detailed 3D reconstructions of an indoor scene. Only the depth data from Kinect is used to track the 3D pose of the sensor and reconstruct, geometrically precise, 3D models of the physical scene in real-time. The capabilities of KinectFusion, as well as the novel GPU-based pipeline are described in full. We show uses of the core system for low-cost handheld scanning, and geometry-aware augmented reality and physics-based interactions. Novel extensions to the core GPU pipeline demonstrate object segmentation and user interaction directly in front of the sensor, without degrading camera tracking or reconstruction. These extensions are used to enable real-time multi-touch interactions anywhere, allowing any planar or non-planar reconstructed physical surface to be appropriated for touch.
+},
+publisher = {ACM},
+url = {https://www.microsoft.com/en-us/research/publication/kinectfusion-real-time-3d-reconstruction-and-interaction-using-a-moving-depth-camera/},
+address = {},
+pages = {559-568},
+journal = {},
+volume = {},
+chapter = {},
+isbn = {978-1-4503-0716-1},
+}
\ No newline at end of file
diff --git a/modules/3d/include/opencv2/3d.hpp b/modules/3d/include/opencv2/3d.hpp
index a2a28cddae..6750139e37 100644
--- a/modules/3d/include/opencv2/3d.hpp
+++ b/modules/3d/include/opencv2/3d.hpp
@@ -8,6 +8,9 @@
 #include "opencv2/core.hpp"
 #include "opencv2/core/types_c.h"
 
+#include "opencv2/3d/depth.hpp"
+#include "opencv2/3d/volume.hpp"
+
 /**
   @defgroup _3d 3D vision functionality
 
diff --git a/modules/3d/include/opencv2/3d/depth.hpp b/modules/3d/include/opencv2/3d/depth.hpp
new file mode 100644
index 0000000000..32543cc020
--- /dev/null
+++ b/modules/3d/include/opencv2/3d/depth.hpp
@@ -0,0 +1,503 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_DEPTH_HPP
+#define OPENCV_3D_DEPTH_HPP
+
+#include <opencv2/core.hpp>
+#include <limits>
+
+namespace cv
+{
+//! @addtogroup rgbd
+//! @{
+
+/** Object that can compute the normals in an image.
+ * It is an object as it can cache data for speed efficiency
+ * The implemented methods are either:
+ * - FALS (the fastest) and SRI from
+ * ``Fast and Accurate Computation of Surface Normals from Range Images``
+ * by H. Badino, D. Huber, Y. Park and T. Kanade
+ * - the normals with bilateral filtering on a depth image from
+ * ``Gradient Response Maps for Real-Time Detection of Texture-Less Objects``
+ * by S. Hinterstoisser, C. Cagniart, S. Ilic, P. Sturm, N. Navab, P. Fua, and V. Lepetit
+ */
+class CV_EXPORTS_W RgbdNormals
+{
+public:
+    enum RgbdNormalsMethod
+    {
+      RGBD_NORMALS_METHOD_FALS = 0,
+      RGBD_NORMALS_METHOD_LINEMOD = 1,
+      RGBD_NORMALS_METHOD_SRI = 2
+    };
+
+    RgbdNormals() { }
+    virtual ~RgbdNormals() { }
+
+    /** Creates new RgbdNormals object
+     * @param rows the number of rows of the depth image normals will be computed on
+     * @param cols the number of cols of the depth image normals will be computed on
+     * @param depth the depth of the normals (only CV_32F or CV_64F)
+     * @param K the calibration matrix to use
+     * @param window_size the window size to compute the normals: can only be 1,3,5 or 7
+     * @param method one of the methods to use: RGBD_NORMALS_METHOD_SRI, RGBD_NORMALS_METHOD_FALS
+     */
+    CV_WRAP static Ptr<RgbdNormals> create(int rows = 0, int cols = 0, int depth = 0, InputArray K = noArray(), int window_size = 5,
+                                           RgbdNormals::RgbdNormalsMethod method = RgbdNormals::RgbdNormalsMethod::RGBD_NORMALS_METHOD_FALS);
+
+    /** Given a set of 3d points in a depth image, compute the normals at each point.
+     * @param points a rows x cols x 3 matrix of CV_32F/CV64F or a rows x cols x 1 CV_U16S
+     * @param normals a rows x cols x 3 matrix
+     */
+    CV_WRAP virtual void apply(InputArray points, OutputArray normals) const = 0;
+
+    /** Prepares cached data required for calculation
+    * If not called by user, called automatically at first calculation
+    */
+    CV_WRAP virtual void cache() const = 0;
+
+    CV_WRAP virtual int getRows() const = 0;
+    CV_WRAP virtual void setRows(int val) = 0;
+    CV_WRAP virtual int getCols() const = 0;
+    CV_WRAP virtual void setCols(int val) = 0;
+    CV_WRAP virtual int getWindowSize() const = 0;
+    CV_WRAP virtual void setWindowSize(int val) = 0;
+    CV_WRAP virtual int getDepth() const = 0;
+    CV_WRAP virtual void getK(OutputArray val) const = 0;
+    CV_WRAP virtual void setK(InputArray val) = 0;
+    CV_WRAP virtual RgbdNormals::RgbdNormalsMethod getMethod() const = 0;
+    CV_WRAP virtual void setMethod(RgbdNormals::RgbdNormalsMethod val) = 0;
+};
+
+
+/** Registers depth data to an external camera
+ * Registration is performed by creating a depth cloud, transforming the cloud by
+ * the rigid body transformation between the cameras, and then projecting the
+ * transformed points into the RGB camera.
+ *
+ * uv_rgb = K_rgb * [R | t] * z * inv(K_ir) * uv_ir
+ *
+ * Currently does not check for negative depth values.
+ *
+ * @param unregisteredCameraMatrix the camera matrix of the depth camera
+ * @param registeredCameraMatrix the camera matrix of the external camera
+ * @param registeredDistCoeffs the distortion coefficients of the external camera
+ * @param Rt the rigid body transform between the cameras. Transforms points from depth camera frame to external camera frame.
+ * @param unregisteredDepth the input depth data
+ * @param outputImagePlaneSize the image plane dimensions of the external camera (width, height)
+ * @param registeredDepth the result of transforming the depth into the external camera
+ * @param depthDilation whether or not the depth is dilated to avoid holes and occlusion errors (optional)
+ */
+CV_EXPORTS_W void registerDepth(InputArray unregisteredCameraMatrix, InputArray registeredCameraMatrix, InputArray registeredDistCoeffs,
+                                InputArray Rt, InputArray unregisteredDepth, const Size& outputImagePlaneSize,
+                                OutputArray registeredDepth, bool depthDilation=false);
+
+/**
+ * @param depth the depth image
+ * @param in_K
+ * @param in_points the list of xy coordinates
+ * @param points3d the resulting 3d points
+ */
+CV_EXPORTS_W void depthTo3dSparse(InputArray depth, InputArray in_K, InputArray in_points, OutputArray points3d);
+
+/** Converts a depth image to an organized set of 3d points.
+ * The coordinate system is x pointing left, y down and z away from the camera
+ * @param depth the depth image (if given as short int CV_U, it is assumed to be the depth in millimeters
+ *              (as done with the Microsoft Kinect), otherwise, if given as CV_32F or CV_64F, it is assumed in meters)
+ * @param K The calibration matrix
+ * @param points3d the resulting 3d points. They are of depth the same as `depth` if it is CV_32F or CV_64F, and the
+ *        depth of `K` if `depth` is of depth CV_U
+ * @param mask the mask of the points to consider (can be empty)
+ */
+CV_EXPORTS_W void depthTo3d(InputArray depth, InputArray K, OutputArray points3d, InputArray mask = noArray());
+
+/** If the input image is of type CV_16UC1 (like the Kinect one), the image is converted to floats, divided
+ * by depth_factor to get a depth in meters, and the values 0 are converted to std::numeric_limits<float>::quiet_NaN()
+ * Otherwise, the image is simply converted to floats
+ * @param in the depth image (if given as short int CV_U, it is assumed to be the depth in millimeters
+ *              (as done with the Microsoft Kinect), it is assumed in meters)
+ * @param type the desired output depth (CV_32F or CV_64F)
+ * @param out The rescaled float depth image
+ * @param depth_factor (optional) factor by which depth is converted to distance (by default = 1000.0 for Kinect sensor)
+ */
+CV_EXPORTS_W void rescaleDepth(InputArray in, int type, OutputArray out, double depth_factor = 1000.0);
+
+/** Warp the image: compute 3d points from the depth, transform them using given transformation,
+ * then project color point cloud to an image plane.
+ * This function can be used to visualize results of the Odometry algorithm.
+ * @param image The image (of CV_8UC1 or CV_8UC3 type)
+ * @param depth The depth (of type used in depthTo3d fuction)
+ * @param mask The mask of used pixels (of CV_8UC1), it can be empty
+ * @param Rt The transformation that will be applied to the 3d points computed from the depth
+ * @param cameraMatrix Camera matrix
+ * @param distCoeff Distortion coefficients
+ * @param warpedImage The warped image.
+ * @param warpedDepth The warped depth.
+ * @param warpedMask The warped mask.
+ */
+CV_EXPORTS_W void warpFrame(InputArray image, InputArray depth, InputArray mask, InputArray Rt, InputArray cameraMatrix, InputArray distCoeff,
+                            OutputArray warpedImage, OutputArray warpedDepth = noArray(), OutputArray warpedMask = noArray());
+
+enum RgbdPlaneMethod
+{
+    RGBD_PLANE_METHOD_DEFAULT
+};
+
+/** Find the planes in a depth image
+ * @param points3d the 3d points organized like the depth image: rows x cols with 3 channels
+ * @param normals the normals for every point in the depth image
+ * @param mask An image where each pixel is labeled with the plane it belongs to
+ *        and 255 if it does not belong to any plane
+ * @param plane_coefficients the coefficients of the corresponding planes (a,b,c,d) such that ax+by+cz+d=0, norm(a,b,c)=1
+ *        and c < 0 (so that the normal points towards the camera)
+ * @param block_size The size of the blocks to look at for a stable MSE
+ * @param min_size The minimum size of a cluster to be considered a plane
+ * @param threshold The maximum distance of a point from a plane to belong to it (in meters)
+ * @param sensor_error_a coefficient of the sensor error. 0 by default, use 0.0075 for a Kinect
+ * @param sensor_error_b coefficient of the sensor error. 0 by default
+ * @param sensor_error_c coefficient of the sensor error. 0 by default
+ * @param method The method to use to compute the planes.
+ */
+CV_EXPORTS_W void findPlanes(InputArray points3d, InputArray normals, OutputArray mask, OutputArray plane_coefficients,
+                             int block_size = 40, int min_size = 40*40, double threshold = 0.01,
+                             double sensor_error_a = 0, double sensor_error_b = 0,
+                             double sensor_error_c = 0,
+                             RgbdPlaneMethod method = RGBD_PLANE_METHOD_DEFAULT);
+
+/** Object that contains a frame data that is possibly needed for the Odometry.
+ * It's used for the efficiency (to pass precomputed/cached data of the frame that participates
+ * in the Odometry processing several times).
+ */
+struct CV_EXPORTS_W OdometryFrame
+{
+public:
+    /** These constants are used to set a type of cache which has to be prepared depending on the frame role:
+     * srcFrame or dstFrame (see compute method of the Odometry class). For the srcFrame and dstFrame different cache data may be required,
+     * some part of a cache may be common for both frame roles.
+     * @param CACHE_SRC The cache data for the srcFrame will be prepared.
+     * @param CACHE_DST The cache data for the dstFrame will be prepared.
+     * @param CACHE_ALL The cache data for both srcFrame and dstFrame roles will be computed.
+     * @param CACHE_DEPTH The frame will be generated from depth image
+     * @param CACHE_PTS The frame will be built from point cloud
+     */
+    enum OdometryFrameCacheType
+    {
+      CACHE_SRC = 1, CACHE_DST = 2, CACHE_ALL = CACHE_SRC + CACHE_DST
+    };
+
+    /** Indicates what pyramid is to access using get/setPyramid... methods:
+    * @param PYR_IMAGE The pyramid of RGB images
+    * @param PYR_DEPTH The pyramid of depth images
+    * @param PYR_MASK  The pyramid of masks
+    * @param PYR_CLOUD The pyramid of point clouds, produced from the pyramid of depths
+    * @param PYR_DIX   The pyramid of dI/dx derivative images
+    * @param PYR_DIY   The pyramid of dI/dy derivative images
+    * @param PYR_TEXMASK The pyramid of textured masks
+    * @param PYR_NORM  The pyramid of normals
+    * @param PYR_NORMMASK The pyramid of normals masks
+    */
+    enum OdometryFramePyramidType
+    {
+        PYR_IMAGE = 0, PYR_DEPTH = 1, PYR_MASK = 2, PYR_CLOUD = 3, PYR_DIX = 4, PYR_DIY = 5, PYR_TEXMASK = 6, PYR_NORM = 7, PYR_NORMMASK = 8,
+        N_PYRAMIDS
+    };
+
+    OdometryFrame() : ID(-1) { }
+    virtual ~OdometryFrame() { }
+
+    CV_WRAP static Ptr<OdometryFrame> create(InputArray image = noArray(), InputArray depth = noArray(),
+                                             InputArray  mask = noArray(), InputArray normals = noArray(), int ID = -1);
+
+    CV_WRAP virtual void setImage(InputArray  image) = 0;
+    CV_WRAP virtual void getImage(OutputArray image) = 0;
+    CV_WRAP virtual void setDepth(InputArray  depth) = 0;
+    CV_WRAP virtual void getDepth(OutputArray depth) = 0;
+    CV_WRAP virtual void setMask(InputArray  mask) = 0;
+    CV_WRAP virtual void getMask(OutputArray mask) = 0;
+    CV_WRAP virtual void setNormals(InputArray  normals) = 0;
+    CV_WRAP virtual void getNormals(OutputArray normals) = 0;
+
+    CV_WRAP virtual void setPyramidLevels(size_t nLevels) = 0;
+    CV_WRAP virtual size_t getPyramidLevels(OdometryFrame::OdometryFramePyramidType pyrType) = 0;
+
+    CV_WRAP virtual void setPyramidAt(InputArray  pyrImage, OdometryFrame::OdometryFramePyramidType pyrType, size_t level) = 0;
+    CV_WRAP virtual void getPyramidAt(OutputArray pyrImage, OdometryFrame::OdometryFramePyramidType pyrType, size_t level) = 0;
+
+    CV_PROP int ID;
+};
+
+
+/** Base class for computation of odometry.
+ */
+class CV_EXPORTS_W Odometry
+{
+public:
+
+    /** A class of transformation*/
+    enum OdometryTransformType
+    {
+      ROTATION = 1, TRANSLATION = 2, RIGID_BODY_MOTION = 4
+    };
+
+    virtual ~Odometry() { }
+
+    /** Create a new Odometry object based on its name. Currently supported names are:
+    * "RgbdOdometry", "ICPOdometry", "RgbdICPOdometry", "FastICPOdometry".
+    * @param odometryType algorithm's name
+    */
+    CV_WRAP static Ptr<Odometry> createFromName(const std::string& odometryType);
+
+    /** Method to compute a transformation from the source frame to the destination one.
+     * Some odometry algorithms do not used some data of frames (eg. ICP does not use images).
+     * In such case corresponding arguments can be set as empty Mat.
+     * The method returns true if all internal computations were possible (e.g. there were enough correspondences,
+     * system of equations has a solution, etc) and resulting transformation satisfies some test if it's provided
+     * by the Odometry inheritor implementation (e.g. thresholds for maximum translation and rotation).
+     * @param srcImage Image data of the source frame (CV_8UC1)
+     * @param srcDepth Depth data of the source frame (CV_32FC1, in meters)
+     * @param srcMask Mask that sets which pixels have to be used from the source frame (CV_8UC1)
+     * @param dstImage Image data of the destination frame (CV_8UC1)
+     * @param dstDepth Depth data of the destination frame (CV_32FC1, in meters)
+     * @param dstMask Mask that sets which pixels have to be used from the destination frame (CV_8UC1)
+     * @param Rt Resulting transformation from the source frame to the destination one (rigid body motion):
+     dst_p = Rt * src_p, where dst_p is a homogeneous point in the destination frame and src_p is
+     homogeneous point in the source frame,
+     Rt is 4x4 matrix of CV_64FC1 type.
+     * @param initRt Initial transformation from the source frame to the destination one (optional)
+     */
+    CV_WRAP virtual bool compute(InputArray srcImage, InputArray srcDepth, InputArray srcMask, InputArray dstImage, InputArray dstDepth,
+                                 InputArray dstMask, OutputArray Rt, InputArray initRt = noArray()) const = 0;
+
+    /** One more method to compute a transformation from the source frame to the destination one.
+     * It is designed to save on computing the frame data (image pyramids, normals, etc.).
+     */
+    CV_WRAP_AS(compute2) virtual bool compute(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame,
+                                              OutputArray Rt, InputArray initRt = noArray()) const = 0;
+
+    /** Prepare a cache for the frame. The function checks the precomputed/passed data (throws the error if this data
+     * does not satisfy) and computes all remaining cache data needed for the frame. Returned size is a resolution
+     * of the prepared frame.
+     * @param frame The odometry which will process the frame.
+     * @param cacheType The cache type: CACHE_SRC, CACHE_DST or CACHE_ALL.
+     */
+    CV_WRAP virtual Size prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const = 0;
+
+    /** Create odometry frame for current Odometry implementation
+     * @param image Image data of the frame (CV_8UC1)
+     * @param depth Depth data of the frame (CV_32FC1, in meters)
+     * @param mask  Mask that sets which pixels have to be used from the frame (CV_8UC1)
+    */
+    CV_WRAP virtual Ptr<OdometryFrame> makeOdometryFrame(InputArray image, InputArray depth, InputArray mask) const = 0;
+
+    CV_WRAP virtual void getCameraMatrix(OutputArray val) const = 0;
+    CV_WRAP virtual void setCameraMatrix(InputArray val) = 0;
+    CV_WRAP virtual Odometry::OdometryTransformType getTransformType() const = 0;
+    CV_WRAP virtual void setTransformType(Odometry::OdometryTransformType val) = 0;
+    CV_WRAP virtual void getIterationCounts(OutputArray val) const = 0;
+    CV_WRAP virtual void setIterationCounts(InputArray val) = 0;
+    /** For each pyramid level the pixels will be filtered out if they have gradient magnitude less than minGradientMagnitudes[level].
+    * Makes sense for RGB-based algorithms only.
+    */
+    CV_WRAP virtual void getMinGradientMagnitudes(OutputArray val) const = 0;
+    CV_WRAP virtual void setMinGradientMagnitudes(InputArray val) = 0;
+
+    /** Get max allowed translation in meters, default is 0.15 meters
+    Found delta transform is considered successful only if the translation is in given limits. */
+    CV_WRAP virtual double getMaxTranslation() const = 0;
+    /** Set max allowed translation in meters.
+    * Found delta transform is considered successful only if the translation is in given limits. */
+    CV_WRAP virtual void setMaxTranslation(double val) = 0;
+    /** Get max allowed rotation in degrees, default is 15 degrees.
+    * Found delta transform is considered successful only if the rotation is in given limits. */
+    CV_WRAP virtual double getMaxRotation() const = 0;
+    /** Set max allowed rotation in degrees.
+    * Found delta transform is considered successful only if the rotation is in given limits. */
+    CV_WRAP virtual void setMaxRotation(double val) = 0;
+};
+
+/** Odometry based on the paper "Real-Time Visual Odometry from Dense RGB-D Images",
+ * F. Steinbucker, J. Strum, D. Cremers, ICCV, 2011.
+ */
+class CV_EXPORTS_W RgbdOdometry: public Odometry
+{
+public:
+    RgbdOdometry() { }
+
+    /** Creates RgbdOdometry object
+     * @param cameraMatrix Camera matrix
+     * @param minDepth Pixels with depth less than minDepth will not be used (in meters)
+     * @param maxDepth Pixels with depth larger than maxDepth will not be used (in meters)
+     * @param maxDepthDiff Correspondences between pixels of two given frames will be filtered out
+     *                     if their depth difference is larger than maxDepthDiff (in meters)
+     * @param iterCounts Count of iterations on each pyramid level, defaults are [7, 7, 7, 10]
+     * @param minGradientMagnitudes For each pyramid level the pixels will be filtered out, default is 10 per each level
+     *                              if they have gradient magnitude less than minGradientMagnitudes[level].
+     * @param maxPointsPart The method uses a random pixels subset of size frameWidth x frameHeight x pointsPart (from 0 to 1)
+     * @param transformType Class of transformation
+     */
+    CV_WRAP static Ptr<RgbdOdometry> create(InputArray cameraMatrix = noArray(),
+                                            float minDepth = 0.f,
+                                            float maxDepth = 4.f,
+                                            float maxDepthDiff = 0.07f,
+                                            InputArray iterCounts = noArray(),
+                                            InputArray minGradientMagnitudes = noArray(),
+                                            float maxPointsPart = 0.07f,
+                                            Odometry::OdometryTransformType transformType = Odometry::RIGID_BODY_MOTION);
+
+    CV_WRAP virtual double getMinDepth() const = 0;
+    CV_WRAP virtual void   setMinDepth(double val) = 0;
+    CV_WRAP virtual double getMaxDepth() const = 0;
+    CV_WRAP virtual void   setMaxDepth(double val) = 0;
+    CV_WRAP virtual double getMaxDepthDiff() const = 0;
+    CV_WRAP virtual void   setMaxDepthDiff(double val) = 0;
+    CV_WRAP virtual double getMaxPointsPart() const = 0;
+    CV_WRAP virtual void   setMaxPointsPart(double val) = 0;
+
+};
+
+/** Odometry based on the paper "KinectFusion: Real-Time Dense Surface Mapping and Tracking",
+ * Richard A. Newcombe, Andrew Fitzgibbon, at al, SIGGRAPH, 2011.
+ */
+class CV_EXPORTS_W ICPOdometry: public Odometry
+{
+public:
+    ICPOdometry() { }
+
+    /** Creates new ICPOdometry object
+     * @param cameraMatrix Camera matrix
+     * @param minDepth Pixels with depth less than minDepth will not be used (in meters)
+     * @param maxDepth Pixels with depth larger than maxDepth will not be used (in meters)
+     * @param maxDepthDiff Correspondences between pixels of two given frames will be filtered out
+     *                     if their depth difference is larger than maxDepthDiff (in meters)
+     * @param maxPointsPart The method uses a random pixels subset of size frameWidth x frameHeight x pointsPart (from 0 to 1)
+     * @param iterCounts Count of iterations on each pyramid level, defaults are [7, 7, 7, 10]
+     * @param transformType Class of trasformation
+     */
+    CV_WRAP static Ptr<ICPOdometry> create(InputArray cameraMatrix = noArray(),
+                                           float minDepth = 0.f,
+                                           float maxDepth = 4.f,
+                                           float maxDepthDiff = 0.07f,
+                                           float maxPointsPart = 0.07f,
+                                           InputArray iterCounts = noArray(),
+                                           Odometry::OdometryTransformType transformType = Odometry::RIGID_BODY_MOTION);
+
+    CV_WRAP virtual double getMinDepth() const = 0;
+    CV_WRAP virtual void   setMinDepth(double val) = 0;
+    CV_WRAP virtual double getMaxDepth() const = 0;
+    CV_WRAP virtual void   setMaxDepth(double val) = 0;
+    CV_WRAP virtual double getMaxDepthDiff() const = 0;
+    CV_WRAP virtual void   setMaxDepthDiff(double val) = 0;
+    CV_WRAP virtual double getMaxPointsPart() const = 0;
+    CV_WRAP virtual void   setMaxPointsPart(double val) = 0;
+    CV_WRAP virtual Ptr<RgbdNormals> getNormalsComputer() const = 0;
+};
+
+/** Odometry that merges RgbdOdometry and ICPOdometry by minimize sum of their energy functions.
+ */
+class CV_EXPORTS_W RgbdICPOdometry: public Odometry
+{
+public:
+    RgbdICPOdometry() { }
+
+    /** Creates RgbdICPOdometry object
+     * @param cameraMatrix Camera matrix
+     * @param minDepth Pixels with depth less than minDepth will not be used (in meters)
+     * @param maxDepth Pixels with depth larger than maxDepth will not be used (in meters)
+     * @param maxDepthDiff Correspondences between pixels of two given frames will be filtered out
+     *                     if their depth difference is larger than maxDepthDiff (in meters)
+     * @param maxPointsPart The method uses a random pixels subset of size frameWidth x frameHeight x pointsPart (from 0 to 1)
+     * @param iterCounts Count of iterations on each pyramid level, defaults are [7, 7, 7, 10]
+     * @param minGradientMagnitudes For each pyramid level the pixels will be filtered out
+     *                              if they have gradient magnitude less than minGradientMagnitudes[level], default is 10 per each level
+     * @param transformType Class of trasformation
+     */
+    CV_WRAP static Ptr<RgbdICPOdometry> create(InputArray cameraMatrix = noArray(),
+                                               float minDepth = 0.f,
+                                               float maxDepth = 4.f,
+                                               float maxDepthDiff = 0.07f,
+                                               float maxPointsPart = 0.07f,
+                                               InputArray iterCounts = noArray(),
+                                               InputArray minGradientMagnitudes = noArray(),
+                                               Odometry::OdometryTransformType transformType = Odometry::RIGID_BODY_MOTION);
+
+    CV_WRAP virtual double getMinDepth() const = 0;
+    CV_WRAP virtual void   setMinDepth(double val) = 0;
+    CV_WRAP virtual double getMaxDepth() const = 0;
+    CV_WRAP virtual void   setMaxDepth(double val) = 0;
+    CV_WRAP virtual double getMaxDepthDiff() const = 0;
+    CV_WRAP virtual void   setMaxDepthDiff(double val) = 0;
+    CV_WRAP virtual double getMaxPointsPart() const = 0;
+    CV_WRAP virtual void   setMaxPointsPart(double val) = 0;
+
+    CV_WRAP virtual Ptr<RgbdNormals> getNormalsComputer() const = 0;
+};
+
+/** A faster version of ICPOdometry which is used in KinectFusion implementation
+ * Partial list of differences:
+ * - UMats are processed using OpenCL
+ * - CPU version is written in universal intrinsics
+ * - Filters points by angle
+ * - Interpolates points and normals
+ * - Doesn't use masks or min/max depth filtering
+ * - Doesn't use random subsets of points
+ * - Supports only Rt transform type
+ * - Initial transform is not supported and always ignored
+ * - Supports only 4-float vectors as input type
+ */
+class CV_EXPORTS_W FastICPOdometry: public Odometry
+{
+public:
+    FastICPOdometry() { }
+
+    /** Creates FastICPOdometry object
+     * @param cameraMatrix Camera matrix
+     * @param maxDistDiff Correspondences between pixels of two given frames will be filtered out
+     *                    if their depth difference is larger than maxDepthDiff (in meters)
+     * @param angleThreshold Correspondence will be filtered out
+     *                     if an angle between their normals is bigger than threshold
+     * @param sigmaDepth Depth sigma in meters for bilateral smooth
+     * @param sigmaSpatial Spatial sigma in pixels for bilateral smooth
+     * @param kernelSize Kernel size in pixels for bilateral smooth
+     * @param iterCounts Count of iterations on each pyramid level, defaults are [7, 7, 7, 10]
+     * @param depthFactor pre-scale per 1 meter for input values
+     * @param truncateThreshold Threshold for depth truncation in meters
+     *        All depth values beyond this threshold will be set to zero
+     */
+    CV_WRAP static Ptr<FastICPOdometry> create(InputArray cameraMatrix = noArray(),
+                                               float maxDistDiff = 0.07f,
+                                               float angleThreshold = (float)(30. * CV_PI / 180.),
+                                               float sigmaDepth = 0.04f,
+                                               float sigmaSpatial = 4.5f,
+                                               int kernelSize = 7,
+                                               InputArray iterCounts = noArray(),
+                                               float depthFactor = 1.f,
+                                               float truncateThreshold = 0.f);
+
+    CV_WRAP virtual double getMaxDistDiff() const = 0;
+    CV_WRAP virtual void   setMaxDistDiff(float val) = 0;
+    CV_WRAP virtual float  getAngleThreshold() const = 0;
+    CV_WRAP virtual void   setAngleThreshold(float f) = 0;
+    CV_WRAP virtual float  getSigmaDepth() const = 0;
+    CV_WRAP virtual void   setSigmaDepth(float f) = 0;
+    CV_WRAP virtual float  getSigmaSpatial() const = 0;
+    CV_WRAP virtual void   setSigmaSpatial(float f) = 0;
+    CV_WRAP virtual int    getKernelSize() const = 0;
+    CV_WRAP virtual void   setKernelSize(int f) = 0;
+    CV_WRAP virtual float  getDepthFactor() const = 0;
+    CV_WRAP virtual void   setDepthFactor(float depthFactor) = 0;
+    CV_WRAP virtual float  getTruncateThreshold() const = 0;
+    CV_WRAP virtual void   setTruncateThreshold(float truncateThreshold) = 0;
+
+};
+
+// TODO Depth interpolation
+// Curvature
+// Get rescaleDepth return dubles if asked for
+
+//! @}
+
+} /* namespace cv */
+
+#endif // include guard
diff --git a/modules/3d/include/opencv2/3d/detail/kinfu_frame.hpp b/modules/3d/include/opencv2/3d/detail/kinfu_frame.hpp
new file mode 100644
index 0000000000..bac9dfef54
--- /dev/null
+++ b/modules/3d/include/opencv2/3d/detail/kinfu_frame.hpp
@@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_DETAIL_KINFU_FRAME_HPP
+#define OPENCV_3D_DETAIL_KINFU_FRAME_HPP
+
+#include <opencv2/core/affine.hpp>
+
+namespace cv {
+namespace detail {
+
+CV_EXPORTS void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray image, cv::Vec3f lightLoc);
+CV_EXPORTS void renderPointsNormalsColors(InputArray _points, InputArray _normals, InputArray _colors, OutputArray image);
+
+} // namespace detail
+} // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/include/opencv2/3d/detail/pose_graph.hpp b/modules/3d/include/opencv2/3d/detail/pose_graph.hpp
new file mode 100644
index 0000000000..d2e5d42a54
--- /dev/null
+++ b/modules/3d/include/opencv2/3d/detail/pose_graph.hpp
@@ -0,0 +1,60 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_DETAIL_POSE_GRAPH_HPP
+#define OPENCV_3D_DETAIL_POSE_GRAPH_HPP
+
+#include "opencv2/core/affine.hpp"
+#include "opencv2/core/quaternion.hpp"
+
+namespace cv
+{
+namespace detail
+{
+
+// ATTENTION! This class is used internally in Large KinFu.
+// It has been pushed to publicly available headers for tests only.
+// Source compatibility of this API is not guaranteed in the future.
+
+// This class provides tools to solve so-called pose graph problem often arisen in SLAM problems
+// The pose graph format, cost function and optimization techniques
+// repeat the ones used in Ceres 3D Pose Graph Optimization:
+// http://ceres-solver.org/nnls_tutorial.html#other-examples, pose_graph_3d.cc bullet
+class CV_EXPORTS_W PoseGraph
+{
+public:
+    static Ptr<PoseGraph> create();
+    virtual ~PoseGraph();
+
+    // Node may have any id >= 0
+    virtual void addNode(size_t _nodeId, const Affine3d& _pose, bool fixed) = 0;
+    virtual bool isNodeExist(size_t nodeId) const = 0;
+    virtual bool setNodeFixed(size_t nodeId, bool fixed) = 0;
+    virtual bool isNodeFixed(size_t nodeId) const = 0;
+    virtual Affine3d getNodePose(size_t nodeId) const = 0;
+    virtual std::vector<size_t> getNodesIds() const = 0;
+    virtual size_t getNumNodes() const = 0;
+
+    // Edges have consequent indices starting from 0
+    virtual void addEdge(size_t _sourceNodeId, size_t _targetNodeId, const Affine3f& _transformation,
+                         const Matx66f& _information = Matx66f::eye()) = 0;
+    virtual size_t getEdgeStart(size_t i) const = 0;
+    virtual size_t getEdgeEnd(size_t i) const = 0;
+    virtual size_t getNumEdges() const = 0;
+
+    // checks if graph is connected and each edge connects exactly 2 nodes
+    virtual bool isValid() const = 0;
+
+    // Termination criteria are max number of iterations and min relative energy change to current energy
+    // Returns number of iterations elapsed or -1 if max number of iterations was reached or failed to optimize
+    virtual int optimize(const cv::TermCriteria& tc = cv::TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 100, 1e-6)) = 0;
+
+    // calculate cost function based on current nodes parameters
+    virtual double calcEnergy() const = 0;
+};
+
+}  // namespace detail
+}  // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/include/opencv2/3d/detail/submap.hpp b/modules/3d/include/opencv2/3d/detail/submap.hpp
new file mode 100644
index 0000000000..5d54adebfd
--- /dev/null
+++ b/modules/3d/include/opencv2/3d/detail/submap.hpp
@@ -0,0 +1,515 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_DETAIL_SUBMAP_HPP
+#define OPENCV_3D_DETAIL_SUBMAP_HPP
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/affine.hpp>
+#include "opencv2/3d/detail/pose_graph.hpp"
+
+#include <type_traits>
+#include <vector>
+#include <map>
+#include <unordered_map>
+
+
+namespace cv
+{
+namespace detail
+{
+template<typename MatType>
+class Submap
+{
+public:
+    struct PoseConstraint
+    {
+        Affine3f estimatedPose;
+        int weight;
+
+        PoseConstraint() : weight(0){};
+
+        void accumulatePose(const Affine3f& _pose, int _weight = 1)
+        {
+            Matx44f accPose = estimatedPose.matrix * weight + _pose.matrix * _weight;
+            weight         += _weight;
+            accPose        /= float(weight);
+            estimatedPose   = Affine3f(accPose);
+        }
+    };
+    typedef std::map<int, PoseConstraint> Constraints;
+
+    Submap(int _id, const VolumeParams& volumeParams, const cv::Affine3f& _pose = cv::Affine3f::Identity(),
+           int _startFrameId = 0)
+        : id(_id), pose(_pose), cameraPose(Affine3f::Identity()), startFrameId(_startFrameId)
+    {
+        VolumeParams vp = volumeParams;
+        vp.kind = VolumeParams::VolumeKind::HASHTSDF;
+        Ptr<VolumeParams> pvp = makePtr<VolumeParams>(vp);
+        volume = makeVolume(pvp);
+    }
+    virtual ~Submap() = default;
+
+    virtual void integrate(InputArray _depth, float depthFactor, const cv::Matx33f& intrinsics, const int currframeId);
+    virtual void raycast(const cv::Affine3f& cameraPose, const cv::Matx33f& intrinsics, cv::Size frameSize,
+                         OutputArray points = noArray(), OutputArray normals = noArray());
+
+    virtual int getTotalAllocatedBlocks() const { return int(volume->getTotalVolumeUnits()); };
+    virtual int getVisibleBlocks(int currFrameId) const
+    {
+        return volume->getVisibleBlocks(currFrameId, FRAME_VISIBILITY_THRESHOLD);
+    }
+
+    float calcVisibilityRatio(int currFrameId) const
+    {
+        int allocate_blocks = getTotalAllocatedBlocks();
+        int visible_blocks  = getVisibleBlocks(currFrameId);
+        return float(visible_blocks) / float(allocate_blocks);
+    }
+
+    //! TODO: Possibly useless
+    virtual void setStartFrameId(int _startFrameId) { startFrameId = _startFrameId; };
+    virtual void setStopFrameId(int _stopFrameId) { stopFrameId = _stopFrameId; };
+
+    void composeCameraPose(const cv::Affine3f& _relativePose) { cameraPose = cameraPose * _relativePose; }
+    PoseConstraint& getConstraint(const int _id)
+    {
+        //! Creates constraints if doesn't exist yet
+        return constraints[_id];
+    }
+
+public:
+    const int id;
+    cv::Affine3f pose;
+    cv::Affine3f cameraPose;
+    Constraints constraints;
+
+    int startFrameId;
+    int stopFrameId;
+    //! TODO: Should we support submaps for regular volumes?
+    static constexpr int FRAME_VISIBILITY_THRESHOLD = 5;
+
+    //! TODO: Add support for GPU arrays (UMat)
+    Ptr<OdometryFrame> frame;
+
+    std::shared_ptr<Volume> volume;
+};
+
+template<typename MatType>
+
+void Submap<MatType>::integrate(InputArray _depth, float depthFactor, const cv::Matx33f& intrinsics,
+                                const int currFrameId)
+{
+    CV_Assert(currFrameId >= startFrameId);
+    volume->integrate(_depth, depthFactor, cameraPose.matrix, intrinsics, currFrameId);
+}
+
+template<typename MatType>
+void Submap<MatType>::raycast(const cv::Affine3f& _cameraPose, const cv::Matx33f& intrinsics, cv::Size frameSize,
+                              OutputArray points, OutputArray normals)
+{
+    if (!points.needed() && !normals.needed())
+    {
+        MatType pts, nrm;
+        frame->getPyramidAt(pts, OdometryFrame::PYR_CLOUD, 0);
+        frame->getPyramidAt(nrm, OdometryFrame::PYR_NORM, 0);
+        volume->raycast(_cameraPose.matrix, intrinsics, frameSize, pts, nrm);
+        frame->setPyramidAt(pts, OdometryFrame::PYR_CLOUD, 0);
+        frame->setPyramidAt(nrm, OdometryFrame::PYR_NORM,  0);
+    }
+    else
+    {
+        volume->raycast(_cameraPose.matrix, intrinsics, frameSize, points, normals);
+    }
+}
+
+
+/**
+ * @brief: Manages all the created submaps for a particular scene
+ */
+template<typename MatType>
+class SubmapManager
+{
+public:
+    enum class Type
+    {
+        NEW            = 0,
+        CURRENT        = 1,
+        RELOCALISATION = 2,
+        LOOP_CLOSURE   = 3,
+        LOST           = 4
+    };
+
+    struct ActiveSubmapData
+    {
+        Type type;
+        std::vector<Affine3f> constraints;
+        int trackingAttempts;
+    };
+
+    typedef Submap<MatType> SubmapT;
+    typedef std::map<int, Ptr<SubmapT>> IdToSubmapPtr;
+    typedef std::unordered_map<int, ActiveSubmapData> IdToActiveSubmaps;
+
+    SubmapManager(const VolumeParams& _volumeParams) : volumeParams(_volumeParams) {}
+    virtual ~SubmapManager() = default;
+
+    void reset() { submapList.clear(); };
+
+    bool shouldCreateSubmap(int frameId);
+    bool shouldChangeCurrSubmap(int _frameId, int toSubmapId);
+
+    //! Adds a new submap/volume into the current list of managed/Active submaps
+    int createNewSubmap(bool isCurrentActiveMap, const int currFrameId = 0, const Affine3f& pose = cv::Affine3f::Identity());
+
+    void removeSubmap(int _id);
+    size_t numOfSubmaps(void) const { return submapList.size(); };
+    size_t numOfActiveSubmaps(void) const { return activeSubmaps.size(); };
+
+    Ptr<SubmapT> getSubmap(int _id) const;
+    Ptr<SubmapT> getCurrentSubmap(void) const;
+
+    int estimateConstraint(int fromSubmapId, int toSubmapId, int& inliers, Affine3f& inlierPose);
+    bool updateMap(int _frameId, Ptr<OdometryFrame> _frame);
+
+    Ptr<detail::PoseGraph> MapToPoseGraph();
+    void PoseGraphToMap(const Ptr<detail::PoseGraph>& updatedPoseGraph);
+
+    VolumeParams volumeParams;
+
+    std::vector<Ptr<SubmapT>> submapList;
+    IdToActiveSubmaps activeSubmaps;
+
+    Ptr<detail::PoseGraph> poseGraph;
+};
+
+template<typename MatType>
+int SubmapManager<MatType>::createNewSubmap(bool isCurrentMap, int currFrameId, const Affine3f& pose)
+{
+    int newId = int(submapList.size());
+
+    Ptr<SubmapT> newSubmap = cv::makePtr<SubmapT>(newId, volumeParams, pose, currFrameId);
+    submapList.push_back(newSubmap);
+
+    ActiveSubmapData newSubmapData;
+    newSubmapData.trackingAttempts = 0;
+    newSubmapData.type             = isCurrentMap ? Type::CURRENT : Type::NEW;
+    activeSubmaps[newId]           = newSubmapData;
+
+    return newId;
+}
+
+template<typename MatType>
+Ptr<Submap<MatType>> SubmapManager<MatType>::getSubmap(int _id) const
+{
+    CV_Assert(submapList.size() > 0);
+    CV_Assert(_id >= 0 && _id < int(submapList.size()));
+    return submapList.at(_id);
+}
+
+template<typename MatType>
+Ptr<Submap<MatType>> SubmapManager<MatType>::getCurrentSubmap(void) const
+{
+    for (const auto& it : activeSubmaps)
+    {
+        if (it.second.type == Type::CURRENT)
+            return getSubmap(it.first);
+    }
+    return nullptr;
+}
+
+template<typename MatType>
+bool SubmapManager<MatType>::shouldCreateSubmap(int currFrameId)
+{
+    int currSubmapId = -1;
+    for (const auto& it : activeSubmaps)
+    {
+        auto submapData = it.second;
+        // No more than 1 new submap at a time!
+        if (submapData.type == Type::NEW)
+        {
+            return false;
+        }
+        if (submapData.type == Type::CURRENT)
+        {
+            currSubmapId = it.first;
+        }
+    }
+    //! TODO: This shouldn't be happening? since there should always be one active current submap
+    if (currSubmapId < 0)
+    {
+        return false;
+    }
+
+    Ptr<SubmapT> currSubmap = getSubmap(currSubmapId);
+    float ratio             = currSubmap->calcVisibilityRatio(currFrameId);
+
+    if (ratio < 0.2f)
+        return true;
+    return false;
+}
+
+template<typename MatType>
+int SubmapManager<MatType>::estimateConstraint(int fromSubmapId, int toSubmapId, int& inliers, Affine3f& inlierPose)
+{
+    static constexpr int MAX_ITER                    = 10;
+    static constexpr float CONVERGE_WEIGHT_THRESHOLD = 0.01f;
+    static constexpr float INLIER_WEIGHT_THRESH      = 0.8f;
+    static constexpr int MIN_INLIERS                 = 10;
+    static constexpr int MAX_TRACKING_ATTEMPTS = 25;
+
+    //! thresh = HUBER_THRESH
+    auto huberWeight = [](float residual, float thresh = 0.1f) -> float {
+        float rAbs = abs(residual);
+        if (rAbs < thresh)
+            return 1.0;
+        float numerator = sqrt(2 * thresh * rAbs - thresh * thresh);
+        return numerator / rAbs;
+    };
+
+    Ptr<SubmapT> fromSubmap          = getSubmap(fromSubmapId);
+    Ptr<SubmapT> toSubmap            = getSubmap(toSubmapId);
+    ActiveSubmapData& fromSubmapData = activeSubmaps.at(fromSubmapId);
+
+    Affine3f TcameraToFromSubmap = fromSubmap->cameraPose;
+    Affine3f TcameraToToSubmap   = toSubmap->cameraPose;
+
+    // FromSubmap -> ToSubmap transform
+    Affine3f candidateConstraint = TcameraToToSubmap * TcameraToFromSubmap.inv();
+    fromSubmapData.trackingAttempts++;
+    fromSubmapData.constraints.push_back(candidateConstraint);
+
+    std::vector<float> weights(fromSubmapData.constraints.size() + 1, 1.0f);
+
+    Affine3f prevConstraint = fromSubmap->getConstraint(toSubmap->id).estimatedPose;
+    int prevWeight          = fromSubmap->getConstraint(toSubmap->id).weight;
+
+    // Iterative reweighted least squares with huber threshold to find the inliers in the past observations
+    Vec6f meanConstraint;
+    float sumWeight = 0.0f;
+    for (int i = 0; i < MAX_ITER; i++)
+    {
+        Vec6f constraintVec;
+        for (int j = 0; j < int(weights.size() - 1); j++)
+        {
+            Affine3f currObservation = fromSubmapData.constraints[j];
+            cv::vconcat(currObservation.rvec(), currObservation.translation(), constraintVec);
+            meanConstraint += weights[j] * constraintVec;
+            sumWeight += weights[j];
+        }
+        // Heavier weight given to the estimatedPose
+        cv::vconcat(prevConstraint.rvec(), prevConstraint.translation(), constraintVec);
+        meanConstraint += weights.back() * prevWeight * constraintVec;
+        sumWeight += prevWeight;
+        meanConstraint /= float(sumWeight);
+
+        float residual = 0.0f;
+        float diff     = 0.0f;
+        for (int j = 0; j < int(weights.size()); j++)
+        {
+            int w;
+            if (j == int(weights.size() - 1))
+            {
+                cv::vconcat(prevConstraint.rvec(), prevConstraint.translation(), constraintVec);
+                w = prevWeight;
+            }
+            else
+            {
+                Affine3f currObservation = fromSubmapData.constraints[j];
+                cv::vconcat(currObservation.rvec(), currObservation.translation(), constraintVec);
+                w = 1;
+            }
+
+            cv::Vec6f residualVec = (constraintVec - meanConstraint);
+            residual         = float(norm(residualVec));
+            float newWeight = huberWeight(residual);
+            diff += w * abs(newWeight - weights[j]);
+            weights[j] = newWeight;
+        }
+
+        if (diff / (prevWeight + weights.size() - 1) < CONVERGE_WEIGHT_THRESHOLD)
+            break;
+    }
+
+    int localInliers = 0;
+    Matx44f inlierConstraint;
+    for (int i = 0; i < int(weights.size()); i++)
+    {
+        if (weights[i] > INLIER_WEIGHT_THRESH)
+        {
+            localInliers++;
+            if (i == int(weights.size() - 1))
+                inlierConstraint += prevConstraint.matrix;
+            else
+                inlierConstraint += fromSubmapData.constraints[i].matrix;
+        }
+    }
+    inlierConstraint /= float(max(localInliers, 1));
+    inlierPose = Affine3f(inlierConstraint);
+    inliers    = localInliers;
+
+    if (inliers >= MIN_INLIERS)
+    {
+        return 1;
+    }
+    if(fromSubmapData.trackingAttempts - inliers > (MAX_TRACKING_ATTEMPTS - MIN_INLIERS))
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+template<typename MatType>
+bool SubmapManager<MatType>::shouldChangeCurrSubmap(int _frameId, int toSubmapId)
+{
+    auto toSubmap         = getSubmap(toSubmapId);
+    auto toSubmapData     = activeSubmaps.at(toSubmapId);
+    auto currActiveSubmap = getCurrentSubmap();
+
+    int blocksInNewMap = toSubmap->getTotalAllocatedBlocks();
+    float newRatio     = toSubmap->calcVisibilityRatio(_frameId);
+
+    float currRatio = currActiveSubmap->calcVisibilityRatio(_frameId);
+
+    //! TODO: Check for a specific threshold?
+    if (blocksInNewMap <= 0)
+        return false;
+    if ((newRatio > currRatio) && (toSubmapData.type == Type::NEW))
+        return true;
+
+    return false;
+}
+
+template<typename MatType>
+bool SubmapManager<MatType>::updateMap(int _frameId, Ptr<OdometryFrame> _frame)
+{
+    bool mapUpdated = false;
+    int changedCurrentMapId = -1;
+
+    const int currSubmapId  = getCurrentSubmap()->id;
+
+    for (auto& it : activeSubmaps)
+    {
+        int submapId     = it.first;
+        auto& submapData = it.second;
+        if (submapData.type == Type::NEW || submapData.type == Type::LOOP_CLOSURE)
+        {
+            // Check with previous estimate
+            int inliers;
+            Affine3f inlierPose;
+            int constraintUpdate = estimateConstraint(submapId, currSubmapId, inliers, inlierPose);
+            if (constraintUpdate == 1)
+            {
+                typename SubmapT::PoseConstraint& submapConstraint = getSubmap(submapId)->getConstraint(currSubmapId);
+                submapConstraint.accumulatePose(inlierPose, inliers);
+                submapData.constraints.clear();
+                submapData.trackingAttempts = 0;
+
+                if (shouldChangeCurrSubmap(_frameId, submapId))
+                {
+                    changedCurrentMapId = submapId;
+                }
+                mapUpdated = true;
+            }
+            else if(constraintUpdate == -1)
+            {
+                submapData.type = Type::LOST;
+            }
+        }
+    }
+
+    std::vector<int> createNewConstraintsList;
+    for (auto& it : activeSubmaps)
+    {
+        int submapId     = it.first;
+        auto& submapData = it.second;
+
+        if (submapId == changedCurrentMapId)
+        {
+            submapData.type = Type::CURRENT;
+        }
+        if ((submapData.type == Type::CURRENT) && (changedCurrentMapId >= 0) && (submapId != changedCurrentMapId))
+        {
+            submapData.type = Type::LOST;
+            createNewConstraintsList.push_back(submapId);
+        }
+        if ((submapData.type == Type::NEW || submapData.type == Type::LOOP_CLOSURE) && (changedCurrentMapId >= 0))
+        {
+            //! TODO: Add a new type called NEW_LOST?
+            submapData.type = Type::LOST;
+            createNewConstraintsList.push_back(submapId);
+        }
+    }
+
+    for (typename IdToActiveSubmaps::iterator it = activeSubmaps.begin(); it != activeSubmaps.end();)
+    {
+        auto& submapData = it->second;
+        if (submapData.type == Type::LOST)
+            it = activeSubmaps.erase(it);
+        else
+            it++;
+    }
+
+    for (std::vector<int>::const_iterator it = createNewConstraintsList.begin(); it != createNewConstraintsList.end(); ++it)
+    {
+        int dataId = *it;
+        ActiveSubmapData newSubmapData;
+        newSubmapData.trackingAttempts = 0;
+        newSubmapData.type             = Type::LOOP_CLOSURE;
+        activeSubmaps[dataId]          = newSubmapData;
+    }
+
+    if (shouldCreateSubmap(_frameId))
+    {
+        Ptr<SubmapT> currActiveSubmap = getCurrentSubmap();
+        Affine3f newSubmapPose        = currActiveSubmap->pose * currActiveSubmap->cameraPose;
+        int submapId                  = createNewSubmap(false, _frameId, newSubmapPose);
+        auto newSubmap                = getSubmap(submapId);
+        newSubmap->frame              = _frame;
+    }
+
+    return mapUpdated;
+}
+
+template<typename MatType>
+Ptr<detail::PoseGraph> SubmapManager<MatType>::MapToPoseGraph()
+{
+    Ptr<detail::PoseGraph> localPoseGraph = detail::PoseGraph::create();
+
+    for(const auto& currSubmap : submapList)
+    {
+        const typename SubmapT::Constraints& constraintList = currSubmap->constraints;
+        for(const auto& currConstraintPair : constraintList)
+        {
+            // TODO: Handle case with duplicate constraints A -> B and B -> A
+            /* Matx66f informationMatrix = Matx66f::eye() * (currConstraintPair.second.weight/10); */
+            Matx66f informationMatrix = Matx66f::eye();
+            localPoseGraph->addEdge(currSubmap->id, currConstraintPair.first, currConstraintPair.second.estimatedPose, informationMatrix);
+        }
+    }
+
+    for(const auto& currSubmap : submapList)
+    {
+        localPoseGraph->addNode(currSubmap->id, currSubmap->pose, (currSubmap->id == 0));
+    }
+
+    return localPoseGraph;
+}
+
+template <typename MatType>
+void SubmapManager<MatType>::PoseGraphToMap(const Ptr<detail::PoseGraph>& updatedPoseGraph)
+{
+    for(const auto& currSubmap : submapList)
+    {
+        Affine3d pose = updatedPoseGraph->getNodePose(currSubmap->id);
+        if(!updatedPoseGraph->isNodeFixed(currSubmap->id))
+            currSubmap->pose = pose;
+    }
+}
+
+}  // namespace detail
+}  // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/include/opencv2/3d/volume.hpp b/modules/3d/include/opencv2/3d/volume.hpp
new file mode 100644
index 0000000000..3faf66f7ea
--- /dev/null
+++ b/modules/3d/include/opencv2/3d/volume.hpp
@@ -0,0 +1,128 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_VOLUME_HPP
+#define OPENCV_3D_VOLUME_HPP
+
+#include "opencv2/core/affine.hpp"
+
+namespace cv
+{
+class CV_EXPORTS_W Volume
+{
+   public:
+    Volume(float _voxelSize, Matx44f _pose, float _raycastStepFactor) :
+        voxelSize(_voxelSize),
+        voxelSizeInv(1.0f / voxelSize),
+        pose(_pose),
+        raycastStepFactor(_raycastStepFactor)
+    { }
+
+    virtual ~Volume(){};
+
+    CV_WRAP
+    virtual void integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose,
+                           const Matx33f& intrinsics, const int frameId = 0)                   = 0;
+    virtual void integrate(InputArray _depth, InputArray _rgb, float depthFactor,
+                           const Matx44f& cameraPose, const Matx33f& intrinsics,
+                           const Matx33f& rgb_intrinsics, const int frameId = 0)               = 0;
+    CV_WRAP
+    virtual void raycast(const Matx44f& cameraPose, const Matx33f& intrinsics,
+                         const Size& frameSize, OutputArray points, OutputArray normals) const = 0;
+    virtual void raycast(const Matx44f& cameraPose, const Matx33f& intrinsics, const Size& frameSize,
+                         OutputArray points, OutputArray normals, OutputArray colors) const    = 0;
+    CV_WRAP
+    virtual void fetchNormals(InputArray points, OutputArray _normals) const                   = 0;
+    CV_WRAP
+    virtual void fetchPointsNormals(OutputArray points, OutputArray normals) const             = 0;
+    CV_WRAP
+    virtual void reset()                                                                       = 0;
+
+    // Works for hash-based volumes only, otherwise returns 1
+    virtual int getVisibleBlocks(int /*currFrameId*/, int /*frameThreshold*/) const { return 1; }
+    virtual size_t getTotalVolumeUnits() const { return 1; }
+
+   public:
+    const float voxelSize;
+    const float voxelSizeInv;
+    const Affine3f pose;
+    const float raycastStepFactor;
+};
+
+struct CV_EXPORTS_W VolumeParams
+{
+    enum VolumeKind
+    {
+        TSDF        = 0,
+        HASHTSDF    = 1,
+        COLOREDTSDF = 2
+    };
+
+    /** @brief Kind of Volume
+        Values can be TSDF (single volume) or HASHTSDF (hashtable of volume units)
+    */
+    CV_PROP_RW int kind;
+
+    /** @brief Resolution of voxel space
+        Number of voxels in each dimension.
+        Applicable only for TSDF Volume.
+        HashTSDF volume only supports equal resolution in all three dimensions
+    */
+    CV_PROP_RW int resolutionX;
+    CV_PROP_RW int resolutionY;
+    CV_PROP_RW int resolutionZ;
+
+    /** @brief Resolution of volumeUnit in voxel space
+        Number of voxels in each dimension for volumeUnit
+        Applicable only for hashTSDF.
+    */
+    CV_PROP_RW int unitResolution = {0};
+
+    /** @brief Initial pose of the volume in meters, should be 4x4 float or double matrix */
+    CV_PROP_RW Mat pose;
+
+    /** @brief Length of voxels in meters */
+    CV_PROP_RW float voxelSize;
+
+    /** @brief TSDF truncation distance
+        Distances greater than value from surface will be truncated to 1.0
+    */
+    CV_PROP_RW float tsdfTruncDist;
+
+    /** @brief Max number of frames to integrate per voxel
+        Represents the max number of frames over which a running average
+        of the TSDF is calculated for a voxel
+    */
+    CV_PROP_RW int maxWeight;
+
+    /** @brief Threshold for depth truncation in meters
+        Truncates the depth greater than threshold to 0
+    */
+    CV_PROP_RW float depthTruncThreshold;
+
+    /** @brief Length of single raycast step
+        Describes the percentage of voxel length that is skipped per march
+    */
+    CV_PROP_RW float raycastStepFactor;
+
+    /** @brief Default set of parameters that provide higher quality reconstruction
+        at the cost of slow performance.
+    */
+    CV_WRAP static Ptr<VolumeParams> defaultParams(int _volumeType);
+
+    /** @brief Coarse set of parameters that provides relatively higher performance
+        at the cost of reconstrution quality.
+    */
+    CV_WRAP static Ptr<VolumeParams> coarseParams(int _volumeType);
+};
+
+
+CV_EXPORTS_W Ptr<Volume> makeVolume(const Ptr<VolumeParams>& _volumeParams);
+CV_EXPORTS_W Ptr<Volume> makeVolume(int _volumeType, float _voxelSize, Matx44f _pose,
+                                    float _raycastStepFactor, float _truncDist, int _maxWeight, float _truncateThreshold,
+                                    int _resolutionX, int _resolutionY, int _resolutionZ);
+
+}  // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/misc/python/test/test_rgbd.py b/modules/3d/misc/python/test/test_rgbd.py
new file mode 100644
index 0000000000..3db766e0ad
--- /dev/null
+++ b/modules/3d/misc/python/test/test_rgbd.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+# Python 2/3 compatibility
+from __future__ import print_function
+
+import os, numpy
+import unittest
+import cv2 as cv
+
+from tests_common import NewOpenCVTests
+
+class rgbd_test(NewOpenCVTests):
+
+    def test_computeRgbdPlane(self):
+
+        depth_image = self.get_sample('cv/rgbd/depth.png', cv.IMREAD_ANYDEPTH)
+        if depth_image is None:
+            raise unittest.SkipTest("Missing files with test data")
+
+        K = numpy.array([[525, 0, 320.5], [0, 525, 240.5], [0, 0, 1]])
+        points3d = cv.depthTo3d(depth_image, K)
+        normals = cv.RgbdNormals_create(480, 640, cv.CV_32F, K).apply(points3d)
+        _, planes_coeff = cv.findPlanes(points3d, normals, numpy.array([]), numpy.array([]), 40, 1600, 0.01, 0, 0, 0, cv.RGBD_PLANE_METHOD_DEFAULT)
+
+        planes_coeff_expected = \
+        numpy.asarray([[[-0.02447728, -0.8678335 , -0.49625182,  4.02800846]],
+                        [[-0.05055107, -0.86144137, -0.50533485,  3.95456314]],
+                        [[-0.03294908, -0.86964548, -0.49257591,  3.97052431]],
+                        [[-0.02886586, -0.87153459, -0.48948362,  7.77550507]],
+                        [[-0.04455929, -0.87659335, -0.47916424,  3.93200684]],
+                        [[-0.21514639,  0.18835169, -0.95824611,  7.59479475]],
+                        [[-0.01006953, -0.86679155, -0.49856904,  4.01355648]],
+                        [[-0.00876531, -0.87571168, -0.48275498,  3.96768975]],
+                        [[-0.06395926, -0.86951321, -0.48975089,  4.08618736]],
+                        [[-0.01403128, -0.87593341, -0.48222789,  7.74559402]],
+                        [[-0.01143177, -0.87495202, -0.4840748 ,  7.75355816]]],
+                        dtype=numpy.float32)
+
+        eps = 0.05
+        self.assertLessEqual(cv.norm(planes_coeff, planes_coeff_expected, cv.NORM_L2), eps)
+
+if __name__ == '__main__':
+    NewOpenCVTests.bootstrap()
diff --git a/modules/3d/perf/perf_precomp.hpp b/modules/3d/perf/perf_precomp.hpp
index 0d91bd9605..0a02bd551c 100644
--- a/modules/3d/perf/perf_precomp.hpp
+++ b/modules/3d/perf/perf_precomp.hpp
@@ -7,4 +7,8 @@
 #include "opencv2/ts.hpp"
 #include "opencv2/3d.hpp"
 
+#ifdef HAVE_OPENCL
+#include <opencv2/core/ocl.hpp>
+#endif
+
 #endif
diff --git a/modules/3d/perf/perf_tsdf.cpp b/modules/3d/perf/perf_tsdf.cpp
new file mode 100644
index 0000000000..e1cc5e07e4
--- /dev/null
+++ b/modules/3d/perf/perf_tsdf.cpp
@@ -0,0 +1,437 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+using namespace cv;
+
+/** Reprojects screen point to camera space given z coord. */
+struct Reprojector
+{
+    Reprojector() {}
+    inline Reprojector(Matx33f intr)
+    {
+        fxinv = 1.f / intr(0, 0), fyinv = 1.f / intr(1, 1);
+        cx = intr(0, 2), cy = intr(1, 2);
+    }
+    template<typename T>
+    inline cv::Point3_<T> operator()(cv::Point3_<T> p) const
+    {
+        T x = p.z * (p.x - cx) * fxinv;
+        T y = p.z * (p.y - cy) * fyinv;
+        return cv::Point3_<T>(x, y, p.z);
+    }
+
+    float fxinv, fyinv, cx, cy;
+};
+
+template<class Scene>
+struct RenderInvoker : ParallelLoopBody
+{
+    RenderInvoker(Mat_<float>& _frame, Affine3f _pose,
+        Reprojector _reproj, float _depthFactor, bool _onlySemisphere)
+        : ParallelLoopBody(),
+        frame(_frame),
+        pose(_pose),
+        reproj(_reproj),
+        depthFactor(_depthFactor),
+        onlySemisphere(_onlySemisphere)
+    { }
+
+    virtual void operator ()(const cv::Range& r) const
+    {
+        for (int y = r.start; y < r.end; y++)
+        {
+            float* frameRow = frame[y];
+            for (int x = 0; x < frame.cols; x++)
+            {
+                float pix = 0;
+
+                Point3f orig = pose.translation();
+                // direction through pixel
+                Point3f screenVec = reproj(Point3f((float)x, (float)y, 1.f));
+                float xyt = 1.f / (screenVec.x * screenVec.x +
+                    screenVec.y * screenVec.y + 1.f);
+                Point3f dir = normalize(Vec3f(pose.rotation() * screenVec));
+                // screen space axis
+                dir.y = -dir.y;
+
+                const float maxDepth = 20.f;
+                const float maxSteps = 256;
+                float t = 0.f;
+                for (int step = 0; step < maxSteps && t < maxDepth; step++)
+                {
+                    Point3f p = orig + dir * t;
+                    float d = Scene::map(p, onlySemisphere);
+                    if (d < 0.000001f)
+                    {
+                        float depth = std::sqrt(t * t * xyt);
+                        pix = depth * depthFactor;
+                        break;
+                    }
+                    t += d;
+                }
+
+                frameRow[x] = pix;
+            }
+        }
+    }
+
+    Mat_<float>& frame;
+    Affine3f pose;
+    Reprojector reproj;
+    float depthFactor;
+    bool onlySemisphere;
+};
+
+struct Scene
+{
+    virtual ~Scene() {}
+    static Ptr<Scene> create(Size sz, Matx33f _intr, float _depthFactor, bool onlySemisphere);
+    virtual Mat depth(Affine3f pose) = 0;
+    virtual std::vector<Affine3f> getPoses() = 0;
+};
+
+struct SemisphereScene : Scene
+{
+    const int framesPerCycle = 72;
+    const float nCycles = 0.25f;
+    const Affine3f startPose = Affine3f(Vec3f(0.f, 0.f, 0.f), Vec3f(1.5f, 0.3f, -2.1f));
+
+    Size frameSize;
+    Matx33f intr;
+    float depthFactor;
+    bool onlySemisphere;
+
+    SemisphereScene(Size sz, Matx33f _intr, float _depthFactor, bool _onlySemisphere) :
+        frameSize(sz), intr(_intr), depthFactor(_depthFactor), onlySemisphere(_onlySemisphere)
+    { }
+
+    static float map(Point3f p, bool onlySemisphere)
+    {
+        float plane = p.y + 0.5f;
+        Point3f spherePose = p - Point3f(-0.0f, 0.3f, 1.1f);
+        float sphereRadius = 0.5f;
+        float sphere = (float)cv::norm(spherePose) - sphereRadius;
+        float sphereMinusBox = sphere;
+
+        float subSphereRadius = 0.05f;
+        Point3f subSpherePose = p - Point3f(0.3f, -0.1f, -0.3f);
+        float subSphere = (float)cv::norm(subSpherePose) - subSphereRadius;
+
+        float res;
+        if (!onlySemisphere)
+            res = min({ sphereMinusBox, subSphere, plane });
+        else
+            res = sphereMinusBox;
+
+        return res;
+    }
+
+    Mat depth(Affine3f pose) override
+    {
+        Mat_<float> frame(frameSize);
+        Reprojector reproj(intr);
+
+        Range range(0, frame.rows);
+        parallel_for_(range, RenderInvoker<SemisphereScene>(frame, pose, reproj, depthFactor, onlySemisphere));
+
+        return std::move(frame);
+    }
+
+    std::vector<Affine3f> getPoses() override
+    {
+        std::vector<Affine3f> poses;
+        for (int i = 0; i < framesPerCycle * nCycles; i++)
+        {
+            float angle = (float)(CV_2PI * i / framesPerCycle);
+            Affine3f pose;
+            pose = pose.rotate(startPose.rotation());
+            pose = pose.rotate(Vec3f(0.f, -0.5f, 0.f) * angle);
+            pose = pose.translate(Vec3f(startPose.translation()[0] * sin(angle),
+                startPose.translation()[1],
+                startPose.translation()[2] * cos(angle)));
+            poses.push_back(pose);
+        }
+
+        return poses;
+    }
+
+};
+
+Ptr<Scene> Scene::create(Size sz, Matx33f _intr, float _depthFactor, bool _onlySemisphere)
+{
+    return makePtr<SemisphereScene>(sz, _intr, _depthFactor, _onlySemisphere);
+}
+
+// this is a temporary solution
+// ----------------------------
+
+typedef cv::Vec4f ptype;
+typedef cv::Mat_< ptype > Points;
+typedef Points Normals;
+typedef Size2i Size;
+
+template<int p>
+inline float specPow(float x)
+{
+    if (p % 2 == 0)
+    {
+        float v = specPow<p / 2>(x);
+        return v * v;
+    }
+    else
+    {
+        float v = specPow<(p - 1) / 2>(x);
+        return v * v * x;
+    }
+}
+
+template<>
+inline float specPow<0>(float /*x*/)
+{
+    return 1.f;
+}
+
+template<>
+inline float specPow<1>(float x)
+{
+    return x;
+}
+
+inline cv::Vec3f fromPtype(const ptype& x)
+{
+    return cv::Vec3f(x[0], x[1], x[2]);
+}
+
+inline Point3f normalize(const Vec3f& v)
+{
+    double nv = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+    return v * (nv ? 1. / nv : 0.);
+}
+
+void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray image, Affine3f lightPose)
+{
+    Size sz = _points.size();
+    image.create(sz, CV_8UC4);
+
+    Points  points = _points.getMat();
+    Normals normals = _normals.getMat();
+
+    Mat_<Vec4b> img = image.getMat();
+
+    Range range(0, sz.height);
+    const int nstripes = -1;
+    parallel_for_(range, [&](const Range&)
+        {
+            for (int y = range.start; y < range.end; y++)
+            {
+                Vec4b* imgRow = img[y];
+                const ptype* ptsRow = points[y];
+                const ptype* nrmRow = normals[y];
+
+                for (int x = 0; x < sz.width; x++)
+                {
+                    Point3f p = fromPtype(ptsRow[x]);
+                    Point3f n = fromPtype(nrmRow[x]);
+
+                    Vec4b color;
+
+                    if (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z) )
+                    {
+                        color = Vec4b(0, 32, 0, 0);
+                    }
+                    else
+                    {
+                        const float Ka = 0.3f;  //ambient coeff
+                        const float Kd = 0.5f;  //diffuse coeff
+                        const float Ks = 0.2f;  //specular coeff
+                        const int   sp = 20;  //specular power
+
+                        const float Ax = 1.f;   //ambient color,  can be RGB
+                        const float Dx = 1.f;   //diffuse color,  can be RGB
+                        const float Sx = 1.f;   //specular color, can be RGB
+                        const float Lx = 1.f;   //light color
+
+                        Point3f l = normalize(lightPose.translation() - Vec3f(p));
+                        Point3f v = normalize(-Vec3f(p));
+                        Point3f r = normalize(Vec3f(2.f * n * n.dot(l) - l));
+
+                        uchar ix = (uchar)((Ax * Ka * Dx + Lx * Kd * Dx * max(0.f, n.dot(l)) +
+                            Lx * Ks * Sx * specPow<sp>(max(0.f, r.dot(v)))) * 255.f);
+                        color = Vec4b(ix, ix, ix, 0);
+                    }
+
+                    imgRow[x] = color;
+                }
+            }
+        }, nstripes);
+}
+
+// ----------------------------
+
+class Settings
+{
+public:
+    float depthFactor;
+    Matx33f intr;
+    Size frameSize;
+    Vec3f lightPose;
+
+    Ptr<Volume> volume;
+    Ptr<Scene> scene;
+    std::vector<Affine3f> poses;
+
+    Settings(bool useHashTSDF)
+    {
+        frameSize = Size(640, 480);
+
+        float fx, fy, cx, cy;
+        fx = fy = 525.f;
+        cx = frameSize.width / 2 - 0.5f;
+        cy = frameSize.height / 2 - 0.5f;
+        intr = Matx33f(fx,  0, cx,
+                        0, fy, cy,
+                        0,  0,  1);
+
+        // 5000 for the 16-bit PNG files
+        // 1 for the 32-bit float images in the ROS bag files
+        depthFactor = 5000;
+
+        Vec3i volumeDims = Vec3i::all(512); //number of voxels
+
+        float volSize = 3.f;
+        float voxelSize = volSize / 512.f; //meters
+
+        // default pose of volume cube
+        Affine3f volumePose = Affine3f().translate(Vec3f(-volSize / 2.f, -volSize / 2.f, 0.5f));
+        float tsdf_trunc_dist = 7 * voxelSize; // about 0.04f in meters
+        int tsdf_max_weight = 64;   //frames
+
+        float raycast_step_factor = 0.25f;  //in voxel sizes
+        // gradient delta factor is fixed at 1.0f and is not used
+        //p.gradient_delta_factor = 0.5f; //in voxel sizes
+
+        //p.lightPose = p.volume_pose.translation()/4; //meters
+        lightPose = Vec3f::all(0.f); //meters
+
+        // depth truncation is not used by default but can be useful in some scenes
+        float truncateThreshold = 0.f; //meters
+
+        VolumeParams::VolumeKind volumeKind = VolumeParams::VolumeKind::TSDF;
+
+        if (useHashTSDF)
+        {
+            volumeKind = VolumeParams::VolumeKind::HASHTSDF;
+            truncateThreshold = 4.f;
+        }
+        else
+        {
+            volSize = 3.f;
+            volumeDims = Vec3i::all(128); //number of voxels
+            voxelSize = volSize / 128.f;
+            tsdf_trunc_dist = 2 * voxelSize; // 0.04f in meters
+
+            raycast_step_factor = 0.75f;  //in voxel sizes
+        }
+
+        volume = makeVolume(volumeKind, voxelSize, volumePose.matrix,
+                            raycast_step_factor, tsdf_trunc_dist, tsdf_max_weight,
+                            truncateThreshold, volumeDims[0], volumeDims[1], volumeDims[2]);
+
+        scene = Scene::create(frameSize, intr, depthFactor, true);
+        poses = scene->getPoses();
+    }
+};
+
+void displayImage(Mat depth, UMat _points, UMat _normals, float depthFactor, Vec3f lightPose)
+{
+    Mat  points, normals, image;
+    AccessFlag af = ACCESS_READ;
+    normals = _normals.getMat(af);
+    points = _points.getMat(af);
+    patchNaNs(points);
+
+    imshow("depth", depth * (1.f / depthFactor / 4.f));
+    renderPointsNormals(points, normals, image, lightPose);
+    imshow("render", image);
+    waitKey(2000);
+}
+
+static const bool display = false;
+
+PERF_TEST(Perf_TSDF, integrate)
+{
+    Settings settings(false);
+    for (size_t i = 0; i < settings.poses.size(); i++)
+    {
+        Matx44f pose = settings.poses[i].matrix;
+        Mat depth = settings.scene->depth(pose);
+        startTimer();
+        settings.volume->integrate(depth, settings.depthFactor, pose, settings.intr);
+        stopTimer();
+        depth.release();
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST(Perf_TSDF, raycast)
+{
+    Settings settings(false);
+    for (size_t i = 0; i < settings.poses.size(); i++)
+    {
+        UMat _points, _normals;
+        Matx44f pose = settings.poses[i].matrix;
+        Mat depth = settings.scene->depth(pose);
+
+        settings.volume->integrate(depth, settings.depthFactor, pose, settings.intr);
+        startTimer();
+        settings.volume->raycast(pose, settings.intr, settings.frameSize, _points, _normals);
+        stopTimer();
+
+        if (display)
+            displayImage(depth, _points, _normals, settings.depthFactor, settings.lightPose);
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST(Perf_HashTSDF, integrate)
+{
+    Settings settings(true);
+
+    for (size_t i = 0; i < settings.poses.size(); i++)
+    {
+        Matx44f pose = settings.poses[i].matrix;
+        Mat depth = settings.scene->depth(pose);
+        startTimer();
+        settings.volume->integrate(depth, settings.depthFactor, pose, settings.intr);
+        stopTimer();
+        depth.release();
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST(Perf_HashTSDF, raycast)
+{
+    Settings settings(true);
+    for (size_t i = 0; i < settings.poses.size(); i++)
+    {
+        UMat _points, _normals;
+        Matx44f pose = settings.poses[i].matrix;
+        Mat depth = settings.scene->depth(pose);
+
+        settings.volume->integrate(depth, settings.depthFactor, pose, settings.intr);
+        startTimer();
+        settings.volume->raycast(pose, settings.intr, settings.frameSize, _points, _normals);
+        stopTimer();
+
+        if (display)
+            displayImage(depth, _points, _normals, settings.depthFactor, settings.lightPose);
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+}} // namespace
diff --git a/modules/3d/samples/odometry_evaluation.cpp b/modules/3d/samples/odometry_evaluation.cpp
new file mode 100644
index 0000000000..51cd81d4e7
--- /dev/null
+++ b/modules/3d/samples/odometry_evaluation.cpp
@@ -0,0 +1,210 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include <opencv2/3d.hpp>
+
+#include <opencv2/highgui.hpp>
+#include <opencv2/3d.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core/utility.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+using namespace cv;
+
+#define BILATERAL_FILTER 0// if 1 then bilateral filter will be used for the depth
+
+static
+void writeResults( const string& filename, const vector<string>& timestamps, const vector<Mat>& Rt )
+{
+    CV_Assert( timestamps.size() == Rt.size() );
+
+    ofstream file( filename.c_str() );
+    if( !file.is_open() )
+        return;
+
+    cout.precision(4);
+    for( size_t i = 0; i < Rt.size(); i++ )
+    {
+        const Mat& Rt_curr = Rt[i];
+        if( Rt_curr.empty() )
+            continue;
+
+        CV_Assert( Rt_curr.type() == CV_64FC1 );
+
+        Quatd rot = Quatd::createFromRotMat(Rt_curr(Rect(0, 0, 3, 3)));
+
+        // timestamp tx ty tz qx qy qz qw
+        file << timestamps[i] << " " << fixed
+             << Rt_curr.at<double>(0,3) << " " << Rt_curr.at<double>(1,3) << " " << Rt_curr.at<double>(2,3) << " "
+             << rot.x << " " << rot.y << " " << rot.z << " " << rot.w << endl;
+
+    }
+    file.close();
+}
+
+static
+void setCameraMatrixFreiburg1(float& fx, float& fy, float& cx, float& cy)
+{
+    fx = 517.3f; fy = 516.5f; cx = 318.6f; cy = 255.3f;
+}
+
+static
+void setCameraMatrixFreiburg2(float& fx, float& fy, float& cx, float& cy)
+{
+    fx = 520.9f; fy = 521.0f; cx = 325.1f; cy = 249.7f;
+}
+
+/*
+ * This sample helps to evaluate odometry on TUM datasets and benchmark http://vision.in.tum.de/data/datasets/rgbd-dataset.
+ * At this link you can find instructions for evaluation. The sample runs some opencv odometry and saves a camera trajectory
+ * to file of format that the benchmark requires. Saved file can be used for online evaluation.
+ */
+int main(int argc, char** argv)
+{
+    if(argc != 4)
+    {
+        cout << "Format: file_with_rgb_depth_pairs trajectory_file odometry_name [Rgbd or ICP or RgbdICP or FastICP]" << endl;
+        return -1;
+    }
+
+    vector<string> timestamps;
+    vector<Mat> Rts;
+
+    const string filename = argv[1];
+    ifstream file( filename.c_str() );
+    if( !file.is_open() )
+        return -1;
+
+    char dlmrt = '/';
+    size_t pos = filename.rfind(dlmrt);
+    string dirname = pos == string::npos ? "" : filename.substr(0, pos) + dlmrt;
+
+    const int timestampLength = 17;
+    const int rgbPathLehgth = 17+8;
+    const int depthPathLehgth = 17+10;
+
+    float fx = 525.0f, // default
+          fy = 525.0f,
+          cx = 319.5f,
+          cy = 239.5f;
+    if(filename.find("freiburg1") != string::npos)
+        setCameraMatrixFreiburg1(fx, fy, cx, cy);
+    if(filename.find("freiburg2") != string::npos)
+        setCameraMatrixFreiburg2(fx, fy, cx, cy);
+    Mat cameraMatrix = Mat::eye(3,3,CV_32FC1);
+    {
+        cameraMatrix.at<float>(0,0) = fx;
+        cameraMatrix.at<float>(1,1) = fy;
+        cameraMatrix.at<float>(0,2) = cx;
+        cameraMatrix.at<float>(1,2) = cy;
+    }
+
+    Ptr<OdometryFrame> frame_prev = OdometryFrame::create(),
+                       frame_curr = OdometryFrame::create();
+    Ptr<Odometry> odometry = Odometry::createFromName(string(argv[3]) + "Odometry");
+    if(odometry.empty())
+    {
+        cout << "Can not create Odometry algorithm. Check the passed odometry name." << endl;
+        return -1;
+    }
+    odometry->setCameraMatrix(cameraMatrix);
+
+    TickMeter gtm;
+    int count = 0;
+    for(int i = 0; !file.eof(); i++)
+    {
+        string str;
+        std::getline(file, str);
+        if(str.empty()) break;
+        if(str.at(0) == '#') continue; /* comment */
+
+        Mat image, depth;
+        // Read one pair (rgb and depth)
+        // example: 1305031453.359684 rgb/1305031453.359684.png 1305031453.374112 depth/1305031453.374112.png
+#if BILATERAL_FILTER
+        TickMeter tm_bilateral_filter;
+#endif
+        {
+            string rgbFilename = str.substr(timestampLength + 1, rgbPathLehgth );
+            string timestap = str.substr(0, timestampLength);
+            string depthFilename = str.substr(2*timestampLength + rgbPathLehgth + 3, depthPathLehgth );
+
+            image = imread(dirname + rgbFilename);
+            depth = imread(dirname + depthFilename, -1);
+
+            CV_Assert(!image.empty());
+            CV_Assert(!depth.empty());
+            CV_Assert(depth.type() == CV_16UC1);
+
+            cout << i << " " << rgbFilename << " " << depthFilename << endl;
+
+            // scale depth
+            Mat depth_flt;
+            depth.convertTo(depth_flt, CV_32FC1, 1.f/5000.f);
+#if !BILATERAL_FILTER
+            depth_flt.setTo(std::numeric_limits<float>::quiet_NaN(), depth == 0);
+            depth = depth_flt;
+#else
+            tm_bilateral_filter.start();
+            depth = Mat(depth_flt.size(), CV_32FC1, Scalar(0));
+            const double depth_sigma = 0.03;
+            const double space_sigma = 4.5;  // in pixels
+            Mat invalidDepthMask = depth_flt == 0.f;
+            depth_flt.setTo(-5*depth_sigma, invalidDepthMask);
+            bilateralFilter(depth_flt, depth, -1, depth_sigma, space_sigma);
+            depth.setTo(std::numeric_limits<float>::quiet_NaN(), invalidDepthMask);
+            tm_bilateral_filter.stop();
+            cout << "Time filter " << tm_bilateral_filter.getTimeSec() << endl;
+#endif
+            timestamps.push_back( timestap );
+        }
+
+        {
+            Mat gray;
+            cvtColor(image, gray, COLOR_BGR2GRAY);
+            frame_curr->setImage(gray);
+            frame_curr->setDepth(depth);
+
+            Mat Rt;
+            if(!Rts.empty())
+            {
+                TickMeter tm;
+                tm.start();
+                gtm.start();
+                bool res = odometry->compute(frame_curr, frame_prev, Rt);
+                gtm.stop();
+                tm.stop();
+                count++;
+                cout << "Time " << tm.getTimeSec() << endl;
+#if BILATERAL_FILTER
+                cout << "Time ratio " << tm_bilateral_filter.getTimeSec() / tm.getTimeSec() << endl;
+#endif
+                if(!res)
+                    Rt = Mat::eye(4,4,CV_64FC1);
+            }
+
+            if( Rts.empty() )
+                Rts.push_back(Mat::eye(4,4,CV_64FC1));
+            else
+            {
+                Mat& prevRt = *Rts.rbegin();
+                cout << "Rt " << Rt << endl;
+                Rts.push_back( prevRt * Rt );
+            }
+
+            if (!frame_prev.empty())
+                frame_prev.release();
+            std::swap(frame_prev, frame_curr);
+        }
+    }
+
+    std::cout << "Average time " << gtm.getAvgTimeSec() << std::endl;
+    writeResults(argv[2], timestamps, Rts);
+
+    return 0;
+}
diff --git a/modules/3d/src/calibration_base.cpp b/modules/3d/src/calibration_base.cpp
index 860ecd7510..0a9dcdd856 100644
--- a/modules/3d/src/calibration_base.cpp
+++ b/modules/3d/src/calibration_base.cpp
@@ -181,8 +181,8 @@ void cv::Rodrigues(InputArray _src, OutputArray _dst, OutputArray _jacobian)
         }
         else
         {
-            double c = cos(theta);
-            double s = sin(theta);
+            double c = std::cos(theta);
+            double s = std::sin(theta);
             double c1 = 1. - c;
             double itheta = theta ? 1./theta : 0.;
 
@@ -244,7 +244,7 @@ void cv::Rodrigues(InputArray _src, OutputArray _dst, OutputArray _jacobian)
         s = std::sqrt((r.x*r.x + r.y*r.y + r.z*r.z)*0.25);
         c = (R(0, 0) + R(1, 1) + R(2, 2) - 1)*0.5;
         c = c > 1. ? 1. : c < -1. ? -1. : c;
-        theta = acos(c);
+        theta = std::acos(c);
 
         if( s < 1e-5 )
         {
@@ -1047,9 +1047,9 @@ cv::Vec3d cv::RQDecomp3x3( InputArray _Marr,
 
     // calculate the euler angle
     Vec3d eulerAngles(
-        acos(Qx(1, 1)) * (Qx(1, 2) >= 0 ? 1 : -1) * (180.0 / CV_PI),
-        acos(Qy(0, 0)) * (Qy(2, 0) >= 0 ? 1 : -1) * (180.0 / CV_PI),
-        acos(Qz(0, 0)) * (Qz(0, 1) >= 0 ? 1 : -1) * (180.0 / CV_PI));
+        std::acos(Qx(1, 1)) * (Qx(1, 2) >= 0 ? 1 : -1) * (180.0 / CV_PI),
+        std::acos(Qy(0, 0)) * (Qy(2, 0) >= 0 ? 1 : -1) * (180.0 / CV_PI),
+        std::acos(Qz(0, 0)) * (Qz(0, 1) >= 0 ? 1 : -1) * (180.0 / CV_PI));
 
     /* Calculate orthogonal matrix. */
     /*
diff --git a/modules/3d/src/distortion_model.hpp b/modules/3d/src/distortion_model.hpp
index 9280c98c56..c6fb7e58ed 100644
--- a/modules/3d/src/distortion_model.hpp
+++ b/modules/3d/src/distortion_model.hpp
@@ -79,10 +79,10 @@ void computeTiltProjectionMatrix(FLOAT tauX,
     Matx<FLOAT, 3, 3>* dMatTiltdTauY = 0,
     Matx<FLOAT, 3, 3>* invMatTilt = 0)
 {
-    FLOAT cTauX = cos(tauX);
-    FLOAT sTauX = sin(tauX);
-    FLOAT cTauY = cos(tauY);
-    FLOAT sTauY = sin(tauY);
+    FLOAT cTauX = std::cos(tauX);
+    FLOAT sTauX = std::sin(tauX);
+    FLOAT cTauY = std::cos(tauY);
+    FLOAT sTauY = std::sin(tauY);
     Matx<FLOAT, 3, 3> matRotX = Matx<FLOAT, 3, 3>(1,0,0,0,cTauX,sTauX,0,-sTauX,cTauX);
     Matx<FLOAT, 3, 3> matRotY = Matx<FLOAT, 3, 3>(cTauY,0,-sTauY,0,1,0,sTauY,0,cTauY);
     Matx<FLOAT, 3, 3> matRotXY = matRotY * matRotX;
diff --git a/modules/3d/src/dls.cpp b/modules/3d/src/dls.cpp
index 67b2cd8132..cbcd8fea6a 100644
--- a/modules/3d/src/dls.cpp
+++ b/modules/3d/src/dls.cpp
@@ -613,24 +613,24 @@ Mat dls::skewsymm(const Mat * X1)
 Mat dls::rotx(const double t)
 {
     // rotx: rotation about y-axis
-    double ct = cos(t);
-    double st = sin(t);
+    double ct = std::cos(t);
+    double st = std::sin(t);
     return (Mat_<double>(3,3) << 1, 0, 0, 0, ct, -st, 0, st, ct);
 }
 
 Mat dls::roty(const double t)
 {
     // roty: rotation about y-axis
-    double ct = cos(t);
-    double st = sin(t);
+    double ct = std::cos(t);
+    double st = std::sin(t);
     return (Mat_<double>(3,3) << ct, 0, st, 0, 1, 0, -st, 0, ct);
 }
 
 Mat dls::rotz(const double t)
 {
     // rotz: rotation about y-axis
-    double ct = cos(t);
-    double st = sin(t);
+    double ct = std::cos(t);
+    double st = std::sin(t);
     return (Mat_<double>(3,3) << ct, -st, 0, st, ct, 0, 0, 0, 1);
 }
 
diff --git a/modules/3d/src/ippe.cpp b/modules/3d/src/ippe.cpp
index ccfda42c55..e15641e2bf 100644
--- a/modules/3d/src/ippe.cpp
+++ b/modules/3d/src/ippe.cpp
@@ -303,9 +303,9 @@ void PoseSolver::rot2vec(InputArray _R, OutputArray _r)
     Mat rvec = _r.getMat();
 
     double trace = R.at<double>(0, 0) + R.at<double>(1, 1) + R.at<double>(2, 2);
-    double w_norm = acos((trace - 1.0) / 2.0);
+    double w_norm = std::acos((trace - 1.0) / 2.0);
     double eps = std::numeric_limits<float>::epsilon();
-    double d = 1 / (2 * sin(w_norm)) * w_norm;
+    double d = 1 / (2 * std::sin(w_norm)) * w_norm;
     if (w_norm < eps) //rotation is the identity
     {
         rvec.setTo(0);
diff --git a/modules/3d/src/levmarq.cpp b/modules/3d/src/levmarq.cpp
index c539585b51..d4c188f17c 100644
--- a/modules/3d/src/levmarq.cpp
+++ b/modules/3d/src/levmarq.cpp
@@ -146,11 +146,11 @@ static int LMSolver_run(InputOutputArray _param0, InputArray _mask,
     Mat mask = _mask.getMat();
     Mat param0 = _param0.getMat();
     Mat x, xd, r, rd, J, A, Ap, v, temp_d, d, Am, vm, dm;
-    int ptype = param0.type();
+    int p0type = param0.type();
     int maxIters = termcrit.type & TermCriteria::COUNT ? termcrit.maxCount : 1000;
     double epsx = termcrit.type & TermCriteria::EPS ? termcrit.epsilon : 0, epsf = epsx;
 
-    CV_Assert( (param0.cols == 1 || param0.rows == 1) && (ptype == CV_32F || ptype == CV_64F));
+    CV_Assert( (param0.cols == 1 || param0.rows == 1) && (p0type == CV_32F || p0type == CV_64F));
     CV_Assert( cb || cb_alt );
 
     int lx = param0.rows + param0.cols - 1;
@@ -278,7 +278,7 @@ static int LMSolver_run(InputOutputArray _param0, InputArray _mask,
     if( param0.size() != x.size() )
         transpose(x, x);
 
-    x.convertTo(param0, ptype);
+    x.convertTo(param0, p0type);
     if( iter == maxIters )
         iter = -iter;
 
diff --git a/modules/3d/src/opencl/hash_tsdf.cl b/modules/3d/src/opencl/hash_tsdf.cl
new file mode 100644
index 0000000000..3b5a906518
--- /dev/null
+++ b/modules/3d/src/opencl/hash_tsdf.cl
@@ -0,0 +1,653 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory
+
+#define USE_INTERPOLATION_IN_GETNORMAL 1
+#define HASH_DIVISOR 32768
+
+typedef char int8_t;
+typedef uint int32_t;
+
+typedef int8_t TsdfType;
+typedef uchar WeightType;
+
+struct TsdfVoxel
+{
+    TsdfType tsdf;
+    WeightType weight;
+};
+
+
+static inline TsdfType floatToTsdf(float num)
+{
+    int8_t res = (int8_t) ( (num * (-128)) );
+    res = res ? res : (num < 0 ? 1 : -1);
+    return res;
+}
+
+static inline float tsdfToFloat(TsdfType num)
+{
+    return ( (float) num ) / (-128);
+}
+
+static uint calc_hash(int3 x)
+{
+    unsigned int seed = 0;
+    unsigned int GOLDEN_RATIO = 0x9e3779b9;
+    seed ^= x.s0 + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+    seed ^= x.s1 + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+    seed ^= x.s2 + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+    return seed;
+}
+
+
+//TODO: make hashDivisor a power of 2
+//TODO: put it to this .cl file as a constant
+static int custom_find(int3 idx, const int hashDivisor, __global const int* hashes,
+                    __global const int4* data)
+{
+    int hash = calc_hash(idx) % hashDivisor;
+    int place = hashes[hash];
+    // search a place
+    while (place >= 0)
+    {
+        if (all(data[place].s012 == idx))
+            break;
+        else
+            place = data[place].s3;
+    }
+
+    return place;
+}
+
+
+
+static void integrateVolumeUnit(
+                        int x, int y,
+                        __global const char * depthptr,
+                        int depth_step, int depth_offset,
+                        int depth_rows, int depth_cols,
+                        __global struct TsdfVoxel * volumeptr,
+                        const __global char * pixNormsPtr,
+                        int pixNormsStep, int pixNormsOffset,
+                        int pixNormsRows, int pixNormsCols,
+                        const float16 vol2camMatrix,
+                        const float voxelSize,
+                        const int4 volResolution4,
+                        const int4 volStrides4,
+                        const float2 fxy,
+                        const float2 cxy,
+                        const float dfac,
+                        const float truncDist,
+                        const int maxWeight
+                        )
+{
+    const int3 volResolution = volResolution4.xyz;
+
+    if(x >= volResolution.x || y >= volResolution.y)
+        return;
+
+    // coord-independent constants
+    const int3 volStrides = volStrides4.xyz;
+    const float2 limits = (float2)(depth_cols-1, depth_rows-1);
+
+    const float4 vol2cam0 = vol2camMatrix.s0123;
+    const float4 vol2cam1 = vol2camMatrix.s4567;
+    const float4 vol2cam2 = vol2camMatrix.s89ab;
+
+    const float truncDistInv = 1.f/truncDist;
+
+    // optimization of camSpace transformation (vector addition instead of matmul at each z)
+    float4 inPt = (float4)(x*voxelSize, y*voxelSize, 0, 1);
+    float3 basePt = (float3)(dot(vol2cam0, inPt),
+                             dot(vol2cam1, inPt),
+                             dot(vol2cam2, inPt));
+
+    float3 camSpacePt = basePt;
+
+    // zStep == vol2cam*(float3(x, y, 1)*voxelSize) - basePt;
+    float3 zStep = ((float3)(vol2cam0.z, vol2cam1.z, vol2cam2.z))*voxelSize;
+
+    int volYidx = x*volStrides.x + y*volStrides.y;
+
+    int startZ, endZ;
+    if(fabs(zStep.z) > 1e-5f)
+    {
+        int baseZ = convert_int(-basePt.z / zStep.z);
+        if(zStep.z > 0)
+        {
+            startZ = baseZ;
+            endZ = volResolution.z;
+        }
+        else
+        {
+            startZ = 0;
+            endZ = baseZ;
+        }
+    }
+    else
+    {
+        if(basePt.z > 0)
+        {
+            startZ = 0; endZ = volResolution.z;
+        }
+        else
+        {
+            // z loop shouldn't be performed
+            startZ = endZ = 0;
+        }
+    }
+
+    startZ = max(0, startZ);
+    endZ = min(volResolution.z, endZ);
+
+    for(int z = startZ; z < endZ; z++)
+    {
+        // optimization of the following:
+        //float3 camSpacePt = vol2cam * ((float3)(x, y, z)*voxelSize);
+        camSpacePt += zStep;
+
+        if(camSpacePt.z <= 0)
+            continue;
+
+        float3 camPixVec = camSpacePt / camSpacePt.z;
+        float2 projected = mad(camPixVec.xy, fxy, cxy); // mad(a,b,c) = a * b + c
+
+        float v;
+        // bilinearly interpolate depth at projected
+        if(all(projected >= 0) && all(projected < limits))
+        {
+            float2 ip = floor(projected);
+            int xi = ip.x, yi = ip.y;
+
+            __global const float* row0 = (__global const float*)(depthptr + depth_offset +
+                                                                 (yi+0)*depth_step);
+            __global const float* row1 = (__global const float*)(depthptr + depth_offset +
+                                                                 (yi+1)*depth_step);
+
+            float v00 = row0[xi+0];
+            float v01 = row0[xi+1];
+            float v10 = row1[xi+0];
+            float v11 = row1[xi+1];
+            float4 vv = (float4)(v00, v01, v10, v11);
+
+            // assume correct depth is positive
+            if(all(vv > 0))
+            {
+                float2 t = projected - ip;
+                float2 vf = mix(vv.xz, vv.yw, t.x);
+                v = mix(vf.s0, vf.s1, t.y);
+            }
+            else
+                continue;
+        }
+        else
+            continue;
+
+        if(v == 0)
+            continue;
+
+        int2 projInt = convert_int2(projected);
+        float pixNorm = *(__global const float*)(pixNormsPtr + pixNormsOffset + projInt.y*pixNormsStep + projInt.x*sizeof(float));
+        //float pixNorm = length(camPixVec);
+
+        // difference between distances of point and of surface to camera
+        float sdf = pixNorm*(v*dfac - camSpacePt.z);
+        // possible alternative is:
+        // float sdf = length(camSpacePt)*(v*dfac/camSpacePt.z - 1.0);
+        if(sdf >= -truncDist)
+        {
+            float tsdf = fmin(1.0f, sdf * truncDistInv);
+            int volIdx = volYidx + z*volStrides.z;
+
+            struct TsdfVoxel voxel = volumeptr[volIdx];
+            float value  = tsdfToFloat(voxel.tsdf);
+            int weight = voxel.weight;
+            // update TSDF
+            value = (value*weight + tsdf) / (weight + 1);
+            weight = min(weight + 1, maxWeight);
+
+            voxel.tsdf = floatToTsdf(value);
+            voxel.weight = weight;
+            volumeptr[volIdx] = voxel;
+        }
+    }
+
+}
+
+
+__kernel void integrateAllVolumeUnits(
+                        // depth
+                        __global const char * depthptr,
+                        int depth_step, int depth_offset,
+                        int depth_rows, int depth_cols,
+                        // hashMap
+                        __global const int* hashes,
+                        __global const int4* data,
+                        // volUnitsData
+                        __global struct TsdfVoxel * allVolumePtr,
+                        int table_step, int table_offset,
+                        int table_rows, int table_cols,
+                        // pixNorms
+                        const __global char * pixNormsPtr,
+                        int pixNormsStep, int pixNormsOffset,
+                        int pixNormsRows, int pixNormsCols,
+                        // isActiveFlags
+                        __global const uchar* isActiveFlagsPtr,
+                        int isActiveFlagsStep, int isActiveFlagsOffset,
+                        int isActiveFlagsRows, int isActiveFlagsCols,
+                        // cam matrices:
+                        const float16 vol2cam,
+                        const float16 camInv,
+                        // scalars:
+                        const float voxelSize,
+                        const int volUnitResolution,
+                        const int4 volStrides4,
+                        const float2 fxy,
+                        const float2 cxy,
+                        const float dfac,
+                        const float truncDist,
+                        const int maxWeight
+                        )
+{
+    const int hash_divisor = HASH_DIVISOR;
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    int row = get_global_id(2);
+    int3 idx = data[row].xyz;
+
+    const int4 volResolution4 = (int4)(volUnitResolution,
+                                       volUnitResolution,
+                                       volUnitResolution,
+                                       volUnitResolution);
+
+    int isActive = *(__global const uchar*)(isActiveFlagsPtr + isActiveFlagsOffset + row);
+
+    if (isActive)
+    {
+        int volCubed = volUnitResolution * volUnitResolution * volUnitResolution;
+        __global struct TsdfVoxel * volumeptr = (__global struct TsdfVoxel*)
+                                                (allVolumePtr + table_offset + row * volCubed);
+
+        // volUnit2cam = world2cam * volUnit2world =
+        // camPoseInv * volUnitPose = camPoseInv * (volPose + idx * volUnitSize) =
+        // camPoseInv * (volPose + idx * volUnitResolution * voxelSize) =
+        // camPoseInv * (volPose + mulIdx) = camPoseInv * volPose + camPoseInv * mulIdx =
+        // vol2cam + camPoseInv * mulIdx
+        float3 mulIdx = convert_float3(idx * volUnitResolution) * voxelSize;
+        float16 volUnit2cam = vol2cam;
+        volUnit2cam.s37b += (float3)(dot(mulIdx, camInv.s012),
+                                     dot(mulIdx, camInv.s456),
+                                     dot(mulIdx, camInv.s89a));
+
+        integrateVolumeUnit(
+            i, j,
+            depthptr,
+            depth_step, depth_offset,
+            depth_rows, depth_cols,
+            volumeptr,
+            pixNormsPtr,
+            pixNormsStep, pixNormsOffset,
+            pixNormsRows, pixNormsCols,
+            volUnit2cam,
+            voxelSize,
+            volResolution4,
+            volStrides4,
+            fxy,
+            cxy,
+            dfac,
+            truncDist,
+            maxWeight
+            );
+    }
+}
+
+
+static struct TsdfVoxel at(int3 volumeIdx, int row, int volumeUnitDegree,
+                           int3 volStrides, __global const struct TsdfVoxel * allVolumePtr, int table_offset)
+
+{
+    //! Out of bounds
+    if (any(volumeIdx >= (1 << volumeUnitDegree)) ||
+        any(volumeIdx < 0))
+    {
+        struct TsdfVoxel dummy;
+        dummy.tsdf = floatToTsdf(1.0f);
+        dummy.weight = 0;
+        return dummy;
+    }
+
+    int volCubed = 1 << (volumeUnitDegree*3);
+    __global struct TsdfVoxel * volData = (__global struct TsdfVoxel*)
+                                          (allVolumePtr + table_offset + row * volCubed);
+    int3 ismul = volumeIdx * volStrides;
+    int coordBase = ismul.x + ismul.y + ismul.z;
+    return volData[coordBase];
+}
+
+
+static struct TsdfVoxel atVolumeUnit(int3 volumeIdx, int3 volumeUnitIdx, int row,
+                                     int volumeUnitDegree, int3 volStrides,
+                                     __global const struct TsdfVoxel * allVolumePtr, int table_offset)
+
+{
+    //! Out of bounds
+    if (row < 0)
+    {
+        struct TsdfVoxel dummy;
+        dummy.tsdf = floatToTsdf(1.0f);
+        dummy.weight = 0;
+        return dummy;
+    }
+
+    int3 volUnitLocalIdx = volumeIdx - (volumeUnitIdx << volumeUnitDegree);
+    int volCubed = 1 << (volumeUnitDegree*3);
+    __global struct TsdfVoxel * volData = (__global struct TsdfVoxel*)
+                                          (allVolumePtr + table_offset + row * volCubed);
+    int3 ismul = volUnitLocalIdx * volStrides;
+    int coordBase = ismul.x + ismul.y + ismul.z;
+    return volData[coordBase];
+}
+
+inline float interpolate(float3 t, float8 vz)
+{
+    float4 vy = mix(vz.s0246, vz.s1357, t.z);
+    float2 vx = mix(vy.s02, vy.s13, t.y);
+    return mix(vx.s0, vx.s1, t.x);
+}
+
+inline float3 getNormalVoxel(float3 ptVox, __global const struct TsdfVoxel* allVolumePtr,
+                             int volumeUnitDegree,
+                             const int hash_divisor,
+                             __global const int* hashes,
+                             __global const int4* data,
+
+                             int3 volStrides, int table_offset)
+{
+    float3 normal = (float3) (0.0f, 0.0f, 0.0f);
+    float3 fip = floor(ptVox);
+    int3 iptVox = convert_int3(fip);
+
+    // A small hash table to reduce a number of findRow() calls
+    // -2 and lower means not queried yet
+    // -1 means not found
+    // 0+ means found
+    int iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = -2;
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    int4 offsets[] = { (int4)( 1,  0,  0, 0), (int4)(-1,  0,  0, 0), (int4)( 0,  1,  0, 0), // 0-3
+                       (int4)( 0, -1,  0, 0), (int4)( 0,  0,  1, 0), (int4)( 0,  0, -1, 0)  // 4-7
+    };
+
+    const int nVals = 6;
+    float vals[6];
+#else
+    int4 offsets[]={(int4)( 0,  0,  0, 0), (int4)( 0,  0,  1, 0), (int4)( 0,  1,  0, 0), (int4)( 0,  1,  1, 0), //  0-3
+                    (int4)( 1,  0,  0, 0), (int4)( 1,  0,  1, 0), (int4)( 1,  1,  0, 0), (int4)( 1,  1,  1, 0), //  4-7
+                    (int4)(-1,  0,  0, 0), (int4)(-1,  0,  1, 0), (int4)(-1,  1,  0, 0), (int4)(-1,  1,  1, 0), //  8-11
+                    (int4)( 2,  0,  0, 0), (int4)( 2,  0,  1, 0), (int4)( 2,  1,  0, 0), (int4)( 2,  1,  1, 0), // 12-15
+                    (int4)( 0, -1,  0, 0), (int4)( 0, -1,  1, 0), (int4)( 1, -1,  0, 0), (int4)( 1, -1,  1, 0), // 16-19
+                    (int4)( 0,  2,  0, 0), (int4)( 0,  2,  1, 0), (int4)( 1,  2,  0, 0), (int4)( 1,  2,  1, 0), // 20-23
+                    (int4)( 0,  0, -1, 0), (int4)( 0,  1, -1, 0), (int4)( 1,  0, -1, 0), (int4)( 1,  1, -1, 0), // 24-27
+                    (int4)( 0,  0,  2, 0), (int4)( 0,  1,  2, 0), (int4)( 1,  0,  2, 0), (int4)( 1,  1,  2, 0), // 28-31
+    };
+    const int nVals = 32;
+    float vals[32];
+#endif
+
+    for (int i = 0; i < nVals; i++)
+    {
+        int3 pt = iptVox + offsets[i].s012;
+
+        // VoxelToVolumeUnitIdx()
+        int3 volumeUnitIdx = pt >> volumeUnitDegree;
+
+        int3 vand = (volumeUnitIdx & 1);
+        int dictIdx = vand.s0 + vand.s1 * 2 + vand.s2 * 4;
+
+        int it = iterMap[dictIdx];
+        if (it < -1)
+        {
+            it = custom_find(volumeUnitIdx, hash_divisor, hashes, data);
+            iterMap[dictIdx] = it;
+        }
+
+        struct TsdfVoxel tmp = atVolumeUnit(pt, volumeUnitIdx, it, volumeUnitDegree, volStrides, allVolumePtr, table_offset);
+        vals[i] = tsdfToFloat( tmp.tsdf );
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    float3 pv, nv;
+
+    pv = (float3)(vals[0*2  ], vals[1*2  ], vals[2*2  ]);
+    nv = (float3)(vals[0*2+1], vals[1*2+1], vals[2*2+1]);
+    normal = pv - nv;
+#else
+
+    float cxv[8], cyv[8], czv[8];
+
+    // How these numbers were obtained:
+    // 1. Take the basic interpolation sequence:
+    // 000, 001, 010, 011, 100, 101, 110, 111
+    // where each digit corresponds to shift by x, y, z axis respectively.
+    // 2. Add +1 for next or -1 for prev to each coordinate to corresponding axis
+    // 3. Search corresponding values in offsets
+    const int idxxn[8] = {  8,  9, 10, 11,  0,  1,  2,  3 };
+    const int idxxp[8] = {  4,  5,  6,  7, 12, 13, 14, 15 };
+    const int idxyn[8] = { 16, 17,  0,  1, 18, 19,  4,  5 };
+    const int idxyp[8] = {  2,  3, 20, 21,  6,  7, 22, 23 };
+    const int idxzn[8] = { 24,  0, 25,  2, 26,  4, 27,  6 };
+    const int idxzp[8] = {  1, 28,  3, 29,  5, 30,  7, 31 };
+
+    float vcxp[8], vcxn[8];
+    float vcyp[8], vcyn[8];
+    float vczp[8], vczn[8];
+
+    for (int i = 0; i < 8; i++)
+    {
+        vcxp[i] = vals[idxxp[i]]; vcxn[i] = vals[idxxn[i]];
+        vcyp[i] = vals[idxyp[i]]; vcyn[i] = vals[idxyn[i]];
+        vczp[i] = vals[idxzp[i]]; vczn[i] = vals[idxzn[i]];
+    }
+
+    float8 cxp = vload8(0, vcxp), cxn = vload8(0, vcxn);
+    float8 cyp = vload8(0, vcyp), cyn = vload8(0, vcyn);
+    float8 czp = vload8(0, vczp), czn = vload8(0, vczn);
+    float8 cx = cxp - cxn;
+    float8 cy = cyp - cyn;
+    float8 cz = czp - czn;
+
+    float3 tv = ptVox - fip;
+    normal.x = interpolate(tv, cx);
+    normal.y = interpolate(tv, cy);
+    normal.z = interpolate(tv, cz);
+#endif
+
+    float norm = sqrt(dot(normal, normal));
+    return norm < 0.0001f ? nan((uint)0) : normal / norm;
+}
+
+typedef float4 ptype;
+
+__kernel void raycast(
+                    __global const int* hashes,
+                    __global const int4* data,
+                    __global char * pointsptr,
+                      int points_step, int points_offset,
+                    __global char * normalsptr,
+                      int normals_step, int normals_offset,
+                    const int2 frameSize,
+                    __global const struct TsdfVoxel * allVolumePtr,
+                        int table_step, int table_offset,
+                        int table_rows, int table_cols,
+                    float16 cam2volRotGPU,
+                    float16 vol2camRotGPU,
+                    float truncateThreshold,
+                    const float2 fixy, const float2 cxy,
+                    const float4 boxDown4, const float4 boxUp4,
+                    const float tstep,
+                    const float voxelSize,
+                    const float voxelSizeInv,
+                    float volumeUnitSize,
+                    float truncDist,
+                    int volumeUnitDegree,
+                    int4 volStrides4
+                    )
+{
+    const int hash_divisor = HASH_DIVISOR;
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= frameSize.x || y >= frameSize.y)
+        return;
+
+    float3 point  = nan((uint)0);
+    float3 normal = nan((uint)0);
+
+    const float3 camRot0  = cam2volRotGPU.s012;
+    const float3 camRot1  = cam2volRotGPU.s456;
+    const float3 camRot2  = cam2volRotGPU.s89a;
+    const float3 camTrans = cam2volRotGPU.s37b;
+
+    const float3 volRot0  = vol2camRotGPU.s012;
+    const float3 volRot1  = vol2camRotGPU.s456;
+    const float3 volRot2  = vol2camRotGPU.s89a;
+    const float3 volTrans = vol2camRotGPU.s37b;
+
+    float3 planed = (float3)(((float2)(x, y) - cxy)*fixy, 1.f);
+    planed = (float3)(dot(planed, camRot0),
+                      dot(planed, camRot1),
+                      dot(planed, camRot2));
+
+    float3 orig = (float3) (camTrans.s0, camTrans.s1, camTrans.s2);
+    float3 dir = fast_normalize(planed);
+    float3 origScaled = orig * voxelSizeInv;
+    float3 dirScaled = dir * voxelSizeInv;
+
+    float tmin = 0;
+    float tmax = truncateThreshold;
+    float tcurr = tmin;
+    float tprev = tcurr;
+    float prevTsdf = truncDist;
+
+    int3 volStrides = volStrides4.xyz;
+
+    while (tcurr < tmax)
+    {
+        float3 currRayPosVox = origScaled + tcurr * dirScaled;
+
+        // VolumeToVolumeUnitIdx()
+        int3 currVoxel = convert_int3(floor(currRayPosVox));
+        int3 currVolumeUnitIdx = currVoxel >> volumeUnitDegree;
+
+        int row = custom_find(currVolumeUnitIdx, hash_divisor, hashes, data);
+
+        float currTsdf = prevTsdf;
+        int currWeight = 0;
+        float stepSize = 0.5 * volumeUnitSize;
+        int3 volUnitLocalIdx;
+
+        if (row >= 0)
+        {
+            volUnitLocalIdx = currVoxel - (currVolumeUnitIdx << volumeUnitDegree);
+            struct TsdfVoxel currVoxel = at(volUnitLocalIdx, row, volumeUnitDegree, volStrides, allVolumePtr, table_offset);
+
+            currTsdf = tsdfToFloat(currVoxel.tsdf);
+            currWeight = currVoxel.weight;
+            stepSize = tstep;
+        }
+
+        if (prevTsdf > 0.f && currTsdf <= 0.f && currWeight > 0)
+        {
+            float tInterp = (tcurr * prevTsdf - tprev * currTsdf) / (prevTsdf - currTsdf);
+            if ( !isnan(tInterp) && !isinf(tInterp) )
+            {
+                float3 pvox = origScaled + tInterp * dirScaled;
+                float3 nv = getNormalVoxel( pvox, allVolumePtr, volumeUnitDegree,
+                                            hash_divisor, hashes, data,
+                                            volStrides, table_offset);
+
+                if(!any(isnan(nv)))
+                {
+                    //convert pv and nv to camera space
+                    normal = (float3)(dot(nv, volRot0),
+                                      dot(nv, volRot1),
+                                      dot(nv, volRot2));
+                    // interpolation optimized a little
+                    float3 pv = pvox * voxelSize;
+                    point = (float3)(dot(pv, volRot0),
+                                     dot(pv, volRot1),
+                                     dot(pv, volRot2)) + volTrans;
+                }
+            }
+            break;
+        }
+        prevTsdf = currTsdf;
+        tprev = tcurr;
+        tcurr += stepSize;
+    }
+
+    __global float* pts = (__global float*)(pointsptr  +  points_offset + y*points_step   + x*sizeof(ptype));
+    __global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step  + x*sizeof(ptype));
+    vstore4((float4)(point,  0), 0, pts);
+    vstore4((float4)(normal, 0), 0, nrm);
+}
+
+
+__kernel void markActive (
+        __global const int4* hashSetData,
+
+        __global char* isActiveFlagsPtr,
+        int isActiveFlagsStep, int isActiveFlagsOffset,
+        int isActiveFlagsRows, int isActiveFlagsCols,
+
+        __global char* lastVisibleIndicesPtr,
+        int lastVisibleIndicesStep, int lastVisibleIndicesOffset,
+        int lastVisibleIndicesRows, int lastVisibleIndicesCols,
+
+        const float16 vol2cam,
+        const float2 fxy,
+        const float2 cxy,
+        const int2 frameSz,
+        const float volumeUnitSize,
+        const int lastVolIndex,
+        const float truncateThreshold,
+        const int frameId
+        )
+{
+    const int hash_divisor = HASH_DIVISOR;
+    int row = get_global_id(0);
+
+    if (row < lastVolIndex)
+    {
+        int3 idx = hashSetData[row].xyz;
+
+        float3 volumeUnitPos = convert_float3(idx) * volumeUnitSize;
+
+        float3 volUnitInCamSpace = (float3) (dot(volumeUnitPos, vol2cam.s012),
+                                             dot(volumeUnitPos, vol2cam.s456),
+                                             dot(volumeUnitPos, vol2cam.s89a)) + vol2cam.s37b;
+
+        if (volUnitInCamSpace.z < 0 || volUnitInCamSpace.z > truncateThreshold)
+        {
+            *(isActiveFlagsPtr + isActiveFlagsOffset + row * isActiveFlagsStep) = 0;
+            return;
+        }
+
+        float2 cameraPoint;
+        float invz = 1.f / volUnitInCamSpace.z;
+        cameraPoint = fxy * volUnitInCamSpace.xy * invz + cxy;
+
+        if (all(cameraPoint >= 0) && all(cameraPoint < convert_float2(frameSz)))
+        {
+            *(__global int*)(lastVisibleIndicesPtr + lastVisibleIndicesOffset + row * lastVisibleIndicesStep) = frameId;
+            *(isActiveFlagsPtr + isActiveFlagsOffset + row * isActiveFlagsStep) = 1;
+        }
+    }
+}
diff --git a/modules/3d/src/opencl/icp.cl b/modules/3d/src/opencl/icp.cl
new file mode 100644
index 0000000000..93b859a194
--- /dev/null
+++ b/modules/3d/src/opencl/icp.cl
@@ -0,0 +1,239 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#define UTSIZE 27
+
+typedef float4 ptype;
+
+/*
+    Calculate an upper triangle of Ab matrix then reduce it across workgroup
+*/
+
+inline void calcAb7(__global const char * oldPointsptr,
+                    int oldPoints_step, int oldPoints_offset,
+                    __global const char * oldNormalsptr,
+                    int oldNormals_step, int oldNormals_offset,
+                    const int2 oldSize,
+                    __global const char * newPointsptr,
+                    int newPoints_step, int newPoints_offset,
+                    __global const char * newNormalsptr,
+                    int newNormals_step, int newNormals_offset,
+                    const int2 newSize,
+                    const float16 poseMatrix,
+                    const float2 fxy,
+                    const float2 cxy,
+                    const float sqDistanceThresh,
+                    const float minCos,
+                    float* ab7
+                    )
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if(x >= newSize.x || y >= newSize.y)
+        return;
+
+    // coord-independent constants
+
+    const float3 poseRot0 = poseMatrix.s012;
+    const float3 poseRot1 = poseMatrix.s456;
+    const float3 poseRot2 = poseMatrix.s89a;
+    const float3 poseTrans = poseMatrix.s37b;
+
+    const float2 oldEdge = (float2)(oldSize.x - 1, oldSize.y - 1);
+
+    __global const ptype* newPtsRow = (__global const ptype*)(newPointsptr +
+                                                              newPoints_offset +
+                                                              y*newPoints_step);
+
+    __global const ptype* newNrmRow = (__global const ptype*)(newNormalsptr +
+                                                              newNormals_offset +
+                                                              y*newNormals_step);
+
+    float3 newP = newPtsRow[x].xyz;
+    float3 newN = newNrmRow[x].xyz;
+
+    if( any(isnan(newP)) || any(isnan(newN)) ||
+        any(isinf(newP)) || any(isinf(newN)) )
+        return;
+
+    //transform to old coord system
+    newP = (float3)(dot(newP, poseRot0),
+                    dot(newP, poseRot1),
+                    dot(newP, poseRot2)) + poseTrans;
+    newN = (float3)(dot(newN, poseRot0),
+                    dot(newN, poseRot1),
+                    dot(newN, poseRot2));
+
+    //find correspondence by projecting the point
+    float2 oldCoords = (newP.xy/newP.z)*fxy+cxy;
+
+    if(!(all(oldCoords >= 0.f) && all(oldCoords < oldEdge)))
+        return;
+
+    // bilinearly interpolate oldPts and oldNrm under oldCoords point
+    float3 oldP, oldN;
+    float2 ip = floor(oldCoords);
+    float2 t = oldCoords - ip;
+    int xi = ip.x, yi = ip.y;
+
+    __global const ptype* prow0 = (__global const ptype*)(oldPointsptr +
+                                                          oldPoints_offset +
+                                                          (yi+0)*oldPoints_step);
+    __global const ptype* prow1 = (__global const ptype*)(oldPointsptr +
+                                                          oldPoints_offset +
+                                                          (yi+1)*oldPoints_step);
+    float3 p00 = prow0[xi+0].xyz;
+    float3 p01 = prow0[xi+1].xyz;
+    float3 p10 = prow1[xi+0].xyz;
+    float3 p11 = prow1[xi+1].xyz;
+
+    // NaN check is done later
+
+    __global const ptype* nrow0 = (__global const ptype*)(oldNormalsptr +
+                                                          oldNormals_offset +
+                                                          (yi+0)*oldNormals_step);
+    __global const ptype* nrow1 = (__global const ptype*)(oldNormalsptr +
+                                                          oldNormals_offset +
+                                                          (yi+1)*oldNormals_step);
+
+    float3 n00 = nrow0[xi+0].xyz;
+    float3 n01 = nrow0[xi+1].xyz;
+    float3 n10 = nrow1[xi+0].xyz;
+    float3 n11 = nrow1[xi+1].xyz;
+
+    // NaN check is done later
+
+    float3 p0 = mix(p00, p01, t.x);
+    float3 p1 = mix(p10, p11, t.x);
+    oldP = mix(p0, p1, t.y);
+
+    float3 n0 = mix(n00, n01, t.x);
+    float3 n1 = mix(n10, n11, t.x);
+    oldN = mix(n0, n1, t.y);
+
+    if( any(isnan(oldP)) || any(isnan(oldN)) ||
+        any(isinf(oldP)) || any(isinf(oldN)) )
+        return;
+
+    //filter by distance
+    float3 diff = newP - oldP;
+    if(dot(diff, diff) > sqDistanceThresh)
+        return;
+
+    //filter by angle
+    if(fabs(dot(newN, oldN)) < minCos)
+        return;
+
+    // build point-wise vector ab = [ A | b ]
+
+    float3 VxN = cross(newP, oldN);
+    float ab[7] = {VxN.x, VxN.y, VxN.z, oldN.x, oldN.y, oldN.z, -dot(oldN, diff)};
+
+    for(int i = 0; i < 7; i++)
+        ab7[i] = ab[i];
+}
+
+
+__kernel void getAb(__global const char * oldPointsptr,
+                    int oldPoints_step, int oldPoints_offset,
+                    __global const char * oldNormalsptr,
+                    int oldNormals_step, int oldNormals_offset,
+                    const int2 oldSize,
+                    __global const char * newPointsptr,
+                    int newPoints_step, int newPoints_offset,
+                    __global const char * newNormalsptr,
+                    int newNormals_step, int newNormals_offset,
+                    const int2 newSize,
+                    const float16 poseMatrix,
+                    const float2 fxy,
+                    const float2 cxy,
+                    const float sqDistanceThresh,
+                    const float minCos,
+                    __local float * reducebuf,
+                    __global char* groupedSumptr,
+                    int groupedSum_step, int groupedSum_offset
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    const int gx = get_group_id(0);
+    const int gy = get_group_id(1);
+    const int gw = get_num_groups(0);
+    const int gh = get_num_groups(1);
+
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+    const int lw = get_local_size(0);
+    const int lh = get_local_size(1);
+    const int lsz = lw*lh;
+    const int lid = lx + ly*lw;
+
+    float ab[7];
+    for(int i = 0; i < 7; i++)
+        ab[i] = 0;
+
+    calcAb7(oldPointsptr,
+            oldPoints_step, oldPoints_offset,
+            oldNormalsptr,
+            oldNormals_step, oldNormals_offset,
+            oldSize,
+            newPointsptr,
+            newPoints_step, newPoints_offset,
+            newNormalsptr,
+            newNormals_step, newNormals_offset,
+            newSize,
+            poseMatrix,
+            fxy, cxy,
+            sqDistanceThresh,
+            minCos,
+            ab);
+
+    // build point-wise upper-triangle matrix [ab^T * ab] w/o last row
+    // which is [A^T*A | A^T*b]
+    // and gather sum
+
+    __local float* upperTriangle = reducebuf + lid*UTSIZE;
+
+    int pos = 0;
+    for(int i = 0; i < 6; i++)
+    {
+        for(int j = i; j < 7; j++)
+        {
+            upperTriangle[pos++] = ab[i]*ab[j];
+        }
+    }
+
+    // reduce upperTriangle to local mem
+
+    // maxStep = ctz(lsz), ctz isn't supported on CUDA devices
+    const int c = clz(lsz & -lsz);
+    const int maxStep = c ? 31 - c : c;
+    for(int nstep = 1; nstep <= maxStep; nstep++)
+    {
+        if(lid % (1 << nstep) == 0)
+        {
+            __local float* rto   = reducebuf + UTSIZE*lid;
+            __local float* rfrom = reducebuf + UTSIZE*(lid+(1 << (nstep-1)));
+            for(int i = 0; i < UTSIZE; i++)
+                rto[i] += rfrom[i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // here group sum should be in reducebuf[0...UTSIZE]
+    if(lid == 0)
+    {
+        __global float* groupedRow = (__global float*)(groupedSumptr +
+                                                       groupedSum_offset +
+                                                       gy*groupedSum_step);
+
+        for(int i = 0; i < UTSIZE; i++)
+            groupedRow[gx*UTSIZE + i] = reducebuf[i];
+    }
+}
diff --git a/modules/3d/src/opencl/kinfu_frame.cl b/modules/3d/src/opencl/kinfu_frame.cl
new file mode 100644
index 0000000000..9ed66b7f5e
--- /dev/null
+++ b/modules/3d/src/opencl/kinfu_frame.cl
@@ -0,0 +1,288 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+inline float3 reproject(float3 p, float2 fxyinv, float2 cxy)
+{
+    float2 pp = p.z*(p.xy - cxy)*fxyinv;
+    return (float3)(pp, p.z);
+}
+
+typedef float4 ptype;
+
+__kernel void computePointsNormals(__global char * pointsptr,
+                                   int points_step, int points_offset,
+                                   __global char * normalsptr,
+                                   int normals_step, int normals_offset,
+                                   __global const char * depthptr,
+                                   int depth_step, int depth_offset,
+                                   int depth_rows, int depth_cols,
+                                   const float2 fxyinv,
+                                   const float2 cxy,
+                                   const float dfac
+                                    )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= depth_cols || y >= depth_rows)
+        return;
+
+    __global const float* row0 = (__global const float*)(depthptr + depth_offset +
+                                                         (y+0)*depth_step);
+    __global const float* row1 = (__global const float*)(depthptr + depth_offset +
+                                                         (y+1)*depth_step);
+
+    float d00 = row0[x];
+    float z00 = d00*dfac;
+    float3 p00 = (float3)(convert_float2((int2)(x, y)), z00);
+    float3 v00 = reproject(p00, fxyinv, cxy);
+
+    float3 p = nan((uint)0), n = nan((uint)0);
+
+    if(x < depth_cols - 1 && y < depth_rows - 1)
+    {
+        float d01 = row0[x+1];
+        float d10 = row1[x];
+
+        float z01 = d01*dfac;
+        float z10 = d10*dfac;
+
+        if(z00 != 0 && z01 != 0 && z10 != 0)
+        {
+            float3 p01 = (float3)(convert_float2((int2)(x+1, y+0)), z01);
+            float3 p10 = (float3)(convert_float2((int2)(x+0, y+1)), z10);
+            float3 v01 = reproject(p01, fxyinv, cxy);
+            float3 v10 = reproject(p10, fxyinv, cxy);
+
+            float3 vec = cross(v01 - v00, v10 - v00);
+            n = - normalize(vec);
+            p = v00;
+        }
+    }
+
+    __global float* pts = (__global float*)(pointsptr  +  points_offset + y*points_step  + x*sizeof(ptype));
+    __global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype));
+    vstore4((ptype)(p, 0), 0, pts);
+    vstore4((ptype)(n, 0), 0, nrm);
+}
+
+__kernel void pyrDownBilateral(__global const char * depthptr,
+                               int depth_step, int depth_offset,
+                               int depth_rows, int depth_cols,
+                               __global char * depthDownptr,
+                               int depthDown_step, int depthDown_offset,
+                               int depthDown_rows, int depthDown_cols,
+                               const float sigma
+                               )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= depthDown_cols || y >= depthDown_rows)
+        return;
+
+    const float sigma3 = sigma*3;
+    const int D = 5;
+
+    __global const float* srcCenterRow = (__global const float*)(depthptr + depth_offset +
+                                                                 (2*y)*depth_step);
+
+    float center = srcCenterRow[2*x];
+
+    int sx = max(0, 2*x - D/2), ex = min(2*x - D/2 + D, depth_cols-1);
+    int sy = max(0, 2*y - D/2), ey = min(2*y - D/2 + D, depth_rows-1);
+
+    float sum = 0;
+    int count = 0;
+
+    for(int iy = sy; iy < ey; iy++)
+    {
+        __global const float* srcRow = (__global const float*)(depthptr + depth_offset +
+                                                               (iy)*depth_step);
+        for(int ix = sx; ix < ex; ix++)
+        {
+            float val = srcRow[ix];
+            if(fabs(val - center) < sigma3)
+            {
+                sum += val; count++;
+            }
+        }
+    }
+
+    __global float* downRow = (__global float*)(depthDownptr + depthDown_offset +
+                                                y*depthDown_step + x*sizeof(float));
+
+    *downRow = (count == 0) ? 0 : sum/convert_float(count);
+}
+
+//TODO: remove bilateral when OpenCV performs 32f bilat with OpenCL
+
+__kernel void customBilateral(__global const char * srcptr,
+                              int src_step, int src_offset,
+                              __global char * dstptr,
+                              int dst_step, int dst_offset,
+                              const int2 frameSize,
+                              const int kernelSize,
+                              const float sigma_spatial2_inv_half,
+                              const float sigma_depth2_inv_half
+)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= frameSize.x || y >= frameSize.y)
+        return;
+
+    __global const float* srcCenterRow = (__global const float*)(srcptr + src_offset +
+                                                                 y*src_step);
+    float value = srcCenterRow[x];
+
+    int tx = min (x - kernelSize / 2 + kernelSize, frameSize.x - 1);
+    int ty = min (y - kernelSize / 2 + kernelSize, frameSize.y - 1);
+
+    float sum1 = 0;
+    float sum2 = 0;
+
+    for (int cy = max (y - kernelSize / 2, 0); cy < ty; ++cy)
+    {
+        __global const float* srcRow = (__global const float*)(srcptr + src_offset +
+                                                               cy*src_step);
+        for (int cx = max (x - kernelSize / 2, 0); cx < tx; ++cx)
+        {
+            float depth = srcRow[cx];
+
+            float space2 = convert_float((x - cx) * (x - cx) + (y - cy) * (y - cy));
+            float color2 = (value - depth) * (value - depth);
+
+            float weight = native_exp (-(space2 * sigma_spatial2_inv_half +
+                                         color2 * sigma_depth2_inv_half));
+
+            sum1 += depth * weight;
+            sum2 += weight;
+        }
+    }
+
+    __global float* dst = (__global float*)(dstptr + dst_offset +
+                                            y*dst_step + x*sizeof(float));
+    *dst = sum1/sum2;
+}
+
+__kernel void pyrDownPointsNormals(__global const char * pptr,
+                                   int p_step, int p_offset,
+                                   __global const char * nptr,
+                                   int n_step, int n_offset,
+                                   __global char * pdownptr,
+                                   int pdown_step, int pdown_offset,
+                                   __global char * ndownptr,
+                                   int ndown_step, int ndown_offset,
+                                   const int2 downSize
+                                   )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= downSize.x || y >= downSize.y)
+        return;
+
+    float3 point = nan((uint)0), normal = nan((uint)0);
+
+    __global const ptype* pUpRow0 = (__global const ptype*)(pptr + p_offset + (2*y  )*p_step);
+    __global const ptype* pUpRow1 = (__global const ptype*)(pptr + p_offset + (2*y+1)*p_step);
+
+    float3 d00 = pUpRow0[2*x  ].xyz;
+    float3 d01 = pUpRow0[2*x+1].xyz;
+    float3 d10 = pUpRow1[2*x  ].xyz;
+    float3 d11 = pUpRow1[2*x+1].xyz;
+
+    if(!(any(isnan(d00)) || any(isnan(d01)) ||
+         any(isnan(d10)) || any(isnan(d11))))
+    {
+        point = (d00 + d01 + d10 + d11)*0.25f;
+
+        __global const ptype* nUpRow0 = (__global const ptype*)(nptr + n_offset + (2*y  )*n_step);
+        __global const ptype* nUpRow1 = (__global const ptype*)(nptr + n_offset + (2*y+1)*n_step);
+
+        float3 n00 = nUpRow0[2*x  ].xyz;
+        float3 n01 = nUpRow0[2*x+1].xyz;
+        float3 n10 = nUpRow1[2*x  ].xyz;
+        float3 n11 = nUpRow1[2*x+1].xyz;
+
+        normal = (n00 + n01 + n10 + n11)*0.25f;
+    }
+
+    __global ptype* pts = (__global ptype*)(pdownptr + pdown_offset + y*pdown_step);
+    __global ptype* nrm = (__global ptype*)(ndownptr + ndown_offset + y*ndown_step);
+    pts[x] = (ptype)(point,  0);
+    nrm[x] = (ptype)(normal, 0);
+}
+
+typedef char4 pixelType;
+
+// 20 is fixed power
+float specPow20(float x)
+{
+    float x2 = x*x;
+    float x5 = x2*x2*x;
+    float x10 = x5*x5;
+    float x20 = x10*x10;
+    return x20;
+}
+
+__kernel void render(__global const char * pointsptr,
+                     int points_step, int points_offset,
+                     __global const char * normalsptr,
+                     int normals_step, int normals_offset,
+                     __global char * imgptr,
+                     int img_step, int img_offset,
+                     const int2 frameSize,
+                     const float4 lightPt
+                    )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= frameSize.x || y >= frameSize.y)
+        return;
+
+    __global const ptype* ptsRow = (__global const ptype*)(pointsptr  + points_offset  + y*points_step  + x*sizeof(ptype));
+    __global const ptype* nrmRow = (__global const ptype*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype));
+
+    float3 p = (*ptsRow).xyz;
+    float3 n = (*nrmRow).xyz;
+
+    pixelType color;
+
+    if(any(isnan(p)))
+    {
+        color = (pixelType)(0, 32, 0, 0);
+    }
+    else
+    {
+        const float Ka = 0.3f;  //ambient coeff
+        const float Kd = 0.5f;  //diffuse coeff
+        const float Ks = 0.2f;  //specular coeff
+        //const int   sp = 20;  //specular power, fixed in specPow20()
+
+        const float Ax = 1.f;   //ambient color,  can be RGB
+        const float Dx = 1.f;   //diffuse color,  can be RGB
+        const float Sx = 1.f;   //specular color, can be RGB
+        const float Lx = 1.f;   //light color
+
+        float3 l = normalize(lightPt.xyz - p);
+        float3 v = normalize(-p);
+        float3 r = normalize(2.f*n*dot(n, l) - l);
+
+        float val = (Ax*Ka*Dx + Lx*Kd*Dx*max(0.f, dot(n, l)) +
+                     Lx*Ks*Sx*specPow20(max(0.f, dot(r, v))));
+
+        uchar ix = convert_uchar(val*255.f);
+        color = (pixelType)(ix, ix, ix, 0);
+    }
+
+    __global char* imgRow = (__global char*)(imgptr + img_offset + y*img_step + x*sizeof(pixelType));
+    vstore4(color, 0, imgRow);
+}
diff --git a/modules/3d/src/opencl/tsdf.cl b/modules/3d/src/opencl/tsdf.cl
new file mode 100644
index 0000000000..fe7bb34501
--- /dev/null
+++ b/modules/3d/src/opencl/tsdf.cl
@@ -0,0 +1,856 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+typedef char int8_t;
+typedef int8_t TsdfType;
+typedef uchar WeightType;
+
+struct TsdfVoxel
+{
+    TsdfType tsdf;
+    WeightType weight;
+};
+
+static inline TsdfType floatToTsdf(float num)
+{
+    int8_t res = (int8_t) ( (num * (-128)) );
+    res = res ? res : (num < 0 ? 1 : -1);
+    return res;
+}
+
+static inline float tsdfToFloat(TsdfType num)
+{
+    return ( (float) num ) / (-128);
+}
+
+__kernel void integrate(__global const char * depthptr,
+                        int depth_step, int depth_offset,
+                        int depth_rows, int depth_cols,
+                        __global struct TsdfVoxel * volumeptr,
+                        __global const float * vol2camptr,
+                        const float voxelSize,
+                        const int4 volResolution4,
+                        const int4 volDims4,
+                        const float2 fxy,
+                        const float2 cxy,
+                        const float dfac,
+                        const float truncDist,
+                        const int maxWeight,
+                        const __global float * pixNorms)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    const int3 volResolution = volResolution4.xyz;
+
+    if(x >= volResolution.x || y >= volResolution.y)
+        return;
+
+    // coord-independent constants
+    const int3 volDims = volDims4.xyz;
+    const float2 limits = (float2)(depth_cols-1, depth_rows-1);
+
+    __global const float* vm = vol2camptr;
+    const float4 vol2cam0 = vload4(0, vm);
+    const float4 vol2cam1 = vload4(1, vm);
+    const float4 vol2cam2 = vload4(2, vm);
+
+    const float truncDistInv = 1.f/truncDist;
+
+    // optimization of camSpace transformation (vector addition instead of matmul at each z)
+    float4 inPt = (float4)(x*voxelSize, y*voxelSize, 0, 1);
+    float3 basePt = (float3)(dot(vol2cam0, inPt),
+                             dot(vol2cam1, inPt),
+                             dot(vol2cam2, inPt));
+
+    float3 camSpacePt = basePt;
+
+    // zStep == vol2cam*(float3(x, y, 1)*voxelSize) - basePt;
+    float3 zStep = ((float3)(vol2cam0.z, vol2cam1.z, vol2cam2.z))*voxelSize;
+
+    int volYidx = x*volDims.x + y*volDims.y;
+
+    int startZ, endZ;
+    if(fabs(zStep.z) > 1e-5f)
+    {
+        int baseZ = convert_int(-basePt.z / zStep.z);
+        if(zStep.z > 0)
+        {
+            startZ = baseZ;
+            endZ = volResolution.z;
+        }
+        else
+        {
+            startZ = 0;
+            endZ = baseZ;
+        }
+    }
+    else
+    {
+        if(basePt.z > 0)
+        {
+            startZ = 0; endZ = volResolution.z;
+        }
+        else
+        {
+            // z loop shouldn't be performed
+            //startZ = endZ = 0;
+            return;
+        }
+    }
+
+    startZ = max(0, startZ);
+    endZ = min(volResolution.z, endZ);
+
+    for(int z = startZ; z < endZ; z++)
+    {
+        // optimization of the following:
+        //float3 camSpacePt = vol2cam * ((float3)(x, y, z)*voxelSize);
+        camSpacePt += zStep;
+
+        if(camSpacePt.z <= 0)
+            continue;
+
+        float3 camPixVec = camSpacePt / camSpacePt.z;
+        float2 projected = mad(camPixVec.xy, fxy, cxy);
+
+        float v;
+        // bilinearly interpolate depth at projected
+        if(all(projected >= 0) && all(projected < limits))
+        {
+            float2 ip = floor(projected);
+            int xi = ip.x, yi = ip.y;
+
+            __global const float* row0 = (__global const float*)(depthptr + depth_offset +
+                                                                 (yi+0)*depth_step);
+            __global const float* row1 = (__global const float*)(depthptr + depth_offset +
+                                                                 (yi+1)*depth_step);
+
+            float v00 = row0[xi+0];
+            float v01 = row0[xi+1];
+            float v10 = row1[xi+0];
+            float v11 = row1[xi+1];
+            float4 vv = (float4)(v00, v01, v10, v11);
+
+            // assume correct depth is positive
+            if(all(vv > 0))
+            {
+                float2 t = projected - ip;
+                float2 vf = mix(vv.xz, vv.yw, t.x);
+                v = mix(vf.s0, vf.s1, t.y);
+            }
+            else
+                continue;
+        }
+        else
+            continue;
+
+        if(v == 0)
+            continue;
+
+        int idx = projected.y * depth_cols + projected.x;
+        float pixNorm = pixNorms[idx];
+        //float pixNorm = length(camPixVec);
+
+        // difference between distances of point and of surface to camera
+        float sdf = pixNorm*(v*dfac - camSpacePt.z);
+        // possible alternative is:
+        // float sdf = length(camSpacePt)*(v*dfac/camSpacePt.z - 1.0);
+
+        if(sdf >= -truncDist)
+        {
+            float tsdf = fmin(1.0f, sdf * truncDistInv);
+            int volIdx = volYidx + z*volDims.z;
+
+            struct TsdfVoxel voxel = volumeptr[volIdx];
+            float value  = tsdfToFloat(voxel.tsdf);
+            int weight = voxel.weight;
+
+            // update TSDF
+            value = (value*weight + tsdf) / (weight + 1);
+            weight = min(weight + 1, maxWeight);
+
+            voxel.tsdf = floatToTsdf(value);
+            voxel.weight = weight;
+            volumeptr[volIdx] = voxel;
+        }
+    }
+}
+
+
+inline float interpolateVoxel(float3 p, __global const struct TsdfVoxel* volumePtr,
+                              int3 volDims, int8 neighbourCoords)
+{
+    float3 fip = floor(p);
+    int3 ip = convert_int3(fip);
+    float3 t = p - fip;
+
+    int3 cmul = volDims*ip;
+    int coordBase = cmul.x + cmul.y + cmul.z;
+    int nco[8];
+    vstore8(neighbourCoords + coordBase, 0, nco);
+
+    float vaz[8];
+    for(int i = 0; i < 8; i++)
+        vaz[i] = tsdfToFloat(volumePtr[nco[i]].tsdf);
+
+    float8 vz = vload8(0, vaz);
+
+    float4 vy = mix(vz.s0246, vz.s1357, t.z);
+    float2 vx = mix(vy.s02, vy.s13, t.y);
+    return mix(vx.s0, vx.s1, t.x);
+}
+
+inline float3 getNormalVoxel(float3 p, __global const struct TsdfVoxel* volumePtr,
+                             int3 volResolution, int3 volDims, int8 neighbourCoords)
+{
+    if(any(p < 1) || any(p >= convert_float3(volResolution - 2)))
+        return nan((uint)0);
+
+    float3 fip = floor(p);
+    int3 ip = convert_int3(fip);
+    float3 t = p - fip;
+
+    int3 cmul = volDims*ip;
+    int coordBase = cmul.x + cmul.y + cmul.z;
+    int nco[8];
+    vstore8(neighbourCoords + coordBase, 0, nco);
+
+    int arDims[3];
+    vstore3(volDims, 0, arDims);
+    float an[3];
+    for(int c = 0; c < 3; c++)
+    {
+        int dim = arDims[c];
+
+        float vaz[8];
+        for(int i = 0; i < 8; i++)
+            vaz[i] = tsdfToFloat(volumePtr[nco[i] + dim].tsdf) -
+                     tsdfToFloat(volumePtr[nco[i] - dim].tsdf);
+
+        float8 vz = vload8(0, vaz);
+
+        float4 vy = mix(vz.s0246, vz.s1357, t.z);
+        float2 vx = mix(vy.s02, vy.s13, t.y);
+
+        an[c] = mix(vx.s0, vx.s1, t.x);
+    }
+
+    //gradientDeltaFactor is fixed at 1.0 of voxel size
+    float3 n = vload3(0, an);
+    float Norm = sqrt(n.x*n.x + n.y*n.y + n.z*n.z);
+    return Norm < 0.0001f ? nan((uint)0) : n / Norm;
+    //return fast_normalize(vload3(0, an));
+}
+
+typedef float4 ptype;
+
+__kernel void raycast(__global char * pointsptr,
+                      int points_step, int points_offset,
+                      __global char * normalsptr,
+                      int normals_step, int normals_offset,
+                      const int2 frameSize,
+                      __global const struct TsdfVoxel * volumeptr,
+                      __global const float * vol2camptr,
+                      __global const float * cam2volptr,
+                      const float2 fixy,
+                      const float2 cxy,
+                      const float4 boxDown4,
+                      const float4 boxUp4,
+                      const float tstep,
+                      const float voxelSize,
+                      const int4 volResolution4,
+                      const int4 volDims4,
+                      const int8 neighbourCoords
+                      )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= frameSize.x || y >= frameSize.y)
+        return;
+
+    // coordinate-independent constants
+
+    __global const float* cm = cam2volptr;
+    const float3 camRot0  = vload4(0, cm).xyz;
+    const float3 camRot1  = vload4(1, cm).xyz;
+    const float3 camRot2  = vload4(2, cm).xyz;
+    const float3 camTrans = (float3)(cm[3], cm[7], cm[11]);
+
+    __global const float* vm = vol2camptr;
+    const float3 volRot0  = vload4(0, vm).xyz;
+    const float3 volRot1  = vload4(1, vm).xyz;
+    const float3 volRot2  = vload4(2, vm).xyz;
+    const float3 volTrans = (float3)(vm[3], vm[7], vm[11]);
+
+    const float3 boxDown = boxDown4.xyz;
+    const float3 boxUp   = boxUp4.xyz;
+    const int3   volDims = volDims4.xyz;
+
+    const int3 volResolution = volResolution4.xyz;
+
+    const float invVoxelSize = native_recip(voxelSize);
+
+    // kernel itself
+
+    float3 point  = nan((uint)0);
+    float3 normal = nan((uint)0);
+
+    float3 orig = camTrans;
+
+    // get direction through pixel in volume space:
+    // 1. reproject (x, y) on projecting plane where z = 1.f
+    float3 planed = (float3)(((float2)(x, y) - cxy)*fixy, 1.f);
+
+    // 2. rotate to volume space
+    planed = (float3)(dot(planed, camRot0),
+                      dot(planed, camRot1),
+                      dot(planed, camRot2));
+
+    // 3. normalize
+    float3 dir = fast_normalize(planed);
+
+    // compute intersection of ray with all six bbox planes
+    float3 rayinv = native_recip(dir);
+    float3 tbottom = rayinv*(boxDown - orig);
+    float3 ttop    = rayinv*(boxUp   - orig);
+
+    // re-order intersections to find smallest and largest on each axis
+    float3 minAx = min(ttop, tbottom);
+    float3 maxAx = max(ttop, tbottom);
+
+    // near clipping plane
+    const float clip = 0.f;
+    float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip);
+    float tmax =     min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z));
+
+    // precautions against getting coordinates out of bounds
+    tmin = tmin + tstep;
+    tmax = tmax - tstep;
+
+    if(tmin < tmax)
+    {
+        // interpolation optimized a little
+        orig *= invVoxelSize;
+        dir  *= invVoxelSize;
+
+        float3 rayStep = dir*tstep;
+        float3 next = (orig + dir*tmin);
+        float f = interpolateVoxel(next, volumeptr, volDims, neighbourCoords);
+        float fnext = f;
+
+        // raymarch
+        int steps = 0;
+        int nSteps = floor(native_divide(tmax - tmin, tstep));
+        bool stop = false;
+        for(int i = 0; i < nSteps; i++)
+        {
+            // fix for wrong steps counting
+            if(!stop)
+            {
+                next += rayStep;
+
+                // fetch voxel
+                int3 ip = convert_int3(round(next));
+                int3 cmul = ip*volDims;
+                int idx = cmul.x + cmul.y + cmul.z;
+                fnext = tsdfToFloat(volumeptr[idx].tsdf);
+
+                if(fnext != f)
+                {
+                    fnext = interpolateVoxel(next, volumeptr, volDims, neighbourCoords);
+
+                    // when ray crosses a surface
+                    if(signbit(f) != signbit(fnext))
+                    {
+                        stop = true; continue;
+                    }
+
+                    f = fnext;
+                }
+                steps++;
+            }
+        }
+
+        // if ray penetrates a surface from outside
+        // linearly interpolate t between two f values
+        if(f > 0 && fnext < 0)
+        {
+            float3 tp = next - rayStep;
+            float ft   = interpolateVoxel(tp,   volumeptr, volDims, neighbourCoords);
+            float ftdt = interpolateVoxel(next, volumeptr, volDims, neighbourCoords);
+            // float t = tmin + steps*tstep;
+            // float ts = t - tstep*ft/(ftdt - ft);
+            float ts = tmin + tstep*(steps - native_divide(ft, ftdt - ft));
+
+            // avoid division by zero
+            if(!isnan(ts) && !isinf(ts))
+            {
+                float3 pv = orig + dir*ts;
+                float3 nv = getNormalVoxel(pv, volumeptr, volResolution, volDims, neighbourCoords);
+
+                if(!any(isnan(nv)))
+                {
+                    //convert pv and nv to camera space
+                    normal = (float3)(dot(nv, volRot0),
+                                      dot(nv, volRot1),
+                                      dot(nv, volRot2));
+                    // interpolation optimized a little
+                    pv *= voxelSize;
+                    point = (float3)(dot(pv, volRot0),
+                                     dot(pv, volRot1),
+                                     dot(pv, volRot2)) + volTrans;
+                }
+            }
+        }
+    }
+
+    __global float* pts = (__global float*)(pointsptr  +  points_offset + y*points_step  + x*sizeof(ptype));
+    __global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype));
+    vstore4((float4)(point,  0), 0, pts);
+    vstore4((float4)(normal, 0), 0, nrm);
+}
+
+
+__kernel void getNormals(__global const char * pointsptr,
+                         int points_step, int points_offset,
+                         __global char * normalsptr,
+                         int normals_step, int normals_offset,
+                         const int2 frameSize,
+                         __global const struct TsdfVoxel* volumeptr,
+                         __global const float * volPoseptr,
+                         __global const float * invPoseptr,
+                         const float voxelSizeInv,
+                         const int4 volResolution4,
+                         const int4 volDims4,
+                         const int8 neighbourCoords
+                         )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x >= frameSize.x || y >= frameSize.y)
+        return;
+
+    // coordinate-independent constants
+
+    __global const float* vp = volPoseptr;
+    const float3 volRot0  = vload4(0, vp).xyz;
+    const float3 volRot1  = vload4(1, vp).xyz;
+    const float3 volRot2  = vload4(2, vp).xyz;
+    const float3 volTrans = (float3)(vp[3], vp[7], vp[11]);
+
+    __global const float* iv = invPoseptr;
+    const float3 invRot0 = vload4(0, iv).xyz;
+    const float3 invRot1 = vload4(1, iv).xyz;
+    const float3 invRot2 = vload4(2, iv).xyz;
+    const float3 invTrans = (float3)(iv[3], iv[7], iv[11]);
+
+    const int3 volResolution = volResolution4.xyz;
+    const int3 volDims = volDims4.xyz;
+
+    // kernel itself
+
+    __global const ptype* ptsRow = (__global const ptype*)(pointsptr +
+                                                           points_offset +
+                                                           y*points_step);
+    float3 p = ptsRow[x].xyz;
+    float3 n = nan((uint)0);
+    if(!any(isnan(p)))
+    {
+        float3 voxPt = (float3)(dot(p, invRot0),
+                                dot(p, invRot1),
+                                dot(p, invRot2)) + invTrans;
+        voxPt = voxPt * voxelSizeInv;
+        n = getNormalVoxel(voxPt, volumeptr, volResolution, volDims, neighbourCoords);
+        n = (float3)(dot(n, volRot0),
+                     dot(n, volRot1),
+                     dot(n, volRot2));
+    }
+
+    __global float* nrm = (__global float*)(normalsptr +
+                                            normals_offset +
+                                            y*normals_step +
+                                            x*sizeof(ptype));
+
+    vstore4((float4)(n, 0), 0, nrm);
+}
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
+
+struct CoordReturn
+{
+    bool result;
+    float3 point;
+    float3 normal;
+};
+
+inline struct CoordReturn coord(int x, int y, int z, float3 V, float v0, int axis,
+                                __global const struct TsdfVoxel* volumeptr,
+                                int3 volResolution, int3 volDims,
+                                int8 neighbourCoords,
+                                float voxelSize, float voxelSizeInv,
+                                const float3 volRot0,
+                                const float3 volRot1,
+                                const float3 volRot2,
+                                const float3 volTrans,
+                                bool needNormals,
+                                bool scan
+                                )
+{
+    struct CoordReturn cr;
+
+    // 0 for x, 1 for y, 2 for z
+    bool limits = false;
+    int3 shift;
+    float Vc = 0.f;
+    if(axis == 0)
+    {
+        shift = (int3)(1, 0, 0);
+        limits = (x + 1 < volResolution.x);
+        Vc = V.x;
+    }
+    if(axis == 1)
+    {
+        shift = (int3)(0, 1, 0);
+        limits = (y + 1 < volResolution.y);
+        Vc = V.y;
+    }
+    if(axis == 2)
+    {
+        shift = (int3)(0, 0, 1);
+        limits = (z + 1 < volResolution.z);
+        Vc = V.z;
+    }
+
+    if(limits)
+    {
+        int3 ip = ((int3)(x, y, z)) + shift;
+        int3 cmul = ip*volDims;
+        int idx = cmul.x + cmul.y + cmul.z;
+
+        struct TsdfVoxel voxel = volumeptr[idx];
+        float vd  = tsdfToFloat(voxel.tsdf);
+        int weight = voxel.weight;
+
+        if(weight != 0 && vd != 1.f)
+        {
+            if((v0 > 0 && vd < 0) || (v0 < 0 && vd > 0))
+            {
+                // calc actual values or estimate amount of space
+                if(!scan)
+                {
+                    // linearly interpolate coordinate
+                    float Vn = Vc + voxelSize;
+                    float dinv = 1.f/(fabs(v0)+fabs(vd));
+                    float inter = (Vc*fabs(vd) + Vn*fabs(v0))*dinv;
+
+                    float3 p = (float3)(shift.x ? inter : V.x,
+                                        shift.y ? inter : V.y,
+                                        shift.z ? inter : V.z);
+
+                    cr.point = (float3)(dot(p, volRot0),
+                                        dot(p, volRot1),
+                                        dot(p, volRot2)) + volTrans;
+
+                    if(needNormals)
+                    {
+                        float3 nv = getNormalVoxel(p * voxelSizeInv,
+                                                   volumeptr, volResolution, volDims, neighbourCoords);
+
+                        cr.normal = (float3)(dot(nv, volRot0),
+                                             dot(nv, volRot1),
+                                             dot(nv, volRot2));
+                    }
+                }
+
+                cr.result = true;
+                return cr;
+            }
+        }
+    }
+
+    cr.result = false;
+    return cr;
+}
+
+
+__kernel void scanSize(__global const struct TsdfVoxel* volumeptr,
+                       const int4 volResolution4,
+                       const int4 volDims4,
+                       const int8 neighbourCoords,
+                       __global const float * volPoseptr,
+                       const float voxelSize,
+                       const float voxelSizeInv,
+                       __local int* reducebuf,
+                       __global char* groupedSumptr,
+                       int groupedSum_slicestep,
+                       int groupedSum_step, int groupedSum_offset
+                       )
+{
+    const int3 volDims = volDims4.xyz;
+    const int3 volResolution = volResolution4.xyz;
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    bool validVoxel = true;
+    if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z)
+        validVoxel = false;
+
+    const int gx = get_group_id(0);
+    const int gy = get_group_id(1);
+    const int gz = get_group_id(2);
+
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+    const int lz = get_local_id(2);
+    const int lw = get_local_size(0);
+    const int lh = get_local_size(1);
+    const int ld = get_local_size(2);
+    const int lsz = lw*lh*ld;
+    const int lid = lx + ly*lw + lz*lw*lh;
+
+    // coordinate-independent constants
+
+    __global const float* vp = volPoseptr;
+    const float3 volRot0  = vload4(0, vp).xyz;
+    const float3 volRot1  = vload4(1, vp).xyz;
+    const float3 volRot2  = vload4(2, vp).xyz;
+    const float3 volTrans = (float3)(vp[3], vp[7], vp[11]);
+
+    // kernel itself
+    int npts = 0;
+    if(validVoxel)
+    {
+        int3 ip = (int3)(x, y, z);
+        int3 cmul = ip*volDims;
+        int idx = cmul.x + cmul.y + cmul.z;
+        struct TsdfVoxel voxel = volumeptr[idx];
+        float value  = tsdfToFloat(voxel.tsdf);
+        int weight = voxel.weight;
+
+        // if voxel is not empty
+        if(weight != 0 && value != 1.f)
+        {
+            float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize;
+
+            #pragma unroll
+            for(int i = 0; i < 3; i++)
+            {
+                struct CoordReturn cr;
+                cr = coord(x, y, z, V, value, i,
+                           volumeptr, volResolution, volDims,
+                           neighbourCoords,
+                           voxelSize, voxelSizeInv,
+                           volRot0, volRot1, volRot2, volTrans,
+                           false, true);
+                if(cr.result)
+                {
+                    npts++;
+                }
+            }
+        }
+    }
+
+    // reducebuf keeps counters for each thread
+    reducebuf[lid] = npts;
+
+    // reduce counter to local mem
+
+    // maxStep = ctz(lsz), ctz isn't supported on CUDA devices
+    const int c = clz(lsz & -lsz);
+    const int maxStep = c ? 31 - c : c;
+    for(int nstep = 1; nstep <= maxStep; nstep++)
+    {
+        if(lid % (1 << nstep) == 0)
+        {
+            int rto   = lid;
+            int rfrom = lid + (1 << (nstep-1));
+            reducebuf[rto] += reducebuf[rfrom];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if(lid == 0)
+    {
+        __global int* groupedRow = (__global int*)(groupedSumptr +
+                                                   groupedSum_offset +
+                                                   gy*groupedSum_step +
+                                                   gz*groupedSum_slicestep);
+
+        groupedRow[gx] = reducebuf[0];
+    }
+}
+
+
+__kernel void fillPtsNrm(__global const struct TsdfVoxel* volumeptr,
+                         const int4 volResolution4,
+                         const int4 volDims4,
+                         const int8 neighbourCoords,
+                         __global const float * volPoseptr,
+                         const float voxelSize,
+                         const float voxelSizeInv,
+                         const int needNormals,
+                         __local float* localbuf,
+                         volatile __global int* atomicCtr,
+                         __global const char* groupedSumptr,
+                         int groupedSum_slicestep,
+                         int groupedSum_step, int groupedSum_offset,
+                         __global char * pointsptr,
+                         int points_step, int points_offset,
+                         __global char * normalsptr,
+                         int normals_step, int normals_offset
+                         )
+{
+    const int3 volDims = volDims4.xyz;
+    const int3 volResolution = volResolution4.xyz;
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    bool validVoxel = true;
+    if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z)
+        validVoxel = false;
+
+    const int gx = get_group_id(0);
+    const int gy = get_group_id(1);
+    const int gz = get_group_id(2);
+
+    __global int* groupedRow = (__global int*)(groupedSumptr +
+                                               groupedSum_offset +
+                                               gy*groupedSum_step +
+                                               gz*groupedSum_slicestep);
+
+    // this group contains 0 pts, skip it
+    int nptsGroup = groupedRow[gx];
+    if(nptsGroup == 0)
+        return;
+
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+    const int lz = get_local_id(2);
+    const int lw = get_local_size(0);
+    const int lh = get_local_size(1);
+    const int ld = get_local_size(2);
+    const int lsz = lw*lh*ld;
+    const int lid = lx + ly*lw + lz*lw*lh;
+
+    // coordinate-independent constants
+
+    __global const float* vp = volPoseptr;
+    const float3 volRot0  = vload4(0, vp).xyz;
+    const float3 volRot1  = vload4(1, vp).xyz;
+    const float3 volRot2  = vload4(2, vp).xyz;
+    const float3 volTrans = (float3)(vp[3], vp[7], vp[11]);
+
+    // kernel itself
+    int npts = 0;
+    float3 parr[3], narr[3];
+    if(validVoxel)
+    {
+        int3 ip = (int3)(x, y, z);
+        int3 cmul = ip*volDims;
+        int idx = cmul.x + cmul.y + cmul.z;
+        struct TsdfVoxel voxel = volumeptr[idx];
+        float value  = tsdfToFloat(voxel.tsdf);
+        int weight = voxel.weight;
+
+        // if voxel is not empty
+        if(weight != 0 && value != 1.f)
+        {
+            float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize;
+
+            #pragma unroll
+            for(int i = 0; i < 3; i++)
+            {
+                struct CoordReturn cr;
+                cr = coord(x, y, z, V, value, i,
+                           volumeptr, volResolution, volDims,
+                           neighbourCoords,
+                           voxelSize, voxelSizeInv,
+                           volRot0, volRot1, volRot2, volTrans,
+                           needNormals, false);
+
+                if(cr.result)
+                {
+                    parr[npts] = cr.point;
+                    narr[npts] = cr.normal;
+                    npts++;
+                }
+            }
+        }
+    }
+
+    // 4 floats per point or normal
+    const int elemStep = 4;
+
+    __local float* normAddr;
+    __local int localCtr;
+    if(lid == 0)
+        localCtr = 0;
+
+    // push all pts (and nrm) from private array to local mem
+    int privateCtr = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    privateCtr = atomic_add(&localCtr, npts);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for(int i = 0; i < npts; i++)
+    {
+        __local float* addr = localbuf + (privateCtr+i)*elemStep;
+        vstore4((float4)(parr[i], 0), 0, addr);
+    }
+
+    if(needNormals)
+    {
+        normAddr = localbuf + localCtr*elemStep;
+
+        for(int i = 0; i < npts; i++)
+        {
+            __local float* addr = normAddr + (privateCtr+i)*elemStep;
+            vstore4((float4)(narr[i], 0), 0, addr);
+        }
+    }
+
+    // debugging purposes
+    if(lid == 0)
+    {
+        if(localCtr != nptsGroup)
+        {
+            printf("!!! fetchPointsNormals result may be incorrect, npts != localCtr at %3d %3d %3d: %3d vs %3d\n",
+                   gx, gy, gz, localCtr, nptsGroup);
+        }
+    }
+
+    // copy local buffer to global mem
+    __local int whereToWrite;
+    if(lid == 0)
+        whereToWrite = atomic_add(atomicCtr, localCtr);
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
+    event_t ev[2];
+    int evn = 0;
+    // points and normals are 1-column matrices
+    __global float* pts = (__global float*)(pointsptr +
+                                            points_offset +
+                                            whereToWrite*points_step);
+    ev[evn++] = async_work_group_copy(pts, localbuf, localCtr*elemStep, 0);
+
+    if(needNormals)
+    {
+        __global float* nrm = (__global float*)(normalsptr +
+                                                normals_offset +
+                                                whereToWrite*normals_step);
+        ev[evn++] = async_work_group_copy(nrm, normAddr, localCtr*elemStep, 0);
+    }
+
+    wait_group_events(evn, ev);
+}
diff --git a/modules/3d/src/polynom_solver.cpp b/modules/3d/src/polynom_solver.cpp
index 201d09c439..48aa1f12b3 100644
--- a/modules/3d/src/polynom_solver.cpp
+++ b/modules/3d/src/polynom_solver.cpp
@@ -79,11 +79,11 @@ int solve_deg3(double a, double b, double c, double d,
 
   if (D <= 0) {
     // Three real roots
-    double theta = acos(R / sqrt(-Q3));
-    double sqrt_Q = sqrt(-Q);
-    x0 = 2 * sqrt_Q * cos(theta             / 3.0) - b_a_3;
-    x1 = 2 * sqrt_Q * cos((theta + 2 * CV_PI)/ 3.0) - b_a_3;
-    x2 = 2 * sqrt_Q * cos((theta + 4 * CV_PI)/ 3.0) - b_a_3;
+    double theta = std::acos(R / std::sqrt(-Q3));
+    double sqrt_Q = std::sqrt(-Q);
+    x0 = 2 * sqrt_Q * std::cos(theta             / 3.0) - b_a_3;
+    x1 = 2 * sqrt_Q * std::cos((theta + 2 * CV_PI)/ 3.0) - b_a_3;
+    x2 = 2 * sqrt_Q * std::cos((theta + 4 * CV_PI)/ 3.0) - b_a_3;
 
     return 3;
   }
diff --git a/modules/3d/src/precomp.hpp b/modules/3d/src/precomp.hpp
index 52cd86f3f5..7ff3d8d304 100644
--- a/modules/3d/src/precomp.hpp
+++ b/modules/3d/src/precomp.hpp
@@ -46,11 +46,35 @@
 
 #include "opencv2/core/private.hpp"
 
+#include "opencv2/core/affine.hpp"
+#include "opencv2/core/base.hpp"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/matx.hpp"
+#include "opencv2/core/quaternion.hpp"
+#include "opencv2/core/dualquaternion.hpp"
+#include "opencv2/core/types.hpp"
+#include "opencv2/core/utils/logger.hpp"
+#include "opencv2/core/utils/trace.hpp"
+
 #include "opencv2/3d.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/features2d.hpp"
 
 #include "opencv2/core/ocl.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+#include "opencv2/3d/detail/pose_graph.hpp"
+#include "opencv2/3d/detail/kinfu_frame.hpp"
+
+#include <atomic>
+#include <functional>
+#include <limits>
+#include <vector>
+#include <list>
+#include <set>
+#include <unordered_set>
+#include <map>
+#include <unordered_map>
 
 #define GET_OPTIMIZED(func) (func)
 
diff --git a/modules/3d/src/rgbd/colored_tsdf.cpp b/modules/3d/src/rgbd/colored_tsdf.cpp
new file mode 100644
index 0000000000..a1b17cf48d
--- /dev/null
+++ b/modules/3d/src/rgbd/colored_tsdf.cpp
@@ -0,0 +1,1024 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#include "../precomp.hpp"
+#include "colored_tsdf.hpp"
+
+#define USE_INTERPOLATION_IN_GETNORMAL 1
+
+namespace cv {
+
+ColoredTSDFVolume::ColoredTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                       int _maxWeight, Point3i _resolution, bool zFirstMemOrder)
+    : Volume(_voxelSize, _pose, _raycastStepFactor),
+      volResolution(_resolution),
+      maxWeight( WeightType(_maxWeight) )
+{
+    CV_Assert(_maxWeight < 255);
+    // Unlike original code, this should work with any volume size
+    // Not only when (x,y,z % 32) == 0
+    volSize   = Point3f(volResolution) * voxelSize;
+    truncDist = std::max(_truncDist, 2.1f * voxelSize);
+
+    // (xRes*yRes*zRes) array
+    // Depending on zFirstMemOrder arg:
+    // &elem(x, y, z) = data + x*zRes*yRes + y*zRes + z;
+    // &elem(x, y, z) = data + x + y*xRes + z*xRes*yRes;
+    int xdim, ydim, zdim;
+    if(zFirstMemOrder)
+    {
+        xdim = volResolution.z * volResolution.y;
+        ydim = volResolution.z;
+        zdim = 1;
+    }
+    else
+    {
+        xdim = 1;
+        ydim = volResolution.x;
+        zdim = volResolution.x * volResolution.y;
+    }
+
+    volDims = Vec4i(xdim, ydim, zdim);
+    neighbourCoords = Vec8i(
+        volDims.dot(Vec4i(0, 0, 0)),
+        volDims.dot(Vec4i(0, 0, 1)),
+        volDims.dot(Vec4i(0, 1, 0)),
+        volDims.dot(Vec4i(0, 1, 1)),
+        volDims.dot(Vec4i(1, 0, 0)),
+        volDims.dot(Vec4i(1, 0, 1)),
+        volDims.dot(Vec4i(1, 1, 0)),
+        volDims.dot(Vec4i(1, 1, 1))
+    );
+}
+
+class ColoredTSDFVolumeCPU : public ColoredTSDFVolume
+{
+public:
+    // dimension in voxels, size in meters
+    ColoredTSDFVolumeCPU(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                         int _maxWeight, Vec3i _resolution, bool zFirstMemOrder = true);
+    virtual void integrate(InputArray, float, const Matx44f&, const Matx33f&, const int) override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); };
+    virtual void integrate(InputArray _depth, InputArray _rgb, float depthFactor, const Matx44f& cameraPose,
+                           const Matx33f& depth_intrinsics, const Matx33f& rgb_intrinsics, const int frameId = 0) override;
+    virtual void raycast(const Matx44f& cameraPose, const Matx33f& depth_intrinsics, const Size& frameSize,
+                         OutputArray points, OutputArray normals, OutputArray colors) const override;
+    virtual void raycast(const Matx44f&, const Matx33f&, const Size&, OutputArray, OutputArray) const override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); };
+
+    virtual void fetchNormals(InputArray points, OutputArray _normals) const override;
+    virtual void fetchPointsNormals(OutputArray points, OutputArray normals) const override;
+
+    virtual void reset() override;
+    virtual RGBTsdfVoxel at(const Vec3i& volumeIdx) const;
+
+    float interpolateVoxel(const cv::Point3f& p) const;
+    Point3f getNormalVoxel(const cv::Point3f& p) const;
+    float interpolateColor(float tx, float ty, float tz, float vx[8]) const;
+    Point3f getColorVoxel(const cv::Point3f& p) const;
+
+#if USE_INTRINSICS
+    float interpolateVoxel(const v_float32x4& p) const;
+    v_float32x4 getNormalVoxel(const v_float32x4& p) const;
+    v_float32x4 getColorVoxel(const v_float32x4& p) const;
+#endif
+
+    Vec4i volStrides;
+    Vec6f frameParams;
+    Mat pixNorms;
+    // See zFirstMemOrder arg of parent class constructor
+    // for the array layout info
+    // Consist of Voxel elements
+    Mat volume;
+};
+
+// dimension in voxels, size in meters
+ColoredTSDFVolumeCPU::ColoredTSDFVolumeCPU(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor,
+                             float _truncDist, int _maxWeight, Vec3i _resolution,
+                             bool zFirstMemOrder)
+    : ColoredTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _resolution,
+                 zFirstMemOrder)
+{
+    int xdim, ydim, zdim;
+    if (zFirstMemOrder)
+    {
+        xdim = volResolution.z * volResolution.y;
+        ydim = volResolution.z;
+        zdim = 1;
+    }
+    else
+    {
+        xdim = 1;
+        ydim = volResolution.x;
+        zdim = volResolution.x * volResolution.y;
+    }
+    volStrides = Vec4i(xdim, ydim, zdim);
+
+    volume = Mat(1, volResolution.x * volResolution.y * volResolution.z, rawType<RGBTsdfVoxel>());
+
+    reset();
+}
+
+// zero volume, leave rest params the same
+void ColoredTSDFVolumeCPU::reset()
+{
+    CV_TRACE_FUNCTION();
+
+    volume.forEach<VecRGBTsdfVoxel>([](VecRGBTsdfVoxel& vv, const int* /* position */)
+    {
+        RGBTsdfVoxel& v = reinterpret_cast<RGBTsdfVoxel&>(vv);
+        v.tsdf = floatToTsdf(0.0f); v.weight = 0;
+    });
+}
+
+RGBTsdfVoxel ColoredTSDFVolumeCPU::at(const Vec3i& volumeIdx) const
+{
+    //! Out of bounds
+    if ((volumeIdx[0] >= volResolution.x || volumeIdx[0] < 0) ||
+        (volumeIdx[1] >= volResolution.y || volumeIdx[1] < 0) ||
+        (volumeIdx[2] >= volResolution.z || volumeIdx[2] < 0))
+    {
+        return RGBTsdfVoxel(floatToTsdf(1.f), 0, 160, 160, 160);
+    }
+
+    const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
+    int coordBase =
+        volumeIdx[0] * volDims[0] + volumeIdx[1] * volDims[1] + volumeIdx[2] * volDims[2];
+    return volData[coordBase];
+}
+
+// use depth instead of distance (optimization)
+void ColoredTSDFVolumeCPU::integrate(InputArray _depth, InputArray _rgb, float depthFactor, const Matx44f& cameraPose,
+                                     const Matx33f& _depth_intrinsics, const Matx33f& _rgb_intrinsics, const int frameId)
+{
+    CV_TRACE_FUNCTION();
+    CV_UNUSED(frameId);
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+    CV_Assert(!_depth.empty());
+    Depth depth = _depth.getMat();
+    Colors rgb = _rgb.getMat();
+    Intr depth_intrinsics(_depth_intrinsics);
+    Intr rgb_intrinsics(_rgb_intrinsics);
+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+                    depth_intrinsics.fx, depth_intrinsics.fy,
+                    depth_intrinsics.cx, depth_intrinsics.cy);
+    if (!(frameParams == newParams))
+    {
+        frameParams = newParams;
+        pixNorms = preCalculationPixNorm(depth.size(), depth_intrinsics);
+    }
+
+    integrateRGBVolumeUnit(truncDist, voxelSize, maxWeight, (this->pose).matrix, volResolution, volStrides, depth, rgb,
+        depthFactor, cameraPose, depth_intrinsics, rgb_intrinsics, pixNorms, volume);
+}
+
+#if USE_INTRINSICS
+// all coordinate checks should be done in inclosing cycle
+inline float ColoredTSDFVolumeCPU::interpolateVoxel(const Point3f& _p) const
+{
+    v_float32x4 p(_p.x, _p.y, _p.z, 0);
+    return interpolateVoxel(p);
+}
+
+inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
+{
+    // tx, ty, tz = floor(p)
+    v_int32x4 ip = v_floor(p);
+    v_float32x4 t = p - v_cvt_f32(ip);
+    float tx = t.get0();
+    t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+    float ty = t.get0();
+    t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+    float tz = t.get0();
+
+    int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+    const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
+
+    int ix = ip.get0();
+    ip = v_rotate_right<1>(ip);
+    int iy = ip.get0();
+    ip = v_rotate_right<1>(ip);
+    int iz = ip.get0();
+
+    int coordBase = ix * xdim + iy * ydim + iz * zdim;
+
+    TsdfType vx[8];
+    for (int i = 0; i < 8; i++)
+        vx[i] = volData[neighbourCoords[i] + coordBase].tsdf;
+
+    v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
+    v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
+    v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
+
+    v_float32x4 v00_10 = vxx;
+    v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
+
+    v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
+    float v0 = v0_1.get0();
+    v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
+    float v1 = v0_1.get0();
+
+    return v0 + tx * (v1 - v0);
+}
+#else
+inline float ColoredTSDFVolumeCPU::interpolateVoxel(const Point3f& p) const
+{
+    int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+
+    int ix = cvFloor(p.x);
+    int iy = cvFloor(p.y);
+    int iz = cvFloor(p.z);
+
+    float tx = p.x - ix;
+    float ty = p.y - iy;
+    float tz = p.z - iz;
+
+    int coordBase = ix*xdim + iy*ydim + iz*zdim;
+    const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
+
+    float vx[8];
+    for (int i = 0; i < 8; i++)
+        vx[i] = tsdfToFloat(volData[neighbourCoords[i] + coordBase].tsdf);
+
+    float v00 = vx[0] + tz*(vx[1] - vx[0]);
+    float v01 = vx[2] + tz*(vx[3] - vx[2]);
+    float v10 = vx[4] + tz*(vx[5] - vx[4]);
+    float v11 = vx[6] + tz*(vx[7] - vx[6]);
+
+    float v0 = v00 + ty*(v01 - v00);
+    float v1 = v10 + ty*(v11 - v10);
+
+    return v0 + tx*(v1 - v0);
+}
+#endif
+
+
+#if USE_INTRINSICS
+//gradientDeltaFactor is fixed at 1.0 of voxel size
+inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const
+{
+    v_float32x4 p(_p.x, _p.y, _p.z, 0.f);
+    v_float32x4 result = getNormalVoxel(p);
+    float CV_DECL_ALIGNED(16) ares[4];
+    v_store_aligned(ares, result);
+    return Point3f(ares[0], ares[1], ares[2]);
+}
+
+inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
+{
+    if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
+        v_check_any(p >= v_float32x4((float)(volResolution.x - 2),
+            (float)(volResolution.y - 2),
+            (float)(volResolution.z - 2), 1.f))
+        )
+        return nanv;
+
+    v_int32x4 ip = v_floor(p);
+    v_float32x4 t = p - v_cvt_f32(ip);
+    float tx = t.get0();
+    t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+    float ty = t.get0();
+    t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+    float tz = t.get0();
+
+    const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+    const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
+
+    int ix = ip.get0(); ip = v_rotate_right<1>(ip);
+    int iy = ip.get0(); ip = v_rotate_right<1>(ip);
+    int iz = ip.get0();
+
+    int coordBase = ix * xdim + iy * ydim + iz * zdim;
+
+    float CV_DECL_ALIGNED(16) an[4];
+    an[0] = an[1] = an[2] = an[3] = 0.f;
+    for (int c = 0; c < 3; c++)
+    {
+        const int dim = volDims[c];
+        float& nv = an[c];
+
+        float vx[8];
+        for (int i = 0; i < 8; i++)
+            vx[i] = tsdfToFloat(volData[neighbourCoords[i] + coordBase + 1 * dim].tsdf) -
+                    tsdfToFloat(volData[neighbourCoords[i] + coordBase - 1 * dim].tsdf);
+
+        v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]);
+        v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]);
+        v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
+
+        v_float32x4 v00_10 = vxx;
+        v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
+
+        v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
+        float v0 = v0_1.get0();
+        v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
+        float v1 = v0_1.get0();
+
+        nv = v0 + tx * (v1 - v0);
+    }
+
+    v_float32x4 n = v_load_aligned(an);
+    v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n)));
+
+    return Norm.get0() < 0.0001f ? nanv : n / Norm;
+}
+#else
+inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const
+{
+    const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+    const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
+
+    if(p.x < 1 || p.x >= volResolution.x - 2 ||
+       p.y < 1 || p.y >= volResolution.y - 2 ||
+       p.z < 1 || p.z >= volResolution.z - 2)
+        return nan3;
+
+    int ix = cvFloor(p.x);
+    int iy = cvFloor(p.y);
+    int iz = cvFloor(p.z);
+
+    float tx = p.x - ix;
+    float ty = p.y - iy;
+    float tz = p.z - iz;
+
+    int coordBase = ix*xdim + iy*ydim + iz*zdim;
+
+    Vec3f an;
+    for(int c = 0; c < 3; c++)
+    {
+        const int dim = volDims[c];
+        float& nv = an[c];
+
+        float vx[8];
+        for(int i = 0; i < 8; i++)
+            vx[i] = tsdfToFloat(volData[neighbourCoords[i] + coordBase + 1 * dim].tsdf) -
+                    tsdfToFloat(volData[neighbourCoords[i] + coordBase - 1 * dim].tsdf);
+
+        float v00 = vx[0] + tz*(vx[1] - vx[0]);
+        float v01 = vx[2] + tz*(vx[3] - vx[2]);
+        float v10 = vx[4] + tz*(vx[5] - vx[4]);
+        float v11 = vx[6] + tz*(vx[7] - vx[6]);
+
+        float v0 = v00 + ty*(v01 - v00);
+        float v1 = v10 + ty*(v11 - v10);
+
+        nv = v0 + tx*(v1 - v0);
+    }
+
+    float nv = sqrt(an[0] * an[0] +
+                    an[1] * an[1] +
+                    an[2] * an[2]);
+    return nv < 0.0001f ? nan3 : an / nv;
+}
+#endif
+
+#if USE_INTRINSICS
+inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz, float vx[8]) const
+{
+    v_float32x4 v0246, v1357;
+    v_load_deinterleave(vx, v0246, v1357);
+
+    v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
+
+    v_float32x4 v00_10 = vxx;
+    v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
+
+    v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
+    float v0 = v0_1.get0();
+    v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
+    float v1 = v0_1.get0();
+
+    return v0 + tx * (v1 - v0);
+}
+#else
+inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz, float vx[8]) const
+{
+    float v00 = vx[0] + tz * (vx[1] - vx[0]);
+    float v01 = vx[2] + tz * (vx[3] - vx[2]);
+    float v10 = vx[4] + tz * (vx[5] - vx[4]);
+    float v11 = vx[6] + tz * (vx[7] - vx[6]);
+
+    float v0 = v00 + ty * (v01 - v00);
+    float v1 = v10 + ty * (v11 - v10);
+
+    return v0 + tx * (v1 - v0);
+}
+#endif
+
+#if USE_INTRINSICS
+//gradientDeltaFactor is fixed at 1.0 of voxel size
+inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& _p) const
+{
+    v_float32x4 p(_p.x, _p.y, _p.z, 0.f);
+    v_float32x4 result = getColorVoxel(p);
+    float CV_DECL_ALIGNED(16) ares[4];
+    v_store_aligned(ares, result);
+    return Point3f(ares[0], ares[1], ares[2]);
+}
+inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const
+{
+    if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
+        v_check_any(p >= v_float32x4((float)(volResolution.x - 2),
+            (float)(volResolution.y - 2),
+            (float)(volResolution.z - 2), 1.f))
+        )
+        return nanv;
+
+    v_int32x4 ip = v_floor(p);
+
+    const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+    const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
+
+    int ix = ip.get0(); ip = v_rotate_right<1>(ip);
+    int iy = ip.get0(); ip = v_rotate_right<1>(ip);
+    int iz = ip.get0();
+
+    int coordBase = ix * xdim + iy * ydim + iz * zdim;
+    float CV_DECL_ALIGNED(16) rgb[4];
+
+#if USE_INTERPOLATION_IN_GETNORMAL
+    float r[8], g[8], b[8];
+    for (int i = 0; i < 8; i++)
+    {
+        r[i] = (float)volData[neighbourCoords[i] + coordBase].r;
+        g[i] = (float)volData[neighbourCoords[i] + coordBase].g;
+        b[i] = (float)volData[neighbourCoords[i] + coordBase].b;
+    }
+
+    v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv);
+    v_float32x4 ptVox = p * vsi;
+    v_int32x4 iptVox = v_floor(ptVox);
+    v_float32x4 t = ptVox - v_cvt_f32(iptVox);
+    float tx = t.get0(); t = v_rotate_right<1>(t);
+    float ty = t.get0(); t = v_rotate_right<1>(t);
+    float tz = t.get0();
+    rgb[0] = interpolateColor(tx, ty, tz, r);
+    rgb[1] = interpolateColor(tx, ty, tz, g);
+    rgb[2] = interpolateColor(tx, ty, tz, b);
+    rgb[3] = 0.f;
+#else
+    rgb[0] = volData[coordBase].r;
+    rgb[1] = volData[coordBase].g;
+    rgb[2] = volData[coordBase].b;
+    rgb[3] = 0.f;
+#endif
+    v_float32x4 res = v_load_aligned(rgb);
+    return res;
+}
+#else
+inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& p) const
+{
+    const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+    const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
+
+
+
+    if(p.x < 1 || p.x >= volResolution.x - 2 ||
+       p.y < 1 || p.y >= volResolution.y - 2 ||
+       p.z < 1 || p.z >= volResolution.z - 2)
+        return nan3;
+
+    int ix = cvFloor(p.x);
+    int iy = cvFloor(p.y);
+    int iz = cvFloor(p.z);
+
+    int coordBase = ix*xdim + iy*ydim + iz*zdim;
+    Point3f res;
+
+#if USE_INTERPOLATION_IN_GETNORMAL
+    // TODO: create better interpolation or remove this simple version
+    float r[8], g[8], b[8];
+    for (int i = 0; i < 8; i++)
+    {
+        r[i] = (float) volData[neighbourCoords[i] + coordBase].r;
+        g[i] = (float) volData[neighbourCoords[i] + coordBase].g;
+        b[i] = (float) volData[neighbourCoords[i] + coordBase].b;
+    }
+
+    Point3f ptVox = p * voxelSizeInv;
+    Vec3i iptVox(cvFloor(ptVox.x), cvFloor(ptVox.y), cvFloor(ptVox.z));
+    float tx = ptVox.x - iptVox[0];
+    float ty = ptVox.y - iptVox[1];
+    float tz = ptVox.z - iptVox[2];
+
+    res=Point3f(interpolateColor(tx, ty, tz, r),
+                interpolateColor(tx, ty, tz, g),
+                interpolateColor(tx, ty, tz, b));
+#else
+    res=Point3f(volData[coordBase].r, volData[coordBase].g, volData[coordBase].b);
+#endif
+    colorFix(res);
+    return res;
+}
+#endif
+
+struct ColorRaycastInvoker : ParallelLoopBody
+{
+    ColorRaycastInvoker(Points& _points, Normals& _normals, Colors& _colors, const Matx44f& cameraPose,
+                        const Intr& depth_intrinsics, const ColoredTSDFVolumeCPU& _volume) :
+        ParallelLoopBody(),
+        points(_points),
+        normals(_normals),
+        colors(_colors),
+        volume(_volume),
+        tstep(volume.truncDist * volume.raycastStepFactor),
+        // We do subtract voxel size to minimize checks after
+        // Note: origin of volume coordinate is placed
+        // in the center of voxel (0,0,0), not in the corner of the voxel!
+        boxMax(volume.volSize - Point3f(volume.voxelSize,
+                                        volume.voxelSize,
+                                        volume.voxelSize)),
+        boxMin(),
+        cam2vol(volume.pose.inv() * Affine3f(cameraPose)),
+        vol2cam(Affine3f(cameraPose.inv()) * volume.pose),
+        reprojDepth(depth_intrinsics.makeReprojector())
+    {  }
+#if USE_INTRINSICS
+    virtual void operator() (const Range& range) const override
+    {
+        const v_float32x4 vfxy(reprojDepth.fxinv, reprojDepth.fyinv, 0, 0);
+        const v_float32x4 vcxy(reprojDepth.cx, reprojDepth.cy, 0, 0);
+
+        const float(&cm)[16] = cam2vol.matrix.val;
+        const v_float32x4 camRot0(cm[0], cm[4], cm[8], 0);
+        const v_float32x4 camRot1(cm[1], cm[5], cm[9], 0);
+        const v_float32x4 camRot2(cm[2], cm[6], cm[10], 0);
+        const v_float32x4 camTrans(cm[3], cm[7], cm[11], 0);
+
+        const v_float32x4 boxDown(boxMin.x, boxMin.y, boxMin.z, 0.f);
+        const v_float32x4 boxUp(boxMax.x, boxMax.y, boxMax.z, 0.f);
+
+        const v_float32x4 invVoxelSize = v_float32x4(volume.voxelSizeInv,
+            volume.voxelSizeInv,
+            volume.voxelSizeInv, 1.f);
+
+        const float(&vm)[16] = vol2cam.matrix.val;
+        const v_float32x4 volRot0(vm[0], vm[4], vm[8], 0);
+        const v_float32x4 volRot1(vm[1], vm[5], vm[9], 0);
+        const v_float32x4 volRot2(vm[2], vm[6], vm[10], 0);
+        const v_float32x4 volTrans(vm[3], vm[7], vm[11], 0);
+
+        for (int y = range.start; y < range.end; y++)
+        {
+            ptype* ptsRow = points[y];
+            ptype* nrmRow = normals[y];
+            ptype* clrRow = colors[y];
+
+            for (int x = 0; x < points.cols; x++)
+            {
+                v_float32x4 point = nanv, normal = nanv, color = nanv;
+
+                v_float32x4 orig = camTrans;
+
+                // get direction through pixel in volume space:
+
+                // 1. reproject (x, y) on projecting plane where z = 1.f
+                v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy) * vfxy;
+                planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
+
+                // 2. rotate to volume space
+                planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
+
+                // 3. normalize
+                v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed * planed)));
+                v_float32x4 dir = planed * invNorm;
+
+                // compute intersection of ray with all six bbox planes
+                v_float32x4 rayinv = v_setall_f32(1.f) / dir;
+                // div by zero should be eliminated by these products
+                v_float32x4 tbottom = rayinv * (boxDown - orig);
+                v_float32x4 ttop = rayinv * (boxUp - orig);
+
+                // re-order intersections to find smallest and largest on each axis
+                v_float32x4 minAx = v_min(ttop, tbottom);
+                v_float32x4 maxAx = v_max(ttop, tbottom);
+
+                // near clipping plane
+                const float clip = 0.f;
+                float _minAx[4], _maxAx[4];
+                v_store(_minAx, minAx);
+                v_store(_maxAx, maxAx);
+                float tmin = max({ _minAx[0], _minAx[1], _minAx[2], clip });
+                float tmax = min({ _maxAx[0], _maxAx[1], _maxAx[2] });
+
+                // precautions against getting coordinates out of bounds
+                tmin = tmin + tstep;
+                tmax = tmax - tstep;
+
+                if (tmin < tmax)
+                {
+                    // interpolation optimized a little
+                    orig *= invVoxelSize;
+                    dir *= invVoxelSize;
+
+                    int xdim = volume.volDims[0];
+                    int ydim = volume.volDims[1];
+                    int zdim = volume.volDims[2];
+                    v_float32x4 rayStep = dir * v_setall_f32(tstep);
+                    v_float32x4 next = (orig + dir * v_setall_f32(tmin));
+                    float f = volume.interpolateVoxel(next), fnext = f;
+
+                    //raymarch
+                    int steps = 0;
+                    int nSteps = cvFloor((tmax - tmin) / tstep);
+                    for (; steps < nSteps; steps++)
+                    {
+                        next += rayStep;
+                        v_int32x4 ip = v_round(next);
+                        int ix = ip.get0(); ip = v_rotate_right<1>(ip);
+                        int iy = ip.get0(); ip = v_rotate_right<1>(ip);
+                        int iz = ip.get0();
+                        int coord = ix * xdim + iy * ydim + iz * zdim;
+
+                        fnext = tsdfToFloat(volume.volume.at<RGBTsdfVoxel>(coord).tsdf);
+                        if (fnext != f)
+                        {
+                            fnext = volume.interpolateVoxel(next);
+
+                            // when ray crosses a surface
+                            if (std::signbit(f) != std::signbit(fnext))
+                                break;
+
+                            f = fnext;
+                        }
+                    }
+
+                    // if ray penetrates a surface from outside
+                    // linearly interpolate t between two f values
+                    if (f > 0.f && fnext < 0.f)
+                    {
+                        v_float32x4 tp = next - rayStep;
+                        float ft = volume.interpolateVoxel(tp);
+                        float ftdt = volume.interpolateVoxel(next);
+                        float ts = tmin + tstep * (steps - ft / (ftdt - ft));
+
+                        // avoid division by zero
+                        if (!cvIsNaN(ts) && !cvIsInf(ts))
+                        {
+                            v_float32x4 pv = (orig + dir * v_setall_f32(ts));
+                            v_float32x4 nv = volume.getNormalVoxel(pv);
+                            v_float32x4 cv = volume.getColorVoxel(pv);
+
+                            if (!isNaN(nv))
+                            {
+                                color = cv;
+                                //convert pv and nv to camera space
+                                normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32());
+                                // interpolation optimized a little
+                                point = v_matmuladd(pv * v_float32x4(volume.voxelSize,
+                                    volume.voxelSize,
+                                    volume.voxelSize, 1.f),
+                                    volRot0, volRot1, volRot2, volTrans);
+                            }
+                        }
+                    }
+                }
+
+                v_store((float*)(&ptsRow[x]), point);
+                v_store((float*)(&nrmRow[x]), normal);
+                v_store((float*)(&clrRow[x]), color);
+            }
+        }
+    }
+#else
+    virtual void operator() (const Range& range) const override
+    {
+        const Point3f camTrans = cam2vol.translation();
+        const Matx33f  camRot  = cam2vol.rotation();
+        const Matx33f  volRot  = vol2cam.rotation();
+
+        for(int y = range.start; y < range.end; y++)
+        {
+            ptype* ptsRow = points[y];
+            ptype* nrmRow = normals[y];
+            ptype* clrRow = colors[y];
+
+            for(int x = 0; x < points.cols; x++)
+            {
+                Point3f point = nan3, normal = nan3, color = nan3;
+
+                Point3f orig = camTrans;
+                // direction through pixel in volume space
+                Point3f dir = normalize(Vec3f(camRot * reprojDepth(Point3f(float(x), float(y), 1.f))));
+
+                // compute intersection of ray with all six bbox planes
+                Vec3f rayinv(1.f/dir.x, 1.f/dir.y, 1.f/dir.z);
+                Point3f tbottom = rayinv.mul(boxMin - orig);
+                Point3f ttop    = rayinv.mul(boxMax - orig);
+
+                // re-order intersections to find smallest and largest on each axis
+                Point3f minAx(min(ttop.x, tbottom.x), min(ttop.y, tbottom.y), min(ttop.z, tbottom.z));
+                Point3f maxAx(max(ttop.x, tbottom.x), max(ttop.y, tbottom.y), max(ttop.z, tbottom.z));
+
+                // near clipping plane
+                const float clip = 0.f;
+                //float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip);
+                //float tmax =     min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z));
+                float tmin = max({ minAx.x, minAx.y, minAx.z, clip });
+                float tmax = min({ maxAx.x, maxAx.y, maxAx.z });
+
+                // precautions against getting coordinates out of bounds
+                tmin = tmin + tstep;
+                tmax = tmax - tstep;
+
+                if(tmin < tmax)
+                {
+                    // interpolation optimized a little
+                    orig = orig*volume.voxelSizeInv;
+                    dir  =  dir*volume.voxelSizeInv;
+
+                    Point3f rayStep = dir * tstep;
+                    Point3f next = (orig + dir * tmin);
+                    float f = volume.interpolateVoxel(next), fnext = f;
+
+                    //raymarch
+                    int steps = 0;
+                    int nSteps = int(floor((tmax - tmin)/tstep));
+                    for(; steps < nSteps; steps++)
+                    {
+                        next += rayStep;
+                        int xdim = volume.volDims[0];
+                        int ydim = volume.volDims[1];
+                        int zdim = volume.volDims[2];
+                        int ix = cvRound(next.x);
+                        int iy = cvRound(next.y);
+                        int iz = cvRound(next.z);
+                        fnext = tsdfToFloat(volume.volume.at<RGBTsdfVoxel>(ix*xdim + iy*ydim + iz*zdim).tsdf);
+                        if(fnext != f)
+                        {
+                            fnext = volume.interpolateVoxel(next);
+                            // when ray crosses a surface
+                            if(std::signbit(f) != std::signbit(fnext))
+                                break;
+
+                            f = fnext;
+                        }
+                    }
+                    // if ray penetrates a surface from outside
+                    // linearly interpolate t between two f values
+                    if(f > 0.f && fnext < 0.f)
+                    {
+                        Point3f tp    = next - rayStep;
+                        float ft   = volume.interpolateVoxel(tp);
+                        float ftdt = volume.interpolateVoxel(next);
+                        // float t = tmin + steps*tstep;
+                        // float ts = t - tstep*ft/(ftdt - ft);
+                        float ts = tmin + tstep*(steps - ft/(ftdt - ft));
+
+                        // avoid division by zero
+                        if(!cvIsNaN(ts) && !cvIsInf(ts))
+                        {
+                            Point3f pv = (orig + dir*ts);
+                            Point3f nv = volume.getNormalVoxel(pv);
+                            Point3f cv = volume.getColorVoxel(pv);
+                            if(!isNaN(nv))
+                            {
+                                //convert pv and nv to camera space
+                                normal = volRot * nv;
+                                color = cv;
+                                // interpolation optimized a little
+                                point = vol2cam * (pv*volume.voxelSize);
+                            }
+                        }
+                    }
+                }
+                ptsRow[x] = toPtype(point);
+                nrmRow[x] = toPtype(normal);
+                clrRow[x] = toPtype(color);
+            }
+        }
+    }
+#endif
+
+    Points& points;
+    Normals& normals;
+    Colors& colors;
+    const ColoredTSDFVolumeCPU& volume;
+
+    const float tstep;
+
+    const Point3f boxMax;
+    const Point3f boxMin;
+
+    const Affine3f cam2vol;
+    const Affine3f vol2cam;
+    const Intr::Reprojector reprojDepth;
+};
+
+
+void ColoredTSDFVolumeCPU::raycast(const Matx44f& cameraPose, const Matx33f& depth_intrinsics, const Size& frameSize,
+                                   OutputArray _points, OutputArray _normals, OutputArray _colors) const
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(frameSize.area() > 0);
+
+    _points.create (frameSize, POINT_TYPE);
+    _normals.create(frameSize, POINT_TYPE);
+    _colors.create(frameSize, POINT_TYPE);
+
+    Points points   =  _points.getMat();
+    Normals normals = _normals.getMat();
+    Colors colors = _colors.getMat();
+    ColorRaycastInvoker ri(points, normals, colors, cameraPose, Intr(depth_intrinsics), *this);
+
+    const int nstripes = -1;
+    parallel_for_(Range(0, points.rows), ri, nstripes);
+}
+
+
+struct ColorFetchPointsNormalsInvoker : ParallelLoopBody
+{
+    ColorFetchPointsNormalsInvoker(const ColoredTSDFVolumeCPU& _volume,
+                              std::vector<std::vector<ptype>>& _pVecs,
+                              std::vector<std::vector<ptype>>& _nVecs,
+                              bool _needNormals) :
+        ParallelLoopBody(),
+        vol(_volume),
+        pVecs(_pVecs),
+        nVecs(_nVecs),
+        needNormals(_needNormals)
+    {
+        volDataStart = vol.volume.ptr<RGBTsdfVoxel>();
+    }
+
+    inline void coord(std::vector<ptype>& points, std::vector<ptype>& normals,
+                      int x, int y, int z, Point3f V, float v0, int axis) const
+    {
+        // 0 for x, 1 for y, 2 for z
+        bool limits = false;
+        Point3i shift;
+        float Vc = 0.f;
+        if(axis == 0)
+        {
+            shift  = Point3i(1, 0, 0);
+            limits = (x + 1 < vol.volResolution.x);
+            Vc     = V.x;
+        }
+        if(axis == 1)
+        {
+            shift  = Point3i(0, 1, 0);
+            limits = (y + 1 < vol.volResolution.y);
+            Vc     = V.y;
+        }
+        if(axis == 2)
+        {
+            shift  = Point3i(0, 0, 1);
+            limits = (z + 1 < vol.volResolution.z);
+            Vc     = V.z;
+        }
+
+        if(limits)
+        {
+            const RGBTsdfVoxel& voxeld = volDataStart[(x+shift.x)*vol.volDims[0] +
+                                                      (y+shift.y)*vol.volDims[1] +
+                                                      (z+shift.z)*vol.volDims[2]];
+            float vd = tsdfToFloat(voxeld.tsdf);
+
+            if(voxeld.weight != 0 && vd != 1.f)
+            {
+                if((v0 > 0 && vd < 0) || (v0 < 0 && vd > 0))
+                {
+                    //linearly interpolate coordinate
+                    float Vn    = Vc + vol.voxelSize;
+                    float dinv  = 1.f/(abs(v0)+abs(vd));
+                    float inter = (Vc*abs(vd) + Vn*abs(v0))*dinv;
+
+                    Point3f p(shift.x ? inter : V.x,
+                              shift.y ? inter : V.y,
+                              shift.z ? inter : V.z);
+                    {
+                        points.push_back(toPtype(vol.pose * p));
+                        if(needNormals)
+                            normals.push_back(toPtype(vol.pose.rotation() *
+                                                      vol.getNormalVoxel(p*vol.voxelSizeInv)));
+                    }
+                }
+            }
+        }
+    }
+
+    virtual void operator() (const Range& range) const override
+    {
+        std::vector<ptype> points, normals;
+        for(int x = range.start; x < range.end; x++)
+        {
+            const RGBTsdfVoxel* volDataX = volDataStart + x*vol.volDims[0];
+            for(int y = 0; y < vol.volResolution.y; y++)
+            {
+                const RGBTsdfVoxel* volDataY = volDataX + y*vol.volDims[1];
+                for(int z = 0; z < vol.volResolution.z; z++)
+                {
+                    const RGBTsdfVoxel& voxel0 = volDataY[z*vol.volDims[2]];
+                    float v0             = tsdfToFloat(voxel0.tsdf);
+                    if(voxel0.weight != 0 && v0 != 1.f)
+                    {
+                        Point3f V(Point3f((float)x + 0.5f, (float)y + 0.5f, (float)z + 0.5f)*vol.voxelSize);
+
+                        coord(points, normals, x, y, z, V, v0, 0);
+                        coord(points, normals, x, y, z, V, v0, 1);
+                        coord(points, normals, x, y, z, V, v0, 2);
+
+                    } // if voxel is not empty
+                }
+            }
+        }
+
+        AutoLock al(mutex);
+        pVecs.push_back(points);
+        nVecs.push_back(normals);
+    }
+
+    const ColoredTSDFVolumeCPU& vol;
+    std::vector<std::vector<ptype>>& pVecs;
+    std::vector<std::vector<ptype>>& nVecs;
+    const RGBTsdfVoxel* volDataStart;
+    bool needNormals;
+    mutable Mutex mutex;
+};
+
+void ColoredTSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if(_points.needed())
+    {
+        std::vector<std::vector<ptype>> pVecs, nVecs;
+        ColorFetchPointsNormalsInvoker fi(*this, pVecs, nVecs, _normals.needed());
+        Range range(0, volResolution.x);
+        const int nstripes = -1;
+        parallel_for_(range, fi, nstripes);
+        std::vector<ptype> points, normals;
+        for(size_t i = 0; i < pVecs.size(); i++)
+        {
+            points.insert(points.end(), pVecs[i].begin(), pVecs[i].end());
+            normals.insert(normals.end(), nVecs[i].begin(), nVecs[i].end());
+        }
+
+        _points.create((int)points.size(), 1, POINT_TYPE);
+        if(!points.empty())
+            Mat((int)points.size(), 1, POINT_TYPE, &points[0]).copyTo(_points.getMat());
+
+        if(_normals.needed())
+        {
+            _normals.create((int)normals.size(), 1, POINT_TYPE);
+            if(!normals.empty())
+                Mat((int)normals.size(), 1, POINT_TYPE, &normals[0]).copyTo(_normals.getMat());
+        }
+    }
+}
+
+void ColoredTSDFVolumeCPU::fetchNormals(InputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(!_points.empty());
+    if(_normals.needed())
+    {
+        Points points = _points.getMat();
+        CV_Assert(points.type() == POINT_TYPE);
+
+        _normals.createSameSize(_points, _points.type());
+        Normals normals = _normals.getMat();
+
+        const ColoredTSDFVolumeCPU& _vol = *this;
+        auto PushNormals = [&](const ptype& pp, const int* position)
+        {
+            const ColoredTSDFVolumeCPU& vol(_vol);
+            Affine3f invPose(vol.pose.inv());
+            Point3f p = fromPtype(pp);
+            Point3f n = nan3;
+            if (!isNaN(p))
+            {
+                Point3f voxPt = (invPose * p);
+                voxPt = voxPt * vol.voxelSizeInv;
+                n = vol.pose.rotation() * vol.getNormalVoxel(voxPt);
+            }
+            normals(position[0], position[1]) = toPtype(n);
+        };
+        points.forEach(PushNormals);
+    }
+}
+
+Ptr<ColoredTSDFVolume> makeColoredTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor,
+                                   float _truncDist, int _maxWeight, Point3i _resolution)
+{
+    return makePtr<ColoredTSDFVolumeCPU>(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _resolution);
+}
+
+Ptr<ColoredTSDFVolume> makeColoredTSDFVolume(const VolumeParams& _params)
+{
+    Mat pm = _params.pose;
+    CV_Assert(pm.size() == Size(4, 4));
+    CV_Assert(pm.type() == CV_32F);
+    Matx44f pose = pm;
+
+    cv::Point3i resolution(_params.resolutionX, _params.resolutionY, _params.resolutionZ);
+    return makePtr<ColoredTSDFVolumeCPU>(_params.voxelSize, pose, _params.raycastStepFactor,
+                                         _params.tsdfTruncDist, _params.maxWeight, resolution);
+}
+
+} // namespace cv
diff --git a/modules/3d/src/rgbd/colored_tsdf.hpp b/modules/3d/src/rgbd/colored_tsdf.hpp
new file mode 100644
index 0000000000..c66e01e535
--- /dev/null
+++ b/modules/3d/src/rgbd/colored_tsdf.hpp
@@ -0,0 +1,39 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_COLORED_TSDF_HPP
+#define OPENCV_3D_COLORED_TSDF_HPP
+
+#include "../precomp.hpp"
+#include "tsdf_functions.hpp"
+
+namespace cv
+{
+
+class ColoredTSDFVolume : public Volume
+{
+   public:
+    // dimension in voxels, size in meters
+    ColoredTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                      int _maxWeight, Point3i _resolution, bool zFirstMemOrder = true);
+    virtual ~ColoredTSDFVolume() = default;
+
+   public:
+
+    Point3i volResolution;
+    WeightType maxWeight;
+
+    Point3f volSize;
+    float truncDist;
+    Vec4i volDims;
+    Vec8i neighbourCoords;
+};
+
+Ptr<ColoredTSDFVolume> makeColoredTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor,
+                                             float _truncDist, int _maxWeight, Point3i _resolution);
+Ptr<ColoredTSDFVolume> makeColoredTSDFVolume(const VolumeParams& _params);
+
+}  // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/src/rgbd/depth_registration.cpp b/modules/3d/src/rgbd/depth_registration.cpp
new file mode 100644
index 0000000000..ad641ba280
--- /dev/null
+++ b/modules/3d/src/rgbd/depth_registration.cpp
@@ -0,0 +1,315 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../precomp.hpp"
+
+namespace cv
+{
+
+///////////////////////////////////////////////////////////////////////////////////
+
+// Our three input types have a different value for a depth pixel with no depth
+template<typename DepthDepth>
+inline DepthDepth noDepthSentinelValue()
+{
+    return 0;
+}
+
+template<>
+inline float
+noDepthSentinelValue<float>()
+{
+    return std::numeric_limits<float>::quiet_NaN();
+}
+
+template<>
+inline double
+noDepthSentinelValue<double>()
+{
+    return std::numeric_limits<double>::quiet_NaN();
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+// Testing for depth pixels with no depth isn't straightforward for NaN values. We
+// need to specialize the equality check for floats and doubles.
+template<typename DepthDepth>
+inline bool
+isEqualToNoDepthSentinelValue(const DepthDepth& value)
+{
+    return value == noDepthSentinelValue<DepthDepth>();
+}
+
+template<>
+inline bool
+isEqualToNoDepthSentinelValue<float>(const float& value)
+{
+    return cvIsNaN(value) != 0;
+}
+
+template<>
+inline bool
+isEqualToNoDepthSentinelValue<double>(const double& value)
+{
+    return cvIsNaN(value) != 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+
+// When using the unsigned short representation, we'd like to round the values to the nearest
+// integer value. The float/double representations don't need to be rounded
+template<typename DepthDepth>
+inline DepthDepth
+floatToInputDepth(const float& value)
+{
+    return (DepthDepth)value;
+}
+
+template<>
+inline unsigned short
+floatToInputDepth<unsigned short>(const float& value)
+{
+    return (unsigned short)(value + 0.5);
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+
+/** Computes a registered depth image from an unregistered image.
+ *
+ * @param unregisteredDepth the input depth data
+ * @param unregisteredCameraMatrix the camera matrix of the depth camera
+ * @param registeredCameraMatrix the camera matrix of the external camera
+ * @param registeredDistCoeffs the distortion coefficients of the external camera
+ * @param rbtRgb2Depth the rigid body transform between the cameras.
+ * @param outputImagePlaneSize the image plane dimensions of the external camera (width, height)
+ * @param depthDilation whether or not the depth is dilated to avoid holes and occlusion errors
+ * @param inputDepthToMetersScale the scale needed to transform the input depth units to meters
+ * @param registeredDepth the result of transforming the depth into the external camera
+ */
+template<typename DepthDepth>
+void performRegistration(const Mat_<DepthDepth>& unregisteredDepth,
+                         const Matx33f& unregisteredCameraMatrix,
+                         const Matx33f& registeredCameraMatrix,
+                         const Mat_<float>& registeredDistCoeffs,
+                         const Matx44f& rbtRgb2Depth,
+                         const Size outputImagePlaneSize,
+                         const bool depthDilation,
+                         const float inputDepthToMetersScale,
+                         Mat& registeredDepth)
+{
+    // Create output Mat of the correct type, filled with an initial value indicating no depth
+    registeredDepth = Mat_<DepthDepth>(outputImagePlaneSize, noDepthSentinelValue<DepthDepth>());
+
+    // Figure out whether we'll have to apply a distortion
+    bool hasDistortion = (countNonZero(registeredDistCoeffs) > 0);
+
+    // A point (i,j,1) will have to be converted to 3d first, by multiplying it by K.inv()
+    // It will then be transformed by rbtRgb2Depth.
+    // Finally, it will be projected into the external camera via registeredCameraMatrix and
+    // its distortion coefficients. If there is no distortion in the external camera, we
+    // can linearly chain all three operations together.
+
+    Matx44f K = Matx44f::zeros();
+    for (unsigned char j = 0; j < 3; ++j)
+        for (unsigned char i = 0; i < 3; ++i)
+        {
+            K(j, i) = unregisteredCameraMatrix(j, i);
+        }
+    K(3, 3) = 1;
+
+    Matx44f initialProjection;
+    if (hasDistortion)
+    {
+        // The projection into the external camera will be done separately with distortion
+        initialProjection = rbtRgb2Depth * K.inv();
+    }
+    else
+    {
+        // No distortion, so all operations can be chained
+        initialProjection = Matx44f::zeros();
+        for (unsigned char j = 0; j < 3; ++j)
+            for (unsigned char i = 0; i < 3; ++i)
+                initialProjection(j, i) = registeredCameraMatrix(j, i);
+        initialProjection(3, 3) = 1;
+
+        initialProjection = initialProjection * rbtRgb2Depth * K.inv();
+    }
+
+    // Apply the initial projection to the input depth
+    Mat_<Point3f> transformedCloud;
+    {
+        Mat_<Point3f> point_tmp(outputImagePlaneSize, Point3f(0., 0., 0.));
+        for (int j = 0; j < unregisteredDepth.rows; ++j)
+        {
+            const DepthDepth* unregisteredDepthPtr = unregisteredDepth[j];
+
+            Point3f* point = point_tmp[j];
+            for (int i = 0; i < unregisteredDepth.cols; ++i, ++unregisteredDepthPtr, ++point)
+            {
+                float rescaled_depth = float(*unregisteredDepthPtr) * inputDepthToMetersScale;
+
+                // If the DepthDepth is of type unsigned short, zero is a sentinel value to indicate
+                // no depth. CV_32F and CV_64F should already have NaN for no depth values.
+                if (rescaled_depth == 0)
+                {
+                    rescaled_depth = std::numeric_limits<float>::quiet_NaN();
+                }
+
+                point->x = i * rescaled_depth;
+                point->y = j * rescaled_depth;
+                point->z = rescaled_depth;
+            }
+        }
+
+        perspectiveTransform(point_tmp, transformedCloud, initialProjection);
+    }
+
+    std::vector<Point2f> transformedAndProjectedPoints(transformedCloud.cols);
+    const float metersToInputUnitsScale = 1 / inputDepthToMetersScale;
+    const Rect registeredDepthBounds(Point(), outputImagePlaneSize);
+
+    for (int y = 0; y < transformedCloud.rows; y++)
+    {
+        if (hasDistortion)
+        {
+
+            // Project an entire row of points with distortion.
+            // Doing this for the entire image at once would require more memory.
+            projectPoints(transformedCloud.row(y),
+                          Vec3f(0, 0, 0),
+                          Vec3f(0, 0, 0),
+                          registeredCameraMatrix,
+                          registeredDistCoeffs,
+                          transformedAndProjectedPoints);
+        }
+        else
+        {
+            // With no distortion, we just have to dehomogenize the point since all major transforms
+            // already happened with initialProjection.
+            Point2f* point2d = &transformedAndProjectedPoints[0];
+            const Point2f* point2d_end = point2d + transformedAndProjectedPoints.size();
+            const Point3f* point3d = transformedCloud[y];
+            for (; point2d < point2d_end; ++point2d, ++point3d)
+            {
+                point2d->x = point3d->x / point3d->z;
+                point2d->y = point3d->y / point3d->z;
+            }
+        }
+
+        const Point2f* outputProjectedPoint = &transformedAndProjectedPoints[0];
+        const Point3f* p = transformedCloud[y], * p_end = p + transformedCloud.cols;
+
+        for (; p < p_end; ++outputProjectedPoint, ++p)
+        {
+            // Skip this one if there isn't a valid depth
+            const Point2f projectedPixelFloatLocation = *outputProjectedPoint;
+            if (cvIsNaN(projectedPixelFloatLocation.x))
+                continue;
+
+            //Get integer pixel location
+            const Point2i projectedPixelLocation = projectedPixelFloatLocation;
+
+            // Ensure that the projected point is actually contained in our output image
+            if (!registeredDepthBounds.contains(projectedPixelLocation))
+                continue;
+
+            // Go back to our original scale, since that's what our output will be
+            // The templated function is to ensure that integer values are rounded to the nearest integer
+            const DepthDepth cloudDepth = floatToInputDepth<DepthDepth>(p->z * metersToInputUnitsScale);
+
+            DepthDepth& outputDepth = registeredDepth.at<DepthDepth>(projectedPixelLocation.y, projectedPixelLocation.x);
+
+            // Occlusion check
+            if (isEqualToNoDepthSentinelValue<DepthDepth>(outputDepth) || (outputDepth > cloudDepth))
+                outputDepth = cloudDepth;
+
+            // If desired, dilate this point to avoid holes in the final image
+            if (depthDilation)
+            {
+                // Choosing to dilate in a 2x2 region, where the original projected location is in the bottom right of this
+                // region. This is what's done on PrimeSense devices, but a more accurate scheme could be used.
+                const Point2i dilatedProjectedLocations[3] = { Point2i(projectedPixelLocation.x - 1, projectedPixelLocation.y),
+                                                               Point2i(projectedPixelLocation.x    , projectedPixelLocation.y - 1),
+                                                               Point2i(projectedPixelLocation.x - 1, projectedPixelLocation.y - 1) };
+
+                for (int i = 0; i < 3; i++)
+                {
+                    const Point2i& dilatedCoordinates = dilatedProjectedLocations[i];
+
+                    if (!registeredDepthBounds.contains(dilatedCoordinates))
+                        continue;
+
+                    DepthDepth& outputDilatedDepth = registeredDepth.at<DepthDepth>(dilatedCoordinates.y, dilatedCoordinates.x);
+
+                    // Occlusion check
+                    if (isEqualToNoDepthSentinelValue(outputDilatedDepth) || (outputDilatedDepth > cloudDepth))
+                        outputDilatedDepth = cloudDepth;
+                }
+
+            } // depthDilation
+
+        } // iterate cols
+    } // iterate rows
+}
+
+
+
+void
+registerDepth(InputArray unregisteredCameraMatrix, InputArray registeredCameraMatrix, InputArray registeredDistCoeffs,
+              InputArray Rt, InputArray unregisteredDepth, const Size& outputImagePlaneSize,
+              OutputArray registeredDepth, bool depthDilation)
+{
+    CV_Assert(unregisteredCameraMatrix.depth() == CV_64F || unregisteredCameraMatrix.depth() == CV_32F);
+    CV_Assert(registeredCameraMatrix.depth() == CV_64F   || registeredCameraMatrix.depth() == CV_32F);
+    CV_Assert(registeredDistCoeffs.empty() || registeredDistCoeffs.depth() == CV_64F || registeredDistCoeffs.depth() == CV_32F);
+    CV_Assert(Rt.depth() == CV_64F || Rt.depth() == CV_32F);
+
+    CV_Assert(unregisteredDepth.cols() > 0 && unregisteredDepth.rows() > 0 &&
+              (unregisteredDepth.depth() == CV_32F || unregisteredDepth.depth() == CV_64F || unregisteredDepth.depth() == CV_16U));
+    CV_Assert(outputImagePlaneSize.height > 0 && outputImagePlaneSize.width > 0);
+
+    // Implicitly checking dimensions of the InputArrays
+    Matx33f _unregisteredCameraMatrix = unregisteredCameraMatrix.getMat();
+    Matx33f _registeredCameraMatrix = registeredCameraMatrix.getMat();
+    Mat_<float> _registeredDistCoeffs = registeredDistCoeffs.getMat();
+    Matx44f _rbtRgb2Depth = Rt.getMat();
+
+    Mat& registeredDepthMat = registeredDepth.getMatRef();
+
+    switch (unregisteredDepth.depth())
+    {
+    case CV_16U:
+    {
+        performRegistration<unsigned short>(unregisteredDepth.getMat(), _unregisteredCameraMatrix,
+            _registeredCameraMatrix, _registeredDistCoeffs,
+            _rbtRgb2Depth, outputImagePlaneSize, depthDilation,
+            .001f, registeredDepthMat);
+        break;
+    }
+    case CV_32F:
+    {
+        performRegistration<float>(unregisteredDepth.getMat(), _unregisteredCameraMatrix,
+                                   _registeredCameraMatrix, _registeredDistCoeffs,
+                                   _rbtRgb2Depth, outputImagePlaneSize, depthDilation,
+                                   1.0f, registeredDepthMat);
+        break;
+    }
+    case CV_64F:
+    {
+        performRegistration<double>(unregisteredDepth.getMat(), _unregisteredCameraMatrix,
+                                    _registeredCameraMatrix, _registeredDistCoeffs,
+                                    _rbtRgb2Depth, outputImagePlaneSize, depthDilation,
+                                    1.0f, registeredDepthMat);
+        break;
+    }
+    default:
+    {
+        CV_Error(Error::StsUnsupportedFormat, "Input depth must be unsigned short, float, or double.");
+    }
+    }
+}
+
+} /* namespace cv */
diff --git a/modules/3d/src/rgbd/depth_to_3d.cpp b/modules/3d/src/rgbd/depth_to_3d.cpp
new file mode 100644
index 0000000000..79aea236f4
--- /dev/null
+++ b/modules/3d/src/rgbd/depth_to_3d.cpp
@@ -0,0 +1,217 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../precomp.hpp"
+#include "depth_to_3d.hpp"
+
+namespace cv
+{
+
+/**
+ * @param K
+ * @param depth the depth image
+ * @param mask the mask of the points to consider (can be empty)
+ * @param points3d the resulting 3d points, a 3-channel matrix
+ */
+static void depthTo3d_from_uvz(const cv::Mat& in_K, const cv::Mat& u_mat, const cv::Mat& v_mat, const cv::Mat& z_mat,
+                               cv::Mat& points3d)
+{
+    CV_Assert((u_mat.size() == z_mat.size()) && (v_mat.size() == z_mat.size()));
+    if (u_mat.empty())
+        return;
+    CV_Assert((u_mat.type() == z_mat.type()) && (v_mat.type() == z_mat.type()));
+
+    //grab camera params
+    cv::Mat_<float> K;
+
+    if (in_K.depth() == CV_32F)
+        K = in_K;
+    else
+        in_K.convertTo(K, CV_32F);
+
+    float fx = K(0, 0);
+    float fy = K(1, 1);
+    float s = K(0, 1);
+    float cx = K(0, 2);
+    float cy = K(1, 2);
+
+    std::vector<cv::Mat> coordinates(3);
+
+    coordinates[0] = (u_mat - cx) / fx;
+
+    if (s != 0)
+        coordinates[0] = coordinates[0] + (-(s / fy) * v_mat + cy * s / fy) / fx;
+
+    coordinates[0] = coordinates[0].mul(z_mat);
+    coordinates[1] = (v_mat - cy).mul(z_mat) * (1. / fy);
+    coordinates[2] = z_mat;
+    cv::merge(coordinates, points3d);
+}
+
+/**
+ * @param K
+ * @param depth the depth image
+ * @param mask the mask of the points to consider (can be empty)
+ * @param points3d the resulting 3d points
+ */
+static void depthTo3dMask(const cv::Mat& depth, const cv::Mat& K, const cv::Mat& mask, cv::Mat& points3d)
+{
+    // Create 3D points in one go.
+    cv::Mat_<float> u_mat, v_mat, z_mat;
+
+    cv::Mat_<uchar> uchar_mask = mask;
+
+    if (mask.depth() != (CV_8U))
+        mask.convertTo(uchar_mask, CV_8U);
+
+    // Figure out the interesting indices
+    size_t n_points;
+
+    if (depth.depth() == CV_16U)
+        n_points = convertDepthToFloat<ushort>(depth, mask, 1.0f / 1000.0f, u_mat, v_mat, z_mat);
+    else if (depth.depth() == CV_16S)
+        n_points = convertDepthToFloat<short>(depth, mask, 1.0f / 1000.0f, u_mat, v_mat, z_mat);
+    else
+    {
+        CV_Assert(depth.type() == CV_32F);
+        n_points = convertDepthToFloat<float>(depth, mask, 1.0f, u_mat, v_mat, z_mat);
+    }
+
+    if (n_points == 0)
+        return;
+
+    u_mat.resize(n_points);
+    v_mat.resize(n_points);
+    z_mat.resize(n_points);
+
+    depthTo3d_from_uvz(K, u_mat, v_mat, z_mat, points3d);
+    points3d = points3d.reshape(3, 1);
+}
+
+/**
+ * @param K
+ * @param depth the depth image
+ * @param points3d the resulting 3d points
+ */
+template<typename T>
+void depthTo3dNoMask(const cv::Mat& in_depth, const cv::Mat_<T>& K, cv::Mat& points3d)
+{
+    const T inv_fx = T(1) / K(0, 0);
+    const T inv_fy = T(1) / K(1, 1);
+    const T ox = K(0, 2);
+    const T oy = K(1, 2);
+
+    // Build z
+    cv::Mat_<T> z_mat;
+    if (z_mat.depth() == in_depth.depth())
+        z_mat = in_depth;
+    else
+        rescaleDepthTemplated<T>(in_depth, z_mat);
+
+    // Pre-compute some constants
+    cv::Mat_<T> x_cache(1, in_depth.cols), y_cache(in_depth.rows, 1);
+    T* x_cache_ptr = x_cache[0], * y_cache_ptr = y_cache[0];
+    for (int x = 0; x < in_depth.cols; ++x, ++x_cache_ptr)
+        *x_cache_ptr = (x - ox) * inv_fx;
+    for (int y = 0; y < in_depth.rows; ++y, ++y_cache_ptr)
+        *y_cache_ptr = (y - oy) * inv_fy;
+
+    y_cache_ptr = y_cache[0];
+    for (int y = 0; y < in_depth.rows; ++y, ++y_cache_ptr)
+    {
+        cv::Vec<T, 3>* point = points3d.ptr<cv::Vec<T, 3> >(y);
+        const T* x_cache_ptr_end = x_cache[0] + in_depth.cols;
+        const T* depth = z_mat[y];
+        for (x_cache_ptr = x_cache[0]; x_cache_ptr != x_cache_ptr_end; ++x_cache_ptr, ++point, ++depth)
+        {
+            T z = *depth;
+            (*point)[0] = (*x_cache_ptr) * z;
+            (*point)[1] = (*y_cache_ptr) * z;
+            (*point)[2] = z;
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @param K
+ * @param depth the depth image
+ * @param u_mat the list of x coordinates
+ * @param v_mat the list of matching y coordinates
+ * @param points3d the resulting 3d points
+ */
+void depthTo3dSparse(InputArray depth_in, InputArray K_in, InputArray points_in, OutputArray points3d_out)
+{
+    // Make sure we use foat types
+    cv::Mat points = points_in.getMat();
+    cv::Mat depth = depth_in.getMat();
+
+    cv::Mat points_float;
+    if (points.depth() != CV_32F)
+        points.convertTo(points_float, CV_32FC2);
+    else
+        points_float = points;
+
+    // Fill the depth matrix
+    cv::Mat_<float> z_mat;
+
+    if (depth.depth() == CV_16U)
+        convertDepthToFloat<ushort>(depth, 1.0f / 1000.0f, points_float, z_mat);
+    else if (depth.depth() == CV_16U)
+        convertDepthToFloat<short>(depth, 1.0f / 1000.0f, points_float, z_mat);
+    else
+    {
+        CV_Assert(depth.type() == CV_32F);
+        convertDepthToFloat<float>(depth, 1.0f, points_float, z_mat);
+    }
+
+    std::vector<cv::Mat> channels(2);
+    cv::split(points_float, channels);
+
+    points3d_out.create(channels[0].rows, channels[0].cols, CV_32FC3);
+    cv::Mat points3d = points3d_out.getMat();
+    depthTo3d_from_uvz(K_in.getMat(), channels[0], channels[1], z_mat, points3d);
+}
+
+/**
+ * @param depth the depth image (if given as short int CV_U, it is assumed to be the depth in millimeters
+ *              (as done with the Microsoft Kinect), otherwise, if given as CV_32F, it is assumed in meters)
+ * @param K The calibration matrix
+ * @param points3d the resulting 3d points. They are of depth the same as `depth` if it is CV_32F or CV_64F, and the
+ *        depth of `K` if `depth` is of depth CV_U
+ * @param mask the mask of the points to consider (can be empty)
+ */
+void depthTo3d(InputArray depth_in, InputArray K_in, OutputArray points3d_out, InputArray mask_in)
+{
+    cv::Mat depth = depth_in.getMat();
+    cv::Mat K = K_in.getMat();
+    cv::Mat mask = mask_in.getMat();
+    CV_Assert(K.cols == 3 && K.rows == 3 && (K.depth() == CV_64F || K.depth() == CV_32F));
+    CV_Assert(depth.type() == CV_64FC1 || depth.type() == CV_32FC1 || depth.type() == CV_16UC1 || depth.type() == CV_16SC1);
+    CV_Assert(mask.empty() || mask.channels() == 1);
+
+    cv::Mat K_new;
+    K.convertTo(K_new, depth.depth() == CV_64F ? CV_64F : CV_32F); // issue #1021
+
+    // Create 3D points in one go.
+    if (!mask.empty())
+    {
+        cv::Mat points3d;
+        depthTo3dMask(depth, K_new, mask, points3d);
+        points3d_out.create(points3d.size(), CV_MAKETYPE(K_new.depth(), 3));
+        points3d.copyTo(points3d_out.getMat());
+    }
+    else
+    {
+        points3d_out.create(depth.size(), CV_MAKETYPE(K_new.depth(), 3));
+        cv::Mat points3d = points3d_out.getMat();
+        if (K_new.depth() == CV_64F)
+            depthTo3dNoMask<double>(depth, K_new, points3d);
+        else
+            depthTo3dNoMask<float>(depth, K_new, points3d);
+    }
+}
+
+}
diff --git a/modules/3d/src/rgbd/depth_to_3d.hpp b/modules/3d/src/rgbd/depth_to_3d.hpp
new file mode 100644
index 0000000000..14be3cc373
--- /dev/null
+++ b/modules/3d/src/rgbd/depth_to_3d.hpp
@@ -0,0 +1,87 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_DEPTH_TO_3D_HPP
+#define OPENCV_3D_DEPTH_TO_3D_HPP
+
+#include "../precomp.hpp"
+#include "utils.hpp"
+
+namespace cv
+{
+
+/**
+ * @param depth the depth image, containing depth with the value T
+ * @param the mask, containing CV_8UC1
+ */
+template <typename T>
+size_t convertDepthToFloat(const cv::Mat& depth, const cv::Mat& mask, float scale, cv::Mat_<float> &u_mat, cv::Mat_<float> &v_mat, cv::Mat_<float> &z_mat)
+{
+    CV_Assert(depth.size == mask.size);
+
+    cv::Size depth_size = depth.size();
+
+    cv::Mat_<uchar> uchar_mask = mask;
+
+    if (mask.depth() != CV_8U)
+        mask.convertTo(uchar_mask, CV_8U);
+
+    u_mat = cv::Mat_<float>(depth_size.area(), 1);
+    v_mat = cv::Mat_<float>(depth_size.area(), 1);
+    z_mat = cv::Mat_<float>(depth_size.area(), 1);
+
+    // Raw data from the Kinect has int
+    size_t n_points = 0;
+
+    for (int v = 0; v < depth_size.height; v++)
+    {
+        uchar* r = uchar_mask.ptr<uchar>(v, 0);
+
+        for (int u = 0; u < depth_size.width; u++, ++r)
+            if (*r)
+            {
+                u_mat((int)n_points, 0) = (float)u;
+                v_mat((int)n_points, 0) = (float)v;
+                T depth_i = depth.at<T>(v, u);
+
+                if (cvIsNaN((float)depth_i) || (depth_i == std::numeric_limits<T>::min()) || (depth_i == std::numeric_limits<T>::max()))
+                    z_mat((int)n_points, 0) = std::numeric_limits<float>::quiet_NaN();
+                else
+                    z_mat((int)n_points, 0) = depth_i * scale;
+
+                ++n_points;
+            }
+    }
+
+    return n_points;
+}
+
+/**
+ * @param depth the depth image, containing depth with the value T
+ * @param the mask, containing CV_8UC1
+ */
+template <typename T>
+void convertDepthToFloat(const cv::Mat& depth, float scale, const cv::Mat &uv_mat, cv::Mat_<float> &z_mat)
+{
+    z_mat = cv::Mat_<float>(uv_mat.size());
+
+    // Raw data from the Kinect has int
+    float* z_mat_iter = reinterpret_cast<float*>(z_mat.data);
+
+    for (cv::Mat_<cv::Vec2f>::const_iterator uv_iter = uv_mat.begin<cv::Vec2f>(), uv_end = uv_mat.end<cv::Vec2f>();
+        uv_iter != uv_end; ++uv_iter, ++z_mat_iter)
+    {
+        T depth_i = depth.at < T >((int)(*uv_iter)[1], (int)(*uv_iter)[0]);
+
+        if (cvIsNaN((float)depth_i) || (depth_i == std::numeric_limits < T > ::min())
+            || (depth_i == std::numeric_limits < T > ::max()))
+            *z_mat_iter = std::numeric_limits<float>::quiet_NaN();
+        else
+            *z_mat_iter = depth_i * scale;
+    }
+}
+
+}
+
+#endif // include guard
diff --git a/modules/3d/src/rgbd/fast_icp.cpp b/modules/3d/src/rgbd/fast_icp.cpp
new file mode 100644
index 0000000000..cc9966977e
--- /dev/null
+++ b/modules/3d/src/rgbd/fast_icp.cpp
@@ -0,0 +1,656 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#include "../precomp.hpp"
+#include "fast_icp.hpp"
+#include "opencl_kernels_3d.hpp"
+
+using namespace std;
+
+namespace cv {
+namespace kinfu {
+
+enum
+{
+    UTSIZE = 27
+};
+
+ICP::ICP(const Matx33f _intrinsics, const std::vector<int>& _iterations, float _angleThreshold, float _distanceThreshold) :
+    iterations(_iterations), angleThreshold(_angleThreshold), distanceThreshold(_distanceThreshold),
+    intrinsics(_intrinsics)
+{ }
+
+class ICPImpl : public ICP
+{
+public:
+    ICPImpl(const cv::Matx33f _intrinsics, const std::vector<int> &_iterations, float _angleThreshold, float _distanceThreshold);
+
+    virtual bool estimateTransform(cv::Affine3f& transform,
+                                   InputArray oldPoints, InputArray oldNormals,
+                                   InputArray newPoints, InputArray newNormals
+                                   ) const override;
+    template < typename T >
+    bool estimateTransformT(cv::Affine3f& transform,
+                            const vector<T>& oldPoints, const vector<T>& oldNormals,
+                            const vector<T>& newPoints, const vector<T>& newNormals
+                            ) const;
+
+    virtual ~ICPImpl() { }
+
+    template < typename T >
+    void getAb(const T& oldPts, const T& oldNrm, const T& newPts, const T& newNrm,
+               cv::Affine3f pose, int level, cv::Matx66f& A, cv::Vec6f& b) const;
+
+private:
+
+    mutable vector<UMat> groupedSumBuffers;
+
+};
+
+ICPImpl::ICPImpl(const Matx33f _intrinsics, const std::vector<int> &_iterations, float _angleThreshold, float _distanceThreshold) :
+    ICP(_intrinsics, _iterations, _angleThreshold, _distanceThreshold),
+    groupedSumBuffers(_iterations.size())
+{ }
+
+
+bool ICPImpl::estimateTransform(cv::Affine3f& transform,
+                                InputArray _oldPoints, InputArray _oldNormals,
+                                InputArray _newPoints, InputArray _newNormals
+                                ) const
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_oldPoints.size() == _oldNormals.size());
+    CV_Assert(_newPoints.size() == _newNormals.size());
+    CV_Assert(_oldPoints.size() == _newPoints.size());
+
+#ifdef HAVE_OPENCL
+    if(cv::ocl::isOpenCLActivated() &&
+       _oldPoints.isUMatVector() && _oldNormals.isUMatVector() &&
+       _newPoints.isUMatVector() && _newNormals.isUMatVector())
+    {
+        std::vector<UMat> op, np, on, nn;
+        _oldPoints.getUMatVector(op);
+        _newPoints.getUMatVector(np);
+        _oldNormals.getUMatVector(on);
+        _newNormals.getUMatVector(nn);
+        return estimateTransformT<UMat>(transform, op, on, np, nn);
+    }
+#endif
+
+    std::vector<Mat> op, on, np, nn;
+    _oldPoints.getMatVector(op);
+    _newPoints.getMatVector(np);
+    _oldNormals.getMatVector(on);
+    _newNormals.getMatVector(nn);
+    return estimateTransformT<Mat>(transform, op, on, np, nn);
+}
+
+template < typename T >
+bool ICPImpl::estimateTransformT(cv::Affine3f& transform,
+                                 const vector<T>& oldPoints, const vector<T>& oldNormals,
+                                 const vector<T>& newPoints, const vector<T>& newNormals
+                                 ) const
+{
+    CV_TRACE_FUNCTION();
+
+    transform = Affine3f::Identity();
+    for(size_t l = 0; l < iterations.size(); l++)
+    {
+        size_t level = iterations.size() - 1 - l;
+
+        const T& oldPts = oldPoints [level], newPts = newPoints [level];
+        const T& oldNrm = oldNormals[level], newNrm = newNormals[level];
+
+        for(int iter = 0; iter < iterations[level]; iter++)
+        {
+            Matx66f A;
+            Vec6f b;
+
+            getAb(oldPts, oldNrm, newPts, newNrm, transform, (int)level, A, b);
+
+            double det = cv::determinant(A);
+
+            if (abs (det) < 1e-15 || cvIsNaN(det))
+                return false;
+
+            Vec6f x;
+            // theoretically, any method of solving is applicable
+            // since there are usual least square matrices
+            solve(A, b, x, DECOMP_SVD);
+            Affine3f tinc(Vec3f(x.val), Vec3f(x.val+3));
+            transform = tinc * transform;
+        }
+    }
+
+    return true;
+}
+
+
+///////// CPU implementation /////////
+
+// 1 any coord to check is enough since we know the generation
+
+
+#if USE_INTRINSICS
+static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1)
+{
+    float check = (p0.get0() + p1.get0());
+    return !cvIsNaN(check);
+}
+
+static inline void getCrossPerm(const v_float32x4& a, v_float32x4& yzx, v_float32x4& zxy)
+{
+    v_uint32x4 aa = v_reinterpret_as_u32(a);
+    v_uint32x4 yz00 = v_extract<1>(aa, v_setzero_u32());
+    v_uint32x4 x0y0, tmp;
+    v_zip(aa, v_setzero_u32(), x0y0, tmp);
+    v_uint32x4 yzx0 = v_combine_low(yz00, x0y0);
+    v_uint32x4 y000 = v_extract<2>(x0y0, v_setzero_u32());
+    v_uint32x4 zx00 = v_extract<1>(yzx0, v_setzero_u32());
+    zxy = v_reinterpret_as_f32(v_combine_low(zx00, y000));
+    yzx = v_reinterpret_as_f32(yzx0);
+}
+
+static inline v_float32x4 crossProduct(const v_float32x4& a, const v_float32x4& b)
+{
+    v_float32x4 ayzx, azxy, byzx, bzxy;
+    getCrossPerm(a, ayzx, azxy);
+    getCrossPerm(b, byzx, bzxy);
+    return ayzx*bzxy - azxy*byzx;
+}
+#else
+static inline bool fastCheck(const Point3f& p)
+{
+    return !cvIsNaN(p.x);
+}
+
+#endif
+
+typedef Matx<float, 6, 7> ABtype;
+
+struct GetAbInvoker : ParallelLoopBody
+{
+    GetAbInvoker(ABtype& _globalAb, Mutex& _mtx,
+                 const Points& _oldPts, const Normals& _oldNrm, const Points& _newPts, const Normals& _newNrm,
+                 Affine3f _pose, Intr::Projector _proj, float _sqDistanceThresh, float _minCos) :
+        ParallelLoopBody(),
+        globalSumAb(_globalAb), mtx(_mtx),
+        oldPts(_oldPts), oldNrm(_oldNrm), newPts(_newPts), newNrm(_newNrm), pose(_pose),
+        proj(_proj), sqDistanceThresh(_sqDistanceThresh), minCos(_minCos)
+    { }
+
+    virtual void operator ()(const Range& range) const override
+    {
+#if USE_INTRINSICS
+        CV_Assert(ptype::channels == 4);
+
+        const size_t utBufferSize = 9;
+        float CV_DECL_ALIGNED(16) upperTriangle[utBufferSize*4];
+        for(size_t i = 0; i < utBufferSize*4; i++)
+            upperTriangle[i] = 0;
+        // how values are kept in upperTriangle
+        const int NA = 0;
+        const size_t utPos[] =
+        {
+           0,  1,  2,  4,  5,  6,  3,
+          NA,  9, 10, 12, 13, 14, 11,
+          NA, NA, 18, 20, 21, 22, 19,
+          NA, NA, NA, 24, 28, 30, 32,
+          NA, NA, NA, NA, 25, 29, 33,
+          NA, NA, NA, NA, NA, 26, 34
+        };
+
+        const float (&pm)[16] = pose.matrix.val;
+        v_float32x4 poseRot0(pm[0], pm[4], pm[ 8], 0);
+        v_float32x4 poseRot1(pm[1], pm[5], pm[ 9], 0);
+        v_float32x4 poseRot2(pm[2], pm[6], pm[10], 0);
+        v_float32x4 poseTrans(pm[3], pm[7], pm[11], 0);
+
+        v_float32x4 vfxy(proj.fx, proj.fy, 0, 0), vcxy(proj.cx, proj.cy, 0, 0);
+        v_float32x4 vframe((float)(oldPts.cols - 1), (float)(oldPts.rows - 1), 1.f, 1.f);
+
+        float sqThresh = sqDistanceThresh;
+        float cosThresh = minCos;
+
+        for(int y = range.start; y < range.end; y++)
+        {
+            const CV_DECL_ALIGNED(16) float* newPtsRow = (const float*)newPts[y];
+            const CV_DECL_ALIGNED(16) float* newNrmRow = (const float*)newNrm[y];
+
+            for(int x = 0; x < newPts.cols; x++)
+            {
+                v_float32x4 newP = v_load_aligned(newPtsRow + x*4);
+                v_float32x4 newN = v_load_aligned(newNrmRow + x*4);
+
+                if(!fastCheck(newP, newN))
+                    continue;
+
+                //transform to old coord system
+                newP = v_matmuladd(newP, poseRot0, poseRot1, poseRot2, poseTrans);
+                newN = v_matmuladd(newN, poseRot0, poseRot1, poseRot2, v_setzero_f32());
+
+                //find correspondence by projecting the point
+                v_float32x4 oldCoords;
+                float pz = (v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))).get0());
+                // x, y, 0, 0
+                oldCoords = v_muladd(newP/v_setall_f32(pz), vfxy, vcxy);
+
+                if(!v_check_all((oldCoords >= v_setzero_f32()) & (oldCoords < vframe)))
+                    continue;
+
+                // bilinearly interpolate oldPts and oldNrm under oldCoords point
+                v_float32x4 oldP;
+                v_float32x4 oldN;
+                {
+                    v_int32x4 ixy = v_floor(oldCoords);
+                    v_float32x4 txy = oldCoords - v_cvt_f32(ixy);
+                    int xi = ixy.get0();
+                    int yi = v_rotate_right<1>(ixy).get0();
+                    v_float32x4 tx = v_setall_f32(txy.get0());
+                    txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy)));
+                    v_float32x4 ty = v_setall_f32(txy.get0());
+
+                    const float* prow0 = (const float*)oldPts[yi+0];
+                    const float* prow1 = (const float*)oldPts[yi+1];
+
+                    v_float32x4 p00 = v_load(prow0 + (xi+0)*4);
+                    v_float32x4 p01 = v_load(prow0 + (xi+1)*4);
+                    v_float32x4 p10 = v_load(prow1 + (xi+0)*4);
+                    v_float32x4 p11 = v_load(prow1 + (xi+1)*4);
+
+                    // do not fix missing data
+                    // NaN check is done later
+
+                    const float* nrow0 = (const float*)oldNrm[yi+0];
+                    const float* nrow1 = (const float*)oldNrm[yi+1];
+
+                    v_float32x4 n00 = v_load(nrow0 + (xi+0)*4);
+                    v_float32x4 n01 = v_load(nrow0 + (xi+1)*4);
+                    v_float32x4 n10 = v_load(nrow1 + (xi+0)*4);
+                    v_float32x4 n11 = v_load(nrow1 + (xi+1)*4);
+
+                    // NaN check is done later
+
+                    v_float32x4 p0 = p00 + tx*(p01 - p00);
+                    v_float32x4 p1 = p10 + tx*(p11 - p10);
+                    oldP = p0 + ty*(p1 - p0);
+
+                    v_float32x4 n0 = n00 + tx*(n01 - n00);
+                    v_float32x4 n1 = n10 + tx*(n11 - n10);
+                    oldN = n0 + ty*(n1 - n0);
+                }
+
+                bool oldPNcheck = fastCheck(oldP, oldN);
+
+                //filter by distance
+                v_float32x4 diff = newP - oldP;
+                bool distCheck = !(v_reduce_sum(diff*diff) > sqThresh);
+
+                //filter by angle
+                bool angleCheck = !(abs(v_reduce_sum(newN*oldN)) < cosThresh);
+
+                if(!(oldPNcheck && distCheck && angleCheck))
+                    continue;
+
+                // build point-wise vector ab = [ A | b ]
+                v_float32x4 VxNv = crossProduct(newP, oldN);
+                Point3f VxN;
+                VxN.x = VxNv.get0();
+                VxN.y = v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0();
+                VxN.z = v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0();
+
+                float dotp = -v_reduce_sum(oldN*diff);
+
+                // build point-wise upper-triangle matrix [ab^T * ab] w/o last row
+                // which is [A^T*A | A^T*b]
+                // and gather sum
+
+                v_float32x4 vd = VxNv | v_float32x4(0, 0, 0, dotp);
+                v_float32x4 n = oldN;
+                v_float32x4 nyzx;
+                {
+                    v_uint32x4 aa = v_reinterpret_as_u32(n);
+                    v_uint32x4 yz00 = v_extract<1>(aa, v_setzero_u32());
+                    v_uint32x4 x0y0, tmp;
+                    v_zip(aa, v_setzero_u32(), x0y0, tmp);
+                    nyzx = v_reinterpret_as_f32(v_combine_low(yz00, x0y0));
+                }
+
+                v_float32x4 vutg[utBufferSize];
+                for(size_t i = 0; i < utBufferSize; i++)
+                    vutg[i] = v_load_aligned(upperTriangle + i*4);
+
+                int p = 0;
+                v_float32x4 v;
+                // vx * vd, vx * n
+                v = v_setall_f32(VxN.x);
+                v_store_aligned(upperTriangle + p*4, v_muladd(v, vd, vutg[p])); p++;
+                v_store_aligned(upperTriangle + p*4, v_muladd(v,  n, vutg[p])); p++;
+                // vy * vd, vy * n
+                v = v_setall_f32(VxN.y);
+                v_store_aligned(upperTriangle + p*4, v_muladd(v, vd, vutg[p])); p++;
+                v_store_aligned(upperTriangle + p*4, v_muladd(v,  n, vutg[p])); p++;
+                // vz * vd, vz * n
+                v = v_setall_f32(VxN.z);
+                v_store_aligned(upperTriangle + p*4, v_muladd(v, vd, vutg[p])); p++;
+                v_store_aligned(upperTriangle + p*4, v_muladd(v,  n, vutg[p])); p++;
+                // nx^2, ny^2, nz^2
+                v_store_aligned(upperTriangle + p*4, v_muladd(n, n, vutg[p])); p++;
+                // nx*ny, ny*nz, nx*nz
+                v_store_aligned(upperTriangle + p*4, v_muladd(n, nyzx, vutg[p])); p++;
+                // nx*d, ny*d, nz*d
+                v = v_setall_f32(dotp);
+                v_store_aligned(upperTriangle + p*4, v_muladd(n, v, vutg[p])); p++;
+            }
+        }
+
+        ABtype sumAB = ABtype::zeros();
+        for(int i = 0; i < 6; i++)
+        {
+            for(int j = i; j < 7; j++)
+            {
+                size_t p = utPos[i*7+j];
+                sumAB(i, j) = upperTriangle[p];
+            }
+        }
+
+#else
+
+        float upperTriangle[UTSIZE];
+        for(int i = 0; i < UTSIZE; i++)
+            upperTriangle[i] = 0;
+
+        for(int y = range.start; y < range.end; y++)
+        {
+            const ptype* newPtsRow = newPts[y];
+            const ptype* newNrmRow = newNrm[y];
+
+            for(int x = 0; x < newPts.cols; x++)
+            {
+                Point3f newP = fromPtype(newPtsRow[x]);
+                Point3f newN = fromPtype(newNrmRow[x]);
+
+                Point3f oldP(nan3), oldN(nan3);
+
+                if(!(fastCheck(newP) && fastCheck(newN)))
+                    continue;
+
+                //transform to old coord system
+                newP = pose * newP;
+                newN = pose.rotation() * newN;
+
+                //find correspondence by projecting the point
+                Point2f oldCoords = proj(newP);
+                if(!(oldCoords.x >= 0 && oldCoords.x < oldPts.cols - 1 &&
+                     oldCoords.y >= 0 && oldCoords.y < oldPts.rows - 1))
+                    continue;
+
+                // bilinearly interpolate oldPts and oldNrm under oldCoords point
+                int xi = cvFloor(oldCoords.x), yi = cvFloor(oldCoords.y);
+                float tx  = oldCoords.x - xi, ty = oldCoords.y - yi;
+
+                const ptype* prow0 = oldPts[yi+0];
+                const ptype* prow1 = oldPts[yi+1];
+
+                Point3f p00 = fromPtype(prow0[xi+0]);
+                Point3f p01 = fromPtype(prow0[xi+1]);
+                Point3f p10 = fromPtype(prow1[xi+0]);
+                Point3f p11 = fromPtype(prow1[xi+1]);
+
+                //do not fix missing data
+                if(!(fastCheck(p00) && fastCheck(p01) &&
+                     fastCheck(p10) && fastCheck(p11)))
+                    continue;
+
+                const ptype* nrow0 = oldNrm[yi+0];
+                const ptype* nrow1 = oldNrm[yi+1];
+
+                Point3f n00 = fromPtype(nrow0[xi+0]);
+                Point3f n01 = fromPtype(nrow0[xi+1]);
+                Point3f n10 = fromPtype(nrow1[xi+0]);
+                Point3f n11 = fromPtype(nrow1[xi+1]);
+
+                if(!(fastCheck(n00) && fastCheck(n01) &&
+                     fastCheck(n10) && fastCheck(n11)))
+                    continue;
+
+                Point3f p0 = p00 + tx*(p01 - p00);
+                Point3f p1 = p10 + tx*(p11 - p10);
+                oldP = p0 + ty*(p1 - p0);
+
+                Point3f n0 = n00 + tx*(n01 - n00);
+                Point3f n1 = n10 + tx*(n11 - n10);
+                oldN = n0 + ty*(n1 - n0);
+
+                if(!(fastCheck(oldP) && fastCheck(oldN)))
+                    continue;
+
+                //filter by distance
+                Point3f diff = newP - oldP;
+                if(diff.dot(diff) > sqDistanceThresh)
+                {
+                    continue;
+                }
+
+                //filter by angle
+                if(abs(newN.dot(oldN)) < minCos)
+                {
+                    continue;
+                }
+
+                // build point-wise vector ab = [ A | b ]
+
+                //try to optimize
+                Point3f VxN = newP.cross(oldN);
+                float ab[7] = {VxN.x, VxN.y, VxN.z, oldN.x, oldN.y, oldN.z, oldN.dot(-diff)};
+                // build point-wise upper-triangle matrix [ab^T * ab] w/o last row
+                // which is [A^T*A | A^T*b]
+                // and gather sum
+                int pos = 0;
+                for(int i = 0; i < 6; i++)
+                {
+                    for(int j = i; j < 7; j++)
+                    {
+                        upperTriangle[pos++] += ab[i]*ab[j];
+                    }
+                }
+            }
+        }
+
+        ABtype sumAB = ABtype::zeros();
+        int pos = 0;
+        for(int i = 0; i < 6; i++)
+        {
+            for(int j = i; j < 7; j++)
+            {
+                sumAB(i, j) = upperTriangle[pos++];
+            }
+        }
+#endif
+
+        AutoLock al(mtx);
+        globalSumAb += sumAB;
+    }
+
+    ABtype& globalSumAb;
+    Mutex& mtx;
+    const Points& oldPts;
+    const Normals& oldNrm;
+    const Points& newPts;
+    const Normals& newNrm;
+    Affine3f pose;
+    const Intr::Projector proj;
+    float sqDistanceThresh;
+    float minCos;
+};
+
+
+template <>
+void ICPImpl::getAb<Mat>(const Mat& oldPts, const Mat& oldNrm, const Mat& newPts, const Mat& newNrm,
+                         cv::Affine3f pose, int level, cv::Matx66f& A, cv::Vec6f& b) const
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(oldPts.size() == oldNrm.size());
+    CV_Assert(newPts.size() == newNrm.size());
+
+    ABtype sumAB = ABtype::zeros();
+    Mutex mutex;
+    const Points  op(oldPts), on(oldNrm);
+    const Normals np(newPts), nn(newNrm);
+    GetAbInvoker invoker(sumAB, mutex, op, on, np, nn, pose,
+                         intrinsics.scale(level).makeProjector(),
+                         distanceThreshold*distanceThreshold, std::cos(angleThreshold));
+    Range range(0, newPts.rows);
+    const int nstripes = -1;
+    parallel_for_(range, invoker, nstripes);
+
+    // splitting AB matrix to A and b
+    for(int i = 0; i < 6; i++)
+    {
+        // augment lower triangle of A by symmetry
+        for(int j = i; j < 6; j++)
+        {
+            A(i, j) = A(j, i) = sumAB(i, j);
+        }
+
+        b(i) = sumAB(i, 6);
+    }
+}
+
+///////// GPU implementation /////////
+
+#ifdef HAVE_OPENCL
+
+template <>
+void ICPImpl::getAb<UMat>(const UMat& oldPts, const UMat& oldNrm, const UMat& newPts, const UMat& newNrm,
+                          Affine3f pose, int level, Matx66f &A, Vec6f &b) const
+{
+    CV_TRACE_FUNCTION();
+
+    Size oldSize = oldPts.size(), newSize = newPts.size();
+    CV_Assert(oldSize == oldNrm.size());
+    CV_Assert(newSize == newNrm.size());
+
+    // calculate 1x7 vector ab to produce b and upper triangle of A:
+    // [A|b] = ab*(ab^t)
+    // and then reduce it across work groups
+
+    cv::String errorStr;
+    ocl::ProgramSource source = ocl::_3d::icp_oclsrc;
+    cv::String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create("getAb", source, options, &errorStr);
+
+    if(k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)newPts.cols;
+    globalSize[1] = (size_t)newPts.rows;
+
+    const ocl::Device& device = ocl::Device::getDefault();
+    size_t wgsLimit = device.maxWorkGroupSize();
+    size_t memSize = device.localMemSize();
+    // local memory should keep upperTriangles for all threads in group for reduce
+    const size_t ltsz = UTSIZE*sizeof(float);
+    const size_t lcols = 32;
+    size_t lrows = min(memSize/ltsz, wgsLimit)/lcols;
+    // round lrows down to 2^n
+    lrows = roundDownPow2(lrows);
+    size_t localSize[2] = {lcols, lrows};
+    Size ngroups((int)divUp(globalSize[0], (unsigned int)localSize[0]),
+                 (int)divUp(globalSize[1], (unsigned int)localSize[1]));
+
+    // size of local buffer for group-wide reduce
+    size_t lsz = localSize[0]*localSize[1]*ltsz;
+
+    Intr::Projector proj = intrinsics.scale(level).makeProjector();
+    Vec2f fxy(proj.fx, proj.fy), cxy(proj.cx, proj.cy);
+
+    UMat& groupedSumGpu = groupedSumBuffers[level];
+    groupedSumGpu.create(Size(ngroups.width*UTSIZE, ngroups.height),
+                         CV_32F);
+    groupedSumGpu.setTo(0);
+
+    // TODO: optimization possible:
+    // samplers instead of oldPts/oldNrm (mask needed)
+    k.args(ocl::KernelArg::ReadOnlyNoSize(oldPts),
+           ocl::KernelArg::ReadOnlyNoSize(oldNrm),
+           oldSize,
+           ocl::KernelArg::ReadOnlyNoSize(newPts),
+           ocl::KernelArg::ReadOnlyNoSize(newNrm),
+           newSize,
+           ocl::KernelArg::Constant(pose.matrix.val,
+                                    sizeof(pose.matrix.val)),
+           fxy.val, cxy.val,
+           distanceThreshold*distanceThreshold,
+           std::cos(angleThreshold),
+           ocl::KernelArg::Local(lsz),
+           ocl::KernelArg::WriteOnlyNoSize(groupedSumGpu)
+           );
+
+    if(!k.run(2, globalSize, localSize, true))
+        throw std::runtime_error("Failed to run kernel");
+
+    float upperTriangle[UTSIZE];
+    for(int i = 0; i < UTSIZE; i++)
+        upperTriangle[i] = 0;
+
+    Mat groupedSumCpu = groupedSumGpu.getMat(ACCESS_READ);
+
+    for(int y = 0; y < ngroups.height; y++)
+    {
+        const float* rowr = groupedSumCpu.ptr<float>(y);
+        for(int x = 0; x < ngroups.width; x++)
+        {
+            const float* p = rowr + x*UTSIZE;
+            for(int j = 0; j < UTSIZE; j++)
+            {
+                upperTriangle[j] += p[j];
+            }
+        }
+    }
+    groupedSumCpu.release();
+
+    ABtype sumAB = ABtype::zeros();
+    int pos = 0;
+    for(int i = 0; i < 6; i++)
+    {
+        for(int j = i; j < 7; j++)
+        {
+            sumAB(i, j) = upperTriangle[pos++];
+        }
+    }
+
+    // splitting AB matrix to A and b
+    for(int i = 0; i < 6; i++)
+    {
+        // augment lower triangle of A by symmetry
+        for(int j = i; j < 6; j++)
+        {
+            A(i, j) = A(j, i) = sumAB(i, j);
+        }
+
+        b(i) = sumAB(i, 6);
+    }
+}
+
+#endif
+
+///
+
+
+cv::Ptr<ICP> makeICP(const cv::Intr _intrinsics, const std::vector<int> &_iterations,
+                     float _angleThreshold, float _distanceThreshold)
+{
+    return makePtr<ICPImpl>(_intrinsics.getMat(), _iterations, _angleThreshold, _distanceThreshold);
+}
+
+} // namespace kinfu
+} // namespace cv
diff --git a/modules/3d/src/rgbd/fast_icp.hpp b/modules/3d/src/rgbd/fast_icp.hpp
new file mode 100644
index 0000000000..b82c217b7f
--- /dev/null
+++ b/modules/3d/src/rgbd/fast_icp.hpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#ifndef OPENCV_3D_FAST_ICP_HPP
+#define OPENCV_3D_FAST_ICP_HPP
+
+#include "../precomp.hpp"
+#include "utils.hpp"
+
+namespace cv {
+namespace kinfu {
+
+class ICP
+{
+public:
+    ICP(const cv::Matx33f _intrinsics, const std::vector<int> &_iterations, float _angleThreshold, float _distanceThreshold);
+
+    virtual bool estimateTransform(cv::Affine3f& transform,
+                                   InputArray oldPoints, InputArray oldNormals,
+                                   InputArray newPoints, InputArray newNormals
+                                   ) const = 0;
+    virtual ~ICP() { }
+
+protected:
+
+    std::vector<int> iterations;
+    float angleThreshold;
+    float distanceThreshold;
+    cv::Intr intrinsics;
+};
+
+cv::Ptr<ICP> makeICP(const cv::Intr _intrinsics, const std::vector<int> &_iterations,
+                     float _angleThreshold, float _distanceThreshold);
+
+} // namespace kinfu
+} // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/src/rgbd/hash_tsdf.cpp b/modules/3d/src/rgbd/hash_tsdf.cpp
new file mode 100644
index 0000000000..87982bd8af
--- /dev/null
+++ b/modules/3d/src/rgbd/hash_tsdf.cpp
@@ -0,0 +1,1796 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../precomp.hpp"
+#include "hash_tsdf.hpp"
+#include "opencl_kernels_3d.hpp"
+
+#define USE_INTERPOLATION_IN_GETNORMAL 1
+#define VOLUMES_SIZE 8192
+
+namespace cv
+{
+
+HashTSDFVolume::HashTSDFVolume(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor,
+                               float _truncDist, int _maxWeight, float _truncateThreshold,
+                               int _volumeUnitRes, bool _zFirstMemOrder)
+    : Volume(_voxelSize, _pose, _raycastStepFactor),
+    maxWeight(_maxWeight),
+    truncateThreshold(_truncateThreshold),
+    volumeUnitResolution(_volumeUnitRes),
+    volumeUnitSize(voxelSize* volumeUnitResolution),
+    zFirstMemOrder(_zFirstMemOrder)
+{
+    truncDist = std::max(_truncDist, 4.0f * voxelSize);
+
+    if (!(volumeUnitResolution & (volumeUnitResolution - 1)))
+    {
+        // vuRes is a power of 2, let's get this power
+        volumeUnitDegree = trailingZeros32(volumeUnitResolution);
+    }
+    else
+    {
+        CV_Error(Error::StsBadArg, "Volume unit resolution should be a power of 2");
+    }
+
+    int xdim, ydim, zdim;
+    if (zFirstMemOrder)
+    {
+        xdim = volumeUnitResolution * volumeUnitResolution;
+        ydim = volumeUnitResolution;
+        zdim = 1;
+    }
+    else
+    {
+        xdim = 1;
+        ydim = volumeUnitResolution;
+        zdim = volumeUnitResolution * volumeUnitResolution;
+    }
+    volStrides = Vec4i(xdim, ydim, zdim);
+}
+
+//! Spatial hashing
+struct tsdf_hash
+{
+    size_t operator()(const Vec3i& x) const noexcept
+    {
+        size_t seed = 0;
+        constexpr uint32_t GOLDEN_RATIO = 0x9e3779b9;
+        for (uint16_t i = 0; i < 3; i++)
+        {
+            seed ^= std::hash<int>()(x[i]) + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+};
+
+struct VolumeUnit
+{
+    cv::Vec3i coord;
+    int index;
+    cv::Matx44f pose;
+    int lastVisibleIndex = 0;
+    bool isActive;
+};
+
+typedef std::unordered_set<cv::Vec3i, tsdf_hash> VolumeUnitIndexSet;
+typedef std::unordered_map<cv::Vec3i, VolumeUnit, tsdf_hash> VolumeUnitIndexes;
+
+class HashTSDFVolumeCPU : public HashTSDFVolume
+{
+public:
+    // dimension in voxels, size in meters
+    HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
+        float _truncateThreshold, int _volumeUnitRes, bool zFirstMemOrder = true);
+
+    HashTSDFVolumeCPU(const VolumeParams& _volumeParams, bool zFirstMemOrder = true);
+
+    virtual void integrate(InputArray, InputArray, float, const Matx44f&, const Matx33f&, const Matx33f&, const int) override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); }
+    void integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const Matx33f& intrinsics,
+                   const int frameId = 0) override;
+    void raycast(const Matx44f& cameraPose, const Matx33f& intrinsics, const Size& frameSize, OutputArray points,
+                 OutputArray normals) const override;
+    void raycast(const Matx44f&, const Matx33f&, const Size&, OutputArray, OutputArray, OutputArray) const override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); }
+    void fetchNormals(InputArray points, OutputArray _normals) const override;
+    void fetchPointsNormals(OutputArray points, OutputArray normals) const override;
+
+    void reset() override;
+    size_t getTotalVolumeUnits() const override { return volumeUnits.size(); }
+    int getVisibleBlocks(int currFrameId, int frameThreshold) const override;
+
+    //! Return the voxel given the voxel index in the universal volume (1 unit = 1 voxel_length)
+    TsdfVoxel at(const Vec3i& volumeIdx) const;
+
+    //! Return the voxel given the point in volume coordinate system i.e., (metric scale 1 unit =
+    //! 1m)
+    virtual TsdfVoxel at(const cv::Point3f& point) const;
+    virtual TsdfVoxel _at(const cv::Vec3i& volumeIdx, int indx) const;
+
+    TsdfVoxel atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitIndexes::const_iterator it) const;
+
+
+    float interpolateVoxelPoint(const Point3f& point) const;
+    float interpolateVoxel(const cv::Point3f& point) const;
+    Point3f getNormalVoxel(const cv::Point3f& p) const;
+
+    //! Utility functions for coordinate transformations
+    Vec3i volumeToVolumeUnitIdx(const Point3f& point) const;
+    Point3f volumeUnitIdxToVolume(const Vec3i& volumeUnitIdx) const;
+
+    Point3f voxelCoordToVolume(const Vec3i& voxelIdx) const;
+    Vec3i volumeToVoxelCoord(const Point3f& point) const;
+
+public:
+    Vec6f frameParams;
+    Mat pixNorms;
+    VolumeUnitIndexes volumeUnits;
+    cv::Mat volUnitsData;
+    int lastVolIndex;
+};
+
+
+HashTSDFVolumeCPU::HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist,
+                                     int _maxWeight, float _truncateThreshold, int _volumeUnitRes, bool _zFirstMemOrder)
+    :HashTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold, _volumeUnitRes,
+                    _zFirstMemOrder)
+{
+    reset();
+}
+
+HashTSDFVolumeCPU::HashTSDFVolumeCPU(const VolumeParams& _params, bool _zFirstMemOrder)
+    : HashTSDFVolumeCPU(_params.voxelSize,
+                        Matx44f(_params.pose),
+                        _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
+                        _params.depthTruncThreshold, _params.unitResolution, _zFirstMemOrder)
+{
+}
+
+// zero volume, leave rest params the same
+void HashTSDFVolumeCPU::reset()
+{
+    CV_TRACE_FUNCTION();
+    lastVolIndex = 0;
+    volUnitsData = cv::Mat(VOLUMES_SIZE, volumeUnitResolution * volumeUnitResolution * volumeUnitResolution, rawType<TsdfVoxel>());
+    frameParams = Vec6f();
+    pixNorms = Mat();
+    volumeUnits = VolumeUnitIndexes();
+}
+
+void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const Matx33f& _intrinsics, const int frameId)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+    Depth depth = _depth.getMat();
+
+    //! Compute volumes to be allocated
+    const int depthStride = volumeUnitDegree;
+    const float invDepthFactor = 1.f / depthFactor;
+    const Intr intrinsics(_intrinsics);
+    const Intr::Reprojector reproj(intrinsics.makeReprojector());
+    const Affine3f cam2vol(pose.inv() * Affine3f(cameraPose));
+    const Point3f truncPt(truncDist, truncDist, truncDist);
+    VolumeUnitIndexSet newIndices;
+    Mutex mutex;
+    Range allocateRange(0, depth.rows);
+
+    auto AllocateVolumeUnitsInvoker = [&](const Range& range) {
+        VolumeUnitIndexSet localAccessVolUnits;
+        for (int y = range.start; y < range.end; y += depthStride)
+        {
+            const depthType* depthRow = depth[y];
+            for (int x = 0; x < depth.cols; x += depthStride)
+            {
+                depthType z = depthRow[x] * invDepthFactor;
+                if (z <= 0 || z > this->truncateThreshold)
+                    continue;
+                Point3f camPoint = reproj(Point3f((float)x, (float)y, z));
+                Point3f volPoint = cam2vol * camPoint;
+                //! Find accessed TSDF volume unit for valid 3D vertex
+                Vec3i lower_bound = this->volumeToVolumeUnitIdx(volPoint - truncPt);
+                Vec3i upper_bound = this->volumeToVolumeUnitIdx(volPoint + truncPt);
+
+                for (int i = lower_bound[0]; i <= upper_bound[0]; i++)
+                    for (int j = lower_bound[1]; j <= upper_bound[1]; j++)
+                        for (int k = lower_bound[2]; k <= upper_bound[2]; k++)
+                        {
+                            const Vec3i tsdf_idx = Vec3i(i, j, k);
+                            if (localAccessVolUnits.count(tsdf_idx) <= 0 && this->volumeUnits.count(tsdf_idx) <= 0)
+                            {
+                                //! This volume unit will definitely be required for current integration
+                                localAccessVolUnits.emplace(tsdf_idx);
+                            }
+                        }
+            }
+        }
+
+        mutex.lock();
+        for (const auto& tsdf_idx : localAccessVolUnits)
+        {
+            //! If the insert into the global set passes
+            if (!newIndices.count(tsdf_idx))
+            {
+                // Volume allocation can be performed outside of the lock
+                newIndices.emplace(tsdf_idx);
+            }
+        }
+        mutex.unlock();
+    };
+    parallel_for_(allocateRange, AllocateVolumeUnitsInvoker);
+
+    //! Perform the allocation
+    for (auto idx : newIndices)
+    {
+        VolumeUnit& vu = this->volumeUnits.emplace(idx, VolumeUnit()).first->second;
+
+        Matx44f subvolumePose = pose.translate(volumeUnitIdxToVolume(idx)).matrix;
+
+        vu.pose = subvolumePose;
+        vu.index = lastVolIndex; lastVolIndex++;
+        if (lastVolIndex > int(volUnitsData.size().height))
+        {
+            volUnitsData.resize((lastVolIndex - 1) * 2);
+        }
+        volUnitsData.row(vu.index).forEach<VecTsdfVoxel>([](VecTsdfVoxel& vv, const int* /* position */)
+            {
+                TsdfVoxel& v = reinterpret_cast<TsdfVoxel&>(vv);
+                v.tsdf = floatToTsdf(0.0f); v.weight = 0;
+            });
+        //! This volume unit will definitely be required for current integration
+        vu.lastVisibleIndex = frameId;
+        vu.isActive = true;
+    }
+
+    //! Get keys for all the allocated volume Units
+    std::vector<Vec3i> totalVolUnits;
+    for (const auto& keyvalue : volumeUnits)
+    {
+        totalVolUnits.push_back(keyvalue.first);
+    }
+
+    //! Mark volumes in the camera frustum as active
+    Range inFrustumRange(0, (int)volumeUnits.size());
+    parallel_for_(inFrustumRange, [&](const Range& range) {
+        const Affine3f vol2cam(Affine3f(cameraPose.inv()) * pose);
+        const Intr::Projector proj(intrinsics.makeProjector());
+
+        for (int i = range.start; i < range.end; ++i)
+        {
+            Vec3i tsdf_idx = totalVolUnits[i];
+            VolumeUnitIndexes::iterator it = volumeUnits.find(tsdf_idx);
+            if (it == volumeUnits.end())
+                continue;
+
+            Point3f volumeUnitPos = volumeUnitIdxToVolume(it->first);
+            Point3f volUnitInCamSpace = vol2cam * volumeUnitPos;
+            if (volUnitInCamSpace.z < 0 || volUnitInCamSpace.z > truncateThreshold)
+            {
+                it->second.isActive = false;
+                continue;
+            }
+            Point2f cameraPoint = proj(volUnitInCamSpace);
+            if (cameraPoint.x >= 0 && cameraPoint.y >= 0 && cameraPoint.x < depth.cols && cameraPoint.y < depth.rows)
+            {
+                assert(it != volumeUnits.end());
+                it->second.lastVisibleIndex = frameId;
+                it->second.isActive         = true;
+            }
+        }
+        });
+
+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+                    intrinsics.fx, intrinsics.fy,
+                    intrinsics.cx, intrinsics.cy);
+    if ( !(frameParams==newParams) )
+    {
+        frameParams = newParams;
+        pixNorms = preCalculationPixNorm(depth.size(), intrinsics);
+    }
+
+    //! Integrate the correct volumeUnits
+    parallel_for_(Range(0, (int)totalVolUnits.size()), [&](const Range& range) {
+        for (int i = range.start; i < range.end; i++)
+        {
+            Vec3i tsdf_idx = totalVolUnits[i];
+            VolumeUnitIndexes::iterator it = volumeUnits.find(tsdf_idx);
+            if (it == volumeUnits.end())
+                return;
+
+            VolumeUnit& volumeUnit = it->second;
+            if (volumeUnit.isActive)
+            {
+                //! The volume unit should already be added into the Volume from the allocator
+                integrateVolumeUnit(truncDist, voxelSize, maxWeight, volumeUnit.pose,
+                                    Point3i(volumeUnitResolution, volumeUnitResolution, volumeUnitResolution), volStrides, depth,
+                                    depthFactor, cameraPose, intrinsics, pixNorms, volUnitsData.row(volumeUnit.index));
+
+                //! Ensure all active volumeUnits are set to inactive for next integration
+                volumeUnit.isActive = false;
+            }
+        }
+        });
+}
+
+cv::Vec3i HashTSDFVolumeCPU::volumeToVolumeUnitIdx(const cv::Point3f& p) const
+{
+    return cv::Vec3i(cvFloor(p.x / volumeUnitSize), cvFloor(p.y / volumeUnitSize),
+                     cvFloor(p.z / volumeUnitSize));
+}
+
+cv::Point3f HashTSDFVolumeCPU::volumeUnitIdxToVolume(const cv::Vec3i& volumeUnitIdx) const
+{
+    return cv::Point3f(volumeUnitIdx[0] * volumeUnitSize, volumeUnitIdx[1] * volumeUnitSize,
+                       volumeUnitIdx[2] * volumeUnitSize);
+}
+
+cv::Point3f HashTSDFVolumeCPU::voxelCoordToVolume(const cv::Vec3i& voxelIdx) const
+{
+    return cv::Point3f(voxelIdx[0] * voxelSize, voxelIdx[1] * voxelSize, voxelIdx[2] * voxelSize);
+}
+
+cv::Vec3i HashTSDFVolumeCPU::volumeToVoxelCoord(const cv::Point3f& point) const
+{
+    return cv::Vec3i(cvFloor(point.x * voxelSizeInv), cvFloor(point.y * voxelSizeInv),
+                     cvFloor(point.z * voxelSizeInv));
+}
+
+inline TsdfVoxel HashTSDFVolumeCPU::_at(const cv::Vec3i& volumeIdx, int indx) const
+{
+    //! Out of bounds
+    if ((volumeIdx[0] >= volumeUnitResolution || volumeIdx[0] < 0) ||
+        (volumeIdx[1] >= volumeUnitResolution || volumeIdx[1] < 0) ||
+        (volumeIdx[2] >= volumeUnitResolution || volumeIdx[2] < 0))
+    {
+        return TsdfVoxel(floatToTsdf(1.f), 0);
+    }
+
+    const TsdfVoxel* volData = volUnitsData.ptr<TsdfVoxel>(indx);
+    int coordBase =
+        volumeIdx[0] * volStrides[0] + volumeIdx[1] * volStrides[1] + volumeIdx[2] * volStrides[2];
+    return volData[coordBase];
+}
+
+inline TsdfVoxel HashTSDFVolumeCPU::at(const cv::Vec3i& volumeIdx) const
+{
+    Vec3i volumeUnitIdx = Vec3i(volumeIdx[0] >> volumeUnitDegree,
+                                volumeIdx[1] >> volumeUnitDegree,
+                                volumeIdx[2] >> volumeUnitDegree);
+
+    VolumeUnitIndexes::const_iterator it = volumeUnits.find(volumeUnitIdx);
+
+    if (it == volumeUnits.end())
+    {
+        return TsdfVoxel(floatToTsdf(1.f), 0);
+    }
+
+    cv::Vec3i volUnitLocalIdx = volumeIdx - cv::Vec3i(volumeUnitIdx[0] << volumeUnitDegree,
+                                                      volumeUnitIdx[1] << volumeUnitDegree,
+                                                      volumeUnitIdx[2] << volumeUnitDegree);
+
+    volUnitLocalIdx =
+        cv::Vec3i(abs(volUnitLocalIdx[0]), abs(volUnitLocalIdx[1]), abs(volUnitLocalIdx[2]));
+    return _at(volUnitLocalIdx, it->second.index);
+
+}
+
+TsdfVoxel HashTSDFVolumeCPU::at(const Point3f& point) const
+{
+    cv::Vec3i volumeUnitIdx = volumeToVolumeUnitIdx(point);
+    VolumeUnitIndexes::const_iterator it = volumeUnits.find(volumeUnitIdx);
+
+    if (it == volumeUnits.end())
+    {
+        return TsdfVoxel(floatToTsdf(1.f), 0);
+    }
+
+    cv::Point3f volumeUnitPos = volumeUnitIdxToVolume(volumeUnitIdx);
+    cv::Vec3i volUnitLocalIdx = volumeToVoxelCoord(point - volumeUnitPos);
+    volUnitLocalIdx =
+        cv::Vec3i(abs(volUnitLocalIdx[0]), abs(volUnitLocalIdx[1]), abs(volUnitLocalIdx[2]));
+    return _at(volUnitLocalIdx, it->second.index);
+}
+
+TsdfVoxel HashTSDFVolumeCPU::atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitIndexes::const_iterator it) const
+{
+    if (it == volumeUnits.end())
+    {
+        return TsdfVoxel(floatToTsdf(1.f), 0);
+    }
+    Vec3i volUnitLocalIdx = point - Vec3i(volumeUnitIdx[0] << volumeUnitDegree,
+                                          volumeUnitIdx[1] << volumeUnitDegree,
+                                          volumeUnitIdx[2] << volumeUnitDegree);
+
+    // expanding at(), removing bounds check
+    const TsdfVoxel* volData = volUnitsData.ptr<TsdfVoxel>(it->second.index);
+    int coordBase = volUnitLocalIdx[0] * volStrides[0] + volUnitLocalIdx[1] * volStrides[1] + volUnitLocalIdx[2] * volStrides[2];
+    return volData[coordBase];
+}
+
+#if USE_INTRINSICS
+inline float interpolate(float tx, float ty, float tz, float vx[8])
+{
+    v_float32x4 v0246, v1357;
+    v_load_deinterleave(vx, v0246, v1357);
+
+    v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
+
+    v_float32x4 v00_10 = vxx;
+    v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
+
+    v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
+    float v0 = v0_1.get0();
+    v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
+    float v1 = v0_1.get0();
+
+    return v0 + tx * (v1 - v0);
+}
+
+#else
+inline float interpolate(float tx, float ty, float tz, float vx[8])
+{
+    float v00 = vx[0] + tz * (vx[1] - vx[0]);
+    float v01 = vx[2] + tz * (vx[3] - vx[2]);
+    float v10 = vx[4] + tz * (vx[5] - vx[4]);
+    float v11 = vx[6] + tz * (vx[7] - vx[6]);
+
+    float v0 = v00 + ty * (v01 - v00);
+    float v1 = v10 + ty * (v11 - v10);
+
+    return v0 + tx * (v1 - v0);
+}
+#endif
+
+float HashTSDFVolumeCPU::interpolateVoxelPoint(const Point3f& point) const
+{
+    const Vec3i neighbourCoords[] = { {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
+                                      {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1} };
+
+    // A small hash table to reduce a number of find() calls
+    bool queried[8];
+    VolumeUnitIndexes::const_iterator iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = volumeUnits.end();
+        queried[i] = false;
+    }
+
+    int ix = cvFloor(point.x);
+    int iy = cvFloor(point.y);
+    int iz = cvFloor(point.z);
+
+    float tx = point.x - ix;
+    float ty = point.y - iy;
+    float tz = point.z - iz;
+
+    Vec3i iv(ix, iy, iz);
+    float vx[8];
+    for (int i = 0; i < 8; i++)
+    {
+        Vec3i pt = iv + neighbourCoords[i];
+
+        Vec3i volumeUnitIdx = Vec3i(pt[0] >> volumeUnitDegree, pt[1] >> volumeUnitDegree, pt[2] >> volumeUnitDegree);
+        int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
+        auto it = iterMap[dictIdx];
+        if (!queried[dictIdx])
+        {
+            it = volumeUnits.find(volumeUnitIdx);
+            iterMap[dictIdx] = it;
+            queried[dictIdx] = true;
+        }
+
+        vx[i] = atVolumeUnit(pt, volumeUnitIdx, it).tsdf;
+    }
+
+    return interpolate(tx, ty, tz, vx);
+}
+
+inline float HashTSDFVolumeCPU::interpolateVoxel(const cv::Point3f& point) const
+{
+    return interpolateVoxelPoint(point * voxelSizeInv);
+}
+
+
+Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
+{
+    Vec3f normal = Vec3f(0, 0, 0);
+
+    Point3f ptVox = point * voxelSizeInv;
+    Vec3i iptVox(cvFloor(ptVox.x), cvFloor(ptVox.y), cvFloor(ptVox.z));
+
+    // A small hash table to reduce a number of find() calls
+    bool queried[8];
+    VolumeUnitIndexes::const_iterator iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = volumeUnits.end();
+        queried[i] = false;
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    const Vec3i offsets[] = { { 1,  0,  0}, {-1,  0,  0}, { 0,  1,  0}, // 0-3
+                              { 0, -1,  0}, { 0,  0,  1}, { 0,  0, -1}  // 4-7
+    };
+    const int nVals = 6;
+
+#else
+    const Vec3i offsets[] = { { 0,  0,  0}, { 0,  0,  1}, { 0,  1,  0}, { 0,  1,  1}, //  0-3
+                              { 1,  0,  0}, { 1,  0,  1}, { 1,  1,  0}, { 1,  1,  1}, //  4-7
+                              {-1,  0,  0}, {-1,  0,  1}, {-1,  1,  0}, {-1,  1,  1}, //  8-11
+                              { 2,  0,  0}, { 2,  0,  1}, { 2,  1,  0}, { 2,  1,  1}, // 12-15
+                              { 0, -1,  0}, { 0, -1,  1}, { 1, -1,  0}, { 1, -1,  1}, // 16-19
+                              { 0,  2,  0}, { 0,  2,  1}, { 1,  2,  0}, { 1,  2,  1}, // 20-23
+                              { 0,  0, -1}, { 0,  1, -1}, { 1,  0, -1}, { 1,  1, -1}, // 24-27
+                              { 0,  0,  2}, { 0,  1,  2}, { 1,  0,  2}, { 1,  1,  2}, // 28-31
+    };
+    const int nVals = 32;
+#endif
+
+    float vals[nVals];
+    for (int i = 0; i < nVals; i++)
+    {
+        Vec3i pt = iptVox + offsets[i];
+
+        Vec3i volumeUnitIdx = Vec3i(pt[0] >> volumeUnitDegree, pt[1] >> volumeUnitDegree, pt[2] >> volumeUnitDegree);
+
+        int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
+        auto it = iterMap[dictIdx];
+        if (!queried[dictIdx])
+        {
+            it = volumeUnits.find(volumeUnitIdx);
+            iterMap[dictIdx] = it;
+            queried[dictIdx] = true;
+        }
+
+        vals[i] = tsdfToFloat(atVolumeUnit(pt, volumeUnitIdx, it).tsdf);
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    for (int c = 0; c < 3; c++)
+    {
+        normal[c] = vals[c * 2] - vals[c * 2 + 1];
+    }
+#else
+
+    float cxv[8], cyv[8], czv[8];
+
+    // How these numbers were obtained:
+    // 1. Take the basic interpolation sequence:
+    // 000, 001, 010, 011, 100, 101, 110, 111
+    // where each digit corresponds to shift by x, y, z axis respectively.
+    // 2. Add +1 for next or -1 for prev to each coordinate to corresponding axis
+    // 3. Search corresponding values in offsets
+    const int idxxp[8] = {  8,  9, 10, 11,  0,  1,  2,  3 };
+    const int idxxn[8] = {  4,  5,  6,  7, 12, 13, 14, 15 };
+    const int idxyp[8] = { 16, 17,  0,  1, 18, 19,  4,  5 };
+    const int idxyn[8] = {  2,  3, 20, 21,  6,  7, 22, 23 };
+    const int idxzp[8] = { 24,  0, 25,  2, 26,  4, 27,  6 };
+    const int idxzn[8] = {  1, 28,  3, 29,  5, 30,  7, 31 };
+
+#if !USE_INTRINSICS
+    for (int i = 0; i < 8; i++)
+    {
+        cxv[i] = vals[idxxn[i]] - vals[idxxp[i]];
+        cyv[i] = vals[idxyn[i]] - vals[idxyp[i]];
+        czv[i] = vals[idxzn[i]] - vals[idxzp[i]];
+    }
+#else
+
+# if CV_SIMD >= 32
+    v_float32x8 cxp = v_lut(vals, idxxp);
+    v_float32x8 cxn = v_lut(vals, idxxn);
+
+    v_float32x8 cyp = v_lut(vals, idxyp);
+    v_float32x8 cyn = v_lut(vals, idxyn);
+
+    v_float32x8 czp = v_lut(vals, idxzp);
+    v_float32x8 czn = v_lut(vals, idxzn);
+
+    v_float32x8 vcxv = cxn - cxp;
+    v_float32x8 vcyv = cyn - cyp;
+    v_float32x8 vczv = czn - czp;
+
+    v_store(cxv, vcxv);
+    v_store(cyv, vcyv);
+    v_store(czv, vczv);
+# else
+    v_float32x4 cxp0 = v_lut(vals, idxxp + 0); v_float32x4 cxp1 = v_lut(vals, idxxp + 4);
+    v_float32x4 cxn0 = v_lut(vals, idxxn + 0); v_float32x4 cxn1 = v_lut(vals, idxxn + 4);
+
+    v_float32x4 cyp0 = v_lut(vals, idxyp + 0); v_float32x4 cyp1 = v_lut(vals, idxyp + 4);
+    v_float32x4 cyn0 = v_lut(vals, idxyn + 0); v_float32x4 cyn1 = v_lut(vals, idxyn + 4);
+
+    v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
+    v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
+
+    v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1;
+    v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1;
+    v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1;
+
+    v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
+    v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);
+    v_store(czv + 0, czv0); v_store(czv + 4, czv1);
+#endif
+
+#endif
+
+    float tx = ptVox.x - iptVox[0];
+    float ty = ptVox.y - iptVox[1];
+    float tz = ptVox.z - iptVox[2];
+
+    normal[0] = interpolate(tx, ty, tz, cxv);
+    normal[1] = interpolate(tx, ty, tz, cyv);
+    normal[2] = interpolate(tx, ty, tz, czv);
+#endif
+
+    float nv = sqrt(normal[0] * normal[0] +
+                    normal[1] * normal[1] +
+                    normal[2] * normal[2]);
+    return nv < 0.0001f ? nan3 : normal / nv;
+}
+
+void HashTSDFVolumeCPU::raycast(const Matx44f& cameraPose, const Matx33f& _intrinsics, const Size& frameSize,
+                                OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(frameSize.area() > 0);
+
+    _points.create(frameSize, POINT_TYPE);
+    _normals.create(frameSize, POINT_TYPE);
+
+    Points points1   = _points.getMat();
+    Normals normals1 = _normals.getMat();
+
+    Points& points(points1);
+    Normals& normals(normals1);
+    const HashTSDFVolumeCPU& volume(*this);
+    const float tstep(volume.truncDist * volume.raycastStepFactor);
+    const Affine3f cam2vol(volume.pose.inv() * Affine3f(cameraPose));
+    const Affine3f vol2cam(Affine3f(cameraPose.inv()) * volume.pose);
+    const Intr intrinsics(_intrinsics);
+    const Intr::Reprojector reproj(intrinsics.makeReprojector());
+
+    const int nstripes = -1;
+
+    auto _HashRaycastInvoker = [&](const Range& range)
+    {
+        const Point3f cam2volTrans = cam2vol.translation();
+        const Matx33f cam2volRot = cam2vol.rotation();
+        const Matx33f vol2camRot = vol2cam.rotation();
+
+        const float blockSize = volume.volumeUnitSize;
+
+        for (int y = range.start; y < range.end; y++)
+        {
+            ptype* ptsRow = points[y];
+            ptype* nrmRow = normals[y];
+
+            for (int x = 0; x < points.cols; x++)
+            {
+                //! Initialize default value
+                Point3f point = nan3, normal = nan3;
+
+                //! Ray origin and direction in the volume coordinate frame
+                Point3f orig    = cam2volTrans;
+                Point3f rayDirV = normalize(Vec3f(cam2volRot * reproj(Point3f(float(x), float(y), 1.f))));
+
+                float tmin = 0;
+                float tmax = volume.truncateThreshold;
+                float tcurr = tmin;
+
+                cv::Vec3i prevVolumeUnitIdx =
+                    cv::Vec3i(std::numeric_limits<int>::min(), std::numeric_limits<int>::min(),
+                        std::numeric_limits<int>::min());
+
+                float tprev = tcurr;
+                float prevTsdf = volume.truncDist;
+                while (tcurr < tmax)
+                {
+                    Point3f currRayPos = orig + tcurr * rayDirV;
+                    cv::Vec3i currVolumeUnitIdx = volume.volumeToVolumeUnitIdx(currRayPos);
+
+                    VolumeUnitIndexes::const_iterator it = volume.volumeUnits.find(currVolumeUnitIdx);
+
+                    float currTsdf = prevTsdf;
+                    int currWeight = 0;
+                    float stepSize = 0.5f * blockSize;
+                    cv::Vec3i volUnitLocalIdx;
+
+
+                    //! The subvolume exists in hashtable
+                    if (it != volume.volumeUnits.end())
+                    {
+                        cv::Point3f currVolUnitPos =
+                            volume.volumeUnitIdxToVolume(currVolumeUnitIdx);
+                        volUnitLocalIdx = volume.volumeToVoxelCoord(currRayPos - currVolUnitPos);
+
+                        //! TODO: Figure out voxel interpolation
+                        TsdfVoxel currVoxel = _at(volUnitLocalIdx, it->second.index);
+                        currTsdf = tsdfToFloat(currVoxel.tsdf);
+                        currWeight = currVoxel.weight;
+                        stepSize = tstep;
+                    }
+                    //! Surface crossing
+                    if (prevTsdf > 0.f && currTsdf <= 0.f && currWeight > 0)
+                    {
+                        float tInterp = (tcurr * prevTsdf - tprev * currTsdf) / (prevTsdf - currTsdf);
+                        if (!cvIsNaN(tInterp) && !cvIsInf(tInterp))
+                        {
+                            Point3f pv = orig + tInterp * rayDirV;
+                            Point3f nv = volume.getNormalVoxel(pv);
+
+                            if (!isNaN(nv))
+                            {
+                                normal = vol2camRot * nv;
+                                point = vol2cam * pv;
+                            }
+                        }
+                        break;
+                    }
+                    prevVolumeUnitIdx = currVolumeUnitIdx;
+                    prevTsdf = currTsdf;
+                    tprev = tcurr;
+                    tcurr += stepSize;
+                }
+                ptsRow[x] = toPtype(point);
+                nrmRow[x] = toPtype(normal);
+            }
+        }
+    };
+
+    parallel_for_(Range(0, points.rows), _HashRaycastInvoker, nstripes);
+}
+
+void HashTSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if (_points.needed())
+    {
+        std::vector<std::vector<ptype>> pVecs, nVecs;
+
+        std::vector<Vec3i> totalVolUnits;
+        for (const auto& keyvalue : volumeUnits)
+        {
+            totalVolUnits.push_back(keyvalue.first);
+        }
+        Range fetchRange(0, (int)totalVolUnits.size());
+        const int nstripes = -1;
+
+        const HashTSDFVolumeCPU& volume(*this);
+        bool needNormals(_normals.needed());
+        Mutex mutex;
+
+        auto HashFetchPointsNormalsInvoker = [&](const Range& range)
+        {
+            std::vector<ptype> points, normals;
+            for (int i = range.start; i < range.end; i++)
+            {
+                cv::Vec3i tsdf_idx = totalVolUnits[i];
+
+                VolumeUnitIndexes::const_iterator it = volume.volumeUnits.find(tsdf_idx);
+                Point3f base_point = volume.volumeUnitIdxToVolume(tsdf_idx);
+                if (it != volume.volumeUnits.end())
+                {
+                    std::vector<ptype> localPoints;
+                    std::vector<ptype> localNormals;
+                    for (int x = 0; x < volume.volumeUnitResolution; x++)
+                        for (int y = 0; y < volume.volumeUnitResolution; y++)
+                            for (int z = 0; z < volume.volumeUnitResolution; z++)
+                            {
+                                cv::Vec3i voxelIdx(x, y, z);
+                                TsdfVoxel voxel = _at(voxelIdx, it->second.index);
+
+                                if (voxel.tsdf != -128 && voxel.weight != 0)
+                                {
+                                    Point3f point = base_point + volume.voxelCoordToVolume(voxelIdx);
+                                    localPoints.push_back(toPtype(point));
+                                    if (needNormals)
+                                    {
+                                        Point3f normal = volume.getNormalVoxel(point);
+                                        localNormals.push_back(toPtype(normal));
+                                    }
+                                }
+                            }
+
+                    AutoLock al(mutex);
+                    pVecs.push_back(localPoints);
+                    nVecs.push_back(localNormals);
+                }
+            }
+        };
+
+        parallel_for_(fetchRange, HashFetchPointsNormalsInvoker, nstripes);
+
+        std::vector<ptype> points, normals;
+        for (size_t i = 0; i < pVecs.size(); i++)
+        {
+            points.insert(points.end(), pVecs[i].begin(), pVecs[i].end());
+            normals.insert(normals.end(), nVecs[i].begin(), nVecs[i].end());
+        }
+
+        _points.create((int)points.size(), 1, POINT_TYPE);
+        if (!points.empty())
+            Mat((int)points.size(), 1, POINT_TYPE, &points[0]).copyTo(_points.getMat());
+
+        if (_normals.needed())
+        {
+            _normals.create((int)normals.size(), 1, POINT_TYPE);
+            if (!normals.empty())
+                Mat((int)normals.size(), 1, POINT_TYPE, &normals[0]).copyTo(_normals.getMat());
+        }
+    }
+}
+
+void HashTSDFVolumeCPU::fetchNormals(InputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if (_normals.needed())
+    {
+        Points points = _points.getMat();
+        CV_Assert(points.type() == POINT_TYPE);
+
+        _normals.createSameSize(_points, _points.type());
+        Normals normals = _normals.getMat();
+
+        const HashTSDFVolumeCPU& _volume = *this;
+        auto HashPushNormals             = [&](const ptype& point, const int* position) {
+            const HashTSDFVolumeCPU& volume(_volume);
+            Affine3f invPose(volume.pose.inv());
+            Point3f p = fromPtype(point);
+            Point3f n = nan3;
+            if (!isNaN(p))
+            {
+                Point3f voxelPoint = invPose * p;
+                n                  = volume.pose.rotation() * volume.getNormalVoxel(voxelPoint);
+            }
+            normals(position[0], position[1]) = toPtype(n);
+        };
+        points.forEach(HashPushNormals);
+    }
+}
+
+int HashTSDFVolumeCPU::getVisibleBlocks(int currFrameId, int frameThreshold) const
+{
+    int numVisibleBlocks = 0;
+    //! TODO: Iterate over map parallely?
+    for (const auto& keyvalue : volumeUnits)
+    {
+        const VolumeUnit& volumeUnit = keyvalue.second;
+        if (volumeUnit.lastVisibleIndex > (currFrameId - frameThreshold))
+            numVisibleBlocks++;
+    }
+    return numVisibleBlocks;
+}
+
+
+///////// GPU implementation /////////
+
+#ifdef HAVE_OPENCL
+
+class HashTSDFVolumeGPU : public HashTSDFVolume
+{
+public:
+    HashTSDFVolumeGPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
+        float _truncateThreshold, int _volumeUnitRes, bool zFirstMemOrder = false);
+
+    HashTSDFVolumeGPU(const VolumeParams& _volumeParams, bool zFirstMemOrder = false);
+
+    void reset() override;
+
+    void integrateAllVolumeUnitsGPU(const UMat& depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics);
+
+    void allocateVolumeUnits(const UMat& depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics);
+
+    void markActive(const Matx44f& cameraPose, const Intr& intrinsics, const Size frameSz, const int frameId);
+
+    virtual void integrate(InputArray, InputArray, float, const Matx44f&, const Matx33f&, const Matx33f&, const int) override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); };
+    void integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const Matx33f& intrinsics,
+                   const int frameId = 0) override;
+    void raycast(const Matx44f& cameraPose, const Matx33f& intrinsics, const Size& frameSize, OutputArray points,
+                 OutputArray normals) const override;
+    void raycast(const Matx44f&, const Matx33f&, const Size&, OutputArray, OutputArray, OutputArray) const override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); };
+
+    void fetchNormals(InputArray points, OutputArray _normals) const override;
+    void fetchPointsNormals(OutputArray points, OutputArray normals) const override;
+
+    size_t getTotalVolumeUnits() const override { return size_t(hashTable.last); }
+    int getVisibleBlocks(int currFrameId, int frameThreshold) const override;
+
+
+
+    //! Return the voxel given the point in volume coordinate system i.e., (metric scale 1 unit =
+    //! 1m)
+    virtual TsdfVoxel new_at(const cv::Vec3i& volumeIdx, int indx) const;
+    TsdfVoxel new_atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, int indx) const;
+
+
+    float interpolateVoxelPoint(const Point3f& point) const;
+    float interpolateVoxel(const cv::Point3f& point) const;
+    Point3f getNormalVoxel(const cv::Point3f& p) const;
+
+    //! Utility functions for coordinate transformations
+    Vec3i volumeToVolumeUnitIdx(const Point3f& point) const;
+    Point3f volumeUnitIdxToVolume(const Vec3i& volumeUnitIdx) const;
+
+    Point3f voxelCoordToVolume(const Vec3i& voxelIdx) const;
+    Vec3i volumeToVoxelCoord(const Point3f& point) const;
+
+public:
+    Vec6f frameParams;
+    int bufferSizeDegree;
+
+    // per-volume-unit data
+    cv::UMat lastVisibleIndices;
+
+    cv::UMat isActiveFlags;
+
+    cv::UMat volUnitsData;
+    //TODO: remove it when there's no CPU parts
+    cv::Mat volUnitsDataCopy;
+
+    cv::UMat pixNorms;
+
+    //TODO: move indexes.volumes to GPU
+    CustomHashSet hashTable;
+};
+
+HashTSDFVolumeGPU::HashTSDFVolumeGPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
+    float _truncateThreshold, int _volumeUnitRes, bool _zFirstMemOrder)
+    :HashTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold, _volumeUnitRes, _zFirstMemOrder)
+{
+    if (truncDist >= volumeUnitSize)
+    {
+        std::cout << "truncDist exceeds volume unit size, allocation may work incorrectly" << std::endl;
+    }
+
+    reset();
+}
+
+HashTSDFVolumeGPU::HashTSDFVolumeGPU(const VolumeParams & _params, bool _zFirstMemOrder)
+    : HashTSDFVolumeGPU(_params.voxelSize,
+                        Matx44f(_params.pose),
+                        _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
+                        _params.depthTruncThreshold, _params.unitResolution, _zFirstMemOrder)
+{
+}
+
+// zero volume, leave rest params the same
+void HashTSDFVolumeGPU::reset()
+{
+    CV_TRACE_FUNCTION();
+
+    bufferSizeDegree = 15;
+    int buff_lvl = (int) (1 << bufferSizeDegree);
+
+    int volCubed = volumeUnitResolution * volumeUnitResolution * volumeUnitResolution;
+    volUnitsDataCopy = cv::Mat(buff_lvl, volCubed, rawType<TsdfVoxel>());
+
+    volUnitsData = cv::UMat(buff_lvl, volCubed, CV_8UC2);
+
+    lastVisibleIndices = cv::UMat(buff_lvl, 1, CV_32S);
+
+    isActiveFlags = cv::UMat(buff_lvl, 1, CV_8U);
+
+    hashTable = CustomHashSet();
+
+    frameParams = Vec6f();
+    pixNorms = UMat();
+}
+
+
+void HashTSDFVolumeGPU::integrateAllVolumeUnitsGPU(const UMat& depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics)
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(!depth.empty());
+
+    String errorStr;
+    String name = "integrateAllVolumeUnits";
+    ocl::ProgramSource source = ocl::_3d::hash_tsdf_oclsrc;
+    String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if (k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    float dfac = 1.f / depthFactor;
+    Vec2f fxy(intrinsics.fx, intrinsics.fy), cxy(intrinsics.cx, intrinsics.cy);
+    Matx44f vol2camMatrix = (Affine3f(cameraPose).inv() * pose).matrix;
+    Matx44f camInvMatrix = Affine3f(cameraPose).inv().matrix;
+
+    UMat hashesGpu = Mat(hashTable.hashes, false).getUMat(ACCESS_READ);
+    UMat hashDataGpu = Mat(hashTable.data, false).getUMat(ACCESS_READ);
+
+    k.args(ocl::KernelArg::ReadOnly(depth),
+           ocl::KernelArg::PtrReadOnly(hashesGpu),
+           ocl::KernelArg::PtrReadOnly(hashDataGpu),
+           ocl::KernelArg::ReadWrite(volUnitsData),
+           ocl::KernelArg::ReadOnly(pixNorms),
+           ocl::KernelArg::ReadOnly(isActiveFlags),
+           vol2camMatrix,
+           camInvMatrix,
+           voxelSize,
+           volumeUnitResolution,
+           volStrides.val,
+           fxy.val,
+           cxy.val,
+           dfac,
+           truncDist,
+           int(maxWeight)
+    );
+
+    int resol = volumeUnitResolution;
+    size_t globalSize[3];
+    globalSize[0] = (size_t)resol;
+    globalSize[1] = (size_t)resol;
+    globalSize[2] = (size_t)hashTable.last; // num of volume units
+
+    if (!k.run(3, globalSize, NULL, true))
+        throw std::runtime_error("Failed to run kernel");
+}
+
+
+void HashTSDFVolumeGPU::allocateVolumeUnits(const UMat& _depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics)
+{
+    constexpr int pixCapacity = 16;
+    typedef std::array<Vec3i, pixCapacity> LocalVolUnits;
+
+    Depth depth = _depth.getMat(ACCESS_READ);
+
+    //! Compute volumes to be allocated
+    const int depthStride = volumeUnitDegree;
+    const float invDepthFactor = 1.f / depthFactor;
+    const Intr::Reprojector reproj(intrinsics.makeReprojector());
+    const Affine3f cam2vol(pose.inv() * Affine3f(cameraPose));
+    const Point3f truncPt(truncDist, truncDist, truncDist);
+    Mutex mutex;
+
+    // for new indices
+    CustomHashSet thm;
+
+    auto fillLocalAcessVolUnits = [&](const Range& xrange, const Range& yrange, CustomHashSet& ghm)
+    {
+        for (int y = yrange.start; y < yrange.end; y += depthStride)
+        {
+            const depthType* depthRow = depth[y];
+            for (int x = xrange.start; x < xrange.end; x += depthStride)
+            {
+                depthType z = depthRow[x] * invDepthFactor;
+                if (z <= 0 || z > this->truncateThreshold)
+                    continue;
+                Point3f camPoint = reproj(Point3f((float)x, (float)y, z));
+                Point3f volPoint = cam2vol * camPoint;
+                //! Find accessed TSDF volume unit for valid 3D vertex
+                Vec3i lower_bound = this->volumeToVolumeUnitIdx(volPoint - truncPt);
+                Vec3i upper_bound = this->volumeToVolumeUnitIdx(volPoint + truncPt);
+
+                int pixLocalCounter = 0;
+                LocalVolUnits pixLocalVolUnits;
+                for (int i = lower_bound[0]; i <= upper_bound[0]; i++)
+                    for (int j = lower_bound[1]; j <= upper_bound[1]; j++)
+                        for (int k = lower_bound[2]; k <= upper_bound[2]; k++)
+                        {
+                            const Vec3i tsdf_idx = Vec3i(i, j, k);
+
+                            if (hashTable.find(tsdf_idx) < 0)
+                            {
+                                bool found = false;
+                                for (int c = 0; c < pixLocalCounter; c++)
+                                {
+                                    if (pixLocalVolUnits[c] == tsdf_idx)
+                                    {
+                                        found = true; break;
+                                    }
+                                }
+                                if (!found)
+                                {
+                                    pixLocalVolUnits[pixLocalCounter++] = tsdf_idx;
+                                    if (pixLocalCounter >= pixCapacity)
+                                    {
+                                        return;
+                                    }
+                                }
+                            }
+                        }
+
+                // lock localAccessVolUnits somehow
+                for (int i = 0; i < pixLocalCounter; i++)
+                {
+                    Vec3i idx = pixLocalVolUnits[i];
+                    if (!ghm.insert(idx))
+                    {
+                        //return;
+                    }
+                }
+                // unlock
+            }
+        }
+    };
+
+    Rect dim(0, 0, depth.cols, depth.rows);
+    Size gsz(32, 32);
+    Size gg(divUp(dim.width, gsz.width), divUp(dim.height, gsz.height));
+
+    bool needReallocation = false;
+    auto allocateLambda = [&](const Range& r)
+    {
+
+    for (int yg = r.start; yg < r.end; yg++)
+    {
+        for (int xg = 0; xg < gg.width; xg++)
+        {
+            Rect gr(xg * gsz.width, yg * gsz.height, (xg + 1) * gsz.width, (yg + 1) * gsz.height);
+            gr = gr & dim;
+            Range xr(gr.tl().x, gr.br().x), yr(gr.tl().y, gr.br().y);
+
+            CustomHashSet ghm;
+
+            fillLocalAcessVolUnits(xr, yr, ghm);
+
+            if (ghm.last)
+            {
+                std::lock_guard<std::recursive_mutex> al(mutex);
+
+                for (int i = 0; i < ghm.last; i++)
+                {
+                    Vec4i node = ghm.data[i];
+                    Vec3i idx(node[0], node[1], node[2]);
+
+                    //TODO: 1. add to separate hash map instead, then merge on GPU side
+
+                    int result = thm.insert(idx);
+                    if (!result)
+                    {
+                        needReallocation = true;
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
+    };
+
+    do
+    {
+        if (needReallocation)
+        {
+            thm.capacity *= 2;
+            thm.data.resize(thm.capacity);
+
+            needReallocation = false;
+        }
+
+        parallel_for_(Range(0, gg.height), allocateLambda);
+    } while (needReallocation);
+
+
+    auto pushToGlobal = [](const CustomHashSet _thm, CustomHashSet& _globalHashMap,
+                           bool& _needReallocation, Mutex& _mutex)
+    {
+        for (int i = 0; i < _thm.last; i++)
+        {
+                Vec4i node = _thm.data[i];
+                Vec3i idx(node[0], node[1], node[2]);
+
+                std::lock_guard<std::recursive_mutex> al(_mutex);
+
+                int result = _globalHashMap.insert(idx);
+                if (result == 0)
+                {
+                    _needReallocation = true;
+                    return;
+                }
+        }
+    };
+
+    needReallocation = false;
+    do
+    {
+        if (needReallocation)
+        {
+            hashTable.capacity *= 2;
+            hashTable.data.resize(hashTable.capacity);
+
+            needReallocation = false;
+        }
+
+        pushToGlobal(thm, hashTable, needReallocation, mutex);
+    } while (needReallocation);
+}
+
+
+void HashTSDFVolumeGPU::markActive(const Matx44f& cameraPose, const Intr& intrinsics, const Size frameSz, const int frameId)
+{
+    //! Mark volumes in the camera frustum as active
+    String errorStr;
+    String name = "markActive";
+    ocl::ProgramSource source = ocl::_3d::hash_tsdf_oclsrc;
+    String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if (k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    const Affine3f vol2cam(Affine3f(cameraPose.inv()) * pose);
+    const Intr::Projector proj(intrinsics.makeProjector());
+    Vec2f fxy(proj.fx, proj.fy), cxy(proj.cx, proj.cy);
+
+    UMat hashDataGpu = Mat(hashTable.data, false).getUMat(ACCESS_READ);
+
+    k.args(
+        ocl::KernelArg::PtrReadOnly(hashDataGpu),
+        ocl::KernelArg::WriteOnly(isActiveFlags),
+        ocl::KernelArg::WriteOnly(lastVisibleIndices),
+        vol2cam.matrix,
+        fxy,
+        cxy,
+        frameSz,
+        volumeUnitSize,
+        hashTable.last,
+        truncateThreshold,
+        frameId
+    );
+
+    size_t globalSize[1] = { (size_t)hashTable.last };
+    if (!k.run(1, globalSize, nullptr, true))
+        throw std::runtime_error("Failed to run kernel");
+}
+
+
+void HashTSDFVolumeGPU::integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const Matx33f& _intrinsics, const int frameId)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+    UMat depth = _depth.getUMat();
+
+    Intr intrinsics(_intrinsics);
+
+    // Save length to fill new data in ranges
+    int sizeBefore = hashTable.last;
+    allocateVolumeUnits(depth, depthFactor, cameraPose, intrinsics);
+    int sizeAfter = hashTable.last;
+    //! Perform the allocation
+
+    // Grow buffers
+    int buff_lvl = (int)(1 << bufferSizeDegree);
+    if (sizeAfter >= buff_lvl)
+    {
+        bufferSizeDegree = (int)(log2(sizeAfter) + 1); // clz() would be better
+        int oldBuffSize = buff_lvl;
+        buff_lvl = (int)pow(2, bufferSizeDegree);
+
+        volUnitsDataCopy.resize(buff_lvl);
+
+        Range oldr(0, oldBuffSize);
+        int volCubed = volumeUnitResolution * volumeUnitResolution * volumeUnitResolution;
+        UMat newData(buff_lvl, volCubed, CV_8UC2);
+        volUnitsData.copyTo(newData.rowRange(oldr));
+        volUnitsData = newData;
+
+        UMat newLastVisibleIndices(buff_lvl, 1, CV_32S);
+        lastVisibleIndices.copyTo(newLastVisibleIndices.rowRange(oldr));
+        lastVisibleIndices = newLastVisibleIndices;
+
+        UMat newIsActiveFlags(buff_lvl, 1, CV_8U);
+        isActiveFlags.copyTo(newIsActiveFlags.rowRange(oldr));
+        isActiveFlags = newIsActiveFlags;
+    }
+
+    // Fill data for new volume units
+    Range r(sizeBefore, sizeAfter);
+    if (r.start < r.end)
+    {
+        lastVisibleIndices.rowRange(r) = frameId;
+        isActiveFlags.rowRange(r) = 1;
+
+        TsdfVoxel emptyVoxel(floatToTsdf(0.0f), 0);
+        volUnitsData.rowRange(r) = Vec2b((uchar)(emptyVoxel.tsdf), (uchar)(emptyVoxel.weight));
+    }
+
+    //! Mark volumes in the camera frustum as active
+    markActive(cameraPose, intrinsics, depth.size(), frameId);
+
+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+                     intrinsics.fx, intrinsics.fy,
+                     intrinsics.cx, intrinsics.cy);
+    if (!(frameParams == newParams))
+    {
+        frameParams = newParams;
+        pixNorms = preCalculationPixNormGPU(depth.size(), intrinsics);
+    }
+
+    //! Integrate the correct volumeUnits
+    integrateAllVolumeUnitsGPU(depth, depthFactor, cameraPose, intrinsics);
+}
+
+
+//TODO: remove these functions when everything is done on GPU
+cv::Vec3i HashTSDFVolumeGPU::volumeToVolumeUnitIdx(const cv::Point3f& p) const
+{
+    return cv::Vec3i(cvFloor(p.x / volumeUnitSize), cvFloor(p.y / volumeUnitSize),
+        cvFloor(p.z / volumeUnitSize));
+}
+
+cv::Point3f HashTSDFVolumeGPU::volumeUnitIdxToVolume(const cv::Vec3i& volumeUnitIdx) const
+{
+    return cv::Point3f(volumeUnitIdx[0] * volumeUnitSize, volumeUnitIdx[1] * volumeUnitSize,
+        volumeUnitIdx[2] * volumeUnitSize);
+}
+
+cv::Point3f HashTSDFVolumeGPU::voxelCoordToVolume(const cv::Vec3i& voxelIdx) const
+{
+    return cv::Point3f(voxelIdx[0] * voxelSize, voxelIdx[1] * voxelSize, voxelIdx[2] * voxelSize);
+}
+
+cv::Vec3i HashTSDFVolumeGPU::volumeToVoxelCoord(const cv::Point3f& point) const
+{
+    return cv::Vec3i(cvFloor(point.x * voxelSizeInv), cvFloor(point.y * voxelSizeInv),
+        cvFloor(point.z * voxelSizeInv));
+}
+
+inline TsdfVoxel HashTSDFVolumeGPU::new_at(const cv::Vec3i& volumeIdx, int indx) const
+{
+    //! Out of bounds
+    if ((volumeIdx[0] >= volumeUnitResolution || volumeIdx[0] < 0) ||
+        (volumeIdx[1] >= volumeUnitResolution || volumeIdx[1] < 0) ||
+        (volumeIdx[2] >= volumeUnitResolution || volumeIdx[2] < 0))
+    {
+        return TsdfVoxel(floatToTsdf(1.0f), 0);
+    }
+
+    const TsdfVoxel* volData = volUnitsDataCopy.ptr<TsdfVoxel>(indx);
+    int coordBase =
+        volumeIdx[0] * volStrides[0] +
+        volumeIdx[1] * volStrides[1] +
+        volumeIdx[2] * volStrides[2];
+    return volData[coordBase];
+}
+
+TsdfVoxel HashTSDFVolumeGPU::new_atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, int indx) const
+{
+    if (indx < 0)
+    {
+        return TsdfVoxel(floatToTsdf(1.f), 0);
+    }
+    Vec3i volUnitLocalIdx = point - Vec3i(volumeUnitIdx[0] << volumeUnitDegree,
+                                          volumeUnitIdx[1] << volumeUnitDegree,
+                                          volumeUnitIdx[2] << volumeUnitDegree);
+
+    // expanding at(), removing bounds check
+    const TsdfVoxel* volData = volUnitsDataCopy.ptr<TsdfVoxel>(indx);
+    int coordBase = volUnitLocalIdx[0] * volStrides[0] +
+        volUnitLocalIdx[1] * volStrides[1] +
+        volUnitLocalIdx[2] * volStrides[2];
+    return volData[coordBase];
+}
+
+float HashTSDFVolumeGPU::interpolateVoxelPoint(const Point3f& point) const
+{
+    const Vec3i local_neighbourCoords[] = { {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
+                                            {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1} };
+
+    // A small hash table to reduce a number of find() calls
+    // -2 and lower means not queried yet
+    // -1 means not found
+    // 0+ means found
+    int iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = -2;
+    }
+
+    int ix = cvFloor(point.x);
+    int iy = cvFloor(point.y);
+    int iz = cvFloor(point.z);
+
+    float tx = point.x - ix;
+    float ty = point.y - iy;
+    float tz = point.z - iz;
+
+    Vec3i iv(ix, iy, iz);
+    float vx[8];
+    for (int i = 0; i < 8; i++)
+    {
+        Vec3i pt = iv + local_neighbourCoords[i];
+
+        Vec3i volumeUnitIdx = Vec3i(pt[0] >> volumeUnitDegree, pt[1] >> volumeUnitDegree, pt[2] >> volumeUnitDegree);
+        int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
+        auto it = iterMap[dictIdx];
+        if (it < -1)
+        {
+            it = hashTable.find(volumeUnitIdx);
+            iterMap[dictIdx] = it;
+        }
+
+        vx[i] = new_atVolumeUnit(pt, volumeUnitIdx, it).tsdf;
+    }
+
+    return interpolate(tx, ty, tz, vx);
+}
+
+inline float HashTSDFVolumeGPU::interpolateVoxel(const cv::Point3f& point) const
+{
+    return interpolateVoxelPoint(point * voxelSizeInv);
+}
+
+Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const
+{
+    Vec3f normal = Vec3f(0, 0, 0);
+
+    Point3f ptVox = point * voxelSizeInv;
+    Vec3i iptVox(cvFloor(ptVox.x), cvFloor(ptVox.y), cvFloor(ptVox.z));
+
+    // A small hash table to reduce a number of find() calls
+    // -2 and lower means not queried yet
+    // -1 means not found
+    // 0+ means found
+    int iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = -2;
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    const Vec3i offsets[] = { { 1,  0,  0}, {-1,  0,  0}, { 0,  1,  0}, // 0-3
+                              { 0, -1,  0}, { 0,  0,  1}, { 0,  0, -1}  // 4-7
+    };
+    const int nVals = 6;
+
+#else
+    const Vec3i offsets[] = { { 0,  0,  0}, { 0,  0,  1}, { 0,  1,  0}, { 0,  1,  1}, //  0-3
+                              { 1,  0,  0}, { 1,  0,  1}, { 1,  1,  0}, { 1,  1,  1}, //  4-7
+                              {-1,  0,  0}, {-1,  0,  1}, {-1,  1,  0}, {-1,  1,  1}, //  8-11
+                              { 2,  0,  0}, { 2,  0,  1}, { 2,  1,  0}, { 2,  1,  1}, // 12-15
+                              { 0, -1,  0}, { 0, -1,  1}, { 1, -1,  0}, { 1, -1,  1}, // 16-19
+                              { 0,  2,  0}, { 0,  2,  1}, { 1,  2,  0}, { 1,  2,  1}, // 20-23
+                              { 0,  0, -1}, { 0,  1, -1}, { 1,  0, -1}, { 1,  1, -1}, // 24-27
+                              { 0,  0,  2}, { 0,  1,  2}, { 1,  0,  2}, { 1,  1,  2}, // 28-31
+    };
+    const int nVals = 32;
+#endif
+
+    float vals[nVals];
+    for (int i = 0; i < nVals; i++)
+    {
+        Vec3i pt = iptVox + offsets[i];
+
+        Vec3i volumeUnitIdx = Vec3i(pt[0] >> volumeUnitDegree, pt[1] >> volumeUnitDegree, pt[2] >> volumeUnitDegree);
+
+        int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
+        auto it = iterMap[dictIdx];
+        if (it < -1)
+        {
+            it = hashTable.find(volumeUnitIdx);
+            iterMap[dictIdx] = it;
+        }
+
+        vals[i] = tsdfToFloat(new_atVolumeUnit(pt, volumeUnitIdx, it).tsdf);
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    for (int c = 0; c < 3; c++)
+    {
+        normal[c] = vals[c * 2] - vals[c * 2 + 1];
+    }
+#else
+
+    float cxv[8], cyv[8], czv[8];
+
+    // How these numbers were obtained:
+    // 1. Take the basic interpolation sequence:
+    // 000, 001, 010, 011, 100, 101, 110, 111
+    // where each digit corresponds to shift by x, y, z axis respectively.
+    // 2. Add +1 for next or -1 for prev to each coordinate to corresponding axis
+    // 3. Search corresponding values in offsets
+    const int idxxp[8] = { 8,  9, 10, 11,  0,  1,  2,  3 };
+    const int idxxn[8] = { 4,  5,  6,  7, 12, 13, 14, 15 };
+    const int idxyp[8] = { 16, 17,  0,  1, 18, 19,  4,  5 };
+    const int idxyn[8] = { 2,  3, 20, 21,  6,  7, 22, 23 };
+    const int idxzp[8] = { 24,  0, 25,  2, 26,  4, 27,  6 };
+    const int idxzn[8] = { 1, 28,  3, 29,  5, 30,  7, 31 };
+
+#if !USE_INTRINSICS
+    for (int i = 0; i < 8; i++)
+    {
+        cxv[i] = vals[idxxn[i]] - vals[idxxp[i]];
+        cyv[i] = vals[idxyn[i]] - vals[idxyp[i]];
+        czv[i] = vals[idxzn[i]] - vals[idxzp[i]];
+    }
+#else
+
+# if CV_SIMD >= 32
+    v_float32x8 cxp = v_lut(vals, idxxp);
+    v_float32x8 cxn = v_lut(vals, idxxn);
+
+    v_float32x8 cyp = v_lut(vals, idxyp);
+    v_float32x8 cyn = v_lut(vals, idxyn);
+
+    v_float32x8 czp = v_lut(vals, idxzp);
+    v_float32x8 czn = v_lut(vals, idxzn);
+
+    v_float32x8 vcxv = cxn - cxp;
+    v_float32x8 vcyv = cyn - cyp;
+    v_float32x8 vczv = czn - czp;
+
+    v_store(cxv, vcxv);
+    v_store(cyv, vcyv);
+    v_store(czv, vczv);
+# else
+    v_float32x4 cxp0 = v_lut(vals, idxxp + 0); v_float32x4 cxp1 = v_lut(vals, idxxp + 4);
+    v_float32x4 cxn0 = v_lut(vals, idxxn + 0); v_float32x4 cxn1 = v_lut(vals, idxxn + 4);
+
+    v_float32x4 cyp0 = v_lut(vals, idxyp + 0); v_float32x4 cyp1 = v_lut(vals, idxyp + 4);
+    v_float32x4 cyn0 = v_lut(vals, idxyn + 0); v_float32x4 cyn1 = v_lut(vals, idxyn + 4);
+
+    v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
+    v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
+
+    v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1;
+    v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1;
+    v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1;
+
+    v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
+    v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);
+    v_store(czv + 0, czv0); v_store(czv + 4, czv1);
+#endif
+
+#endif
+
+    float tx = ptVox.x - iptVox[0];
+    float ty = ptVox.y - iptVox[1];
+    float tz = ptVox.z - iptVox[2];
+
+    normal[0] = interpolate(tx, ty, tz, cxv);
+    normal[1] = interpolate(tx, ty, tz, cyv);
+    normal[2] = interpolate(tx, ty, tz, czv);
+#endif
+    float nv = sqrt(normal[0] * normal[0] +
+                    normal[1] * normal[1] +
+                    normal[2] * normal[2]);
+    return nv < 0.0001f ? nan3 : normal / nv;
+}
+
+
+void HashTSDFVolumeGPU::raycast(const Matx44f& cameraPose, const Matx33f& _intrinsics, const Size& frameSize,
+                                OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(frameSize.area() > 0);
+
+    String errorStr;
+    String name = "raycast";
+    ocl::ProgramSource source = ocl::_3d::hash_tsdf_oclsrc;
+    String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if (k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    _points.create(frameSize, CV_32FC4);
+    _normals.create(frameSize, CV_32FC4);
+
+    UMat points = _points.getUMat();
+    UMat normals = _normals.getUMat();
+
+    Intr intrinsics(_intrinsics);
+    Intr::Reprojector r = intrinsics.makeReprojector();
+    Vec2f finv(r.fxinv, r.fyinv), cxy(r.cx, r.cy);
+
+    Vec4f boxMin, boxMax(volumeUnitSize - voxelSize,
+                         volumeUnitSize - voxelSize,
+                         volumeUnitSize - voxelSize);
+
+    float tstep = truncDist * raycastStepFactor;
+
+    const HashTSDFVolumeGPU& volume(*this);
+    const Affine3f cam2vol(volume.pose.inv() * Affine3f(cameraPose));
+    const Affine3f vol2cam(Affine3f(cameraPose.inv()) * volume.pose);
+
+    Matx44f cam2volRotGPU = cam2vol.matrix;
+    Matx44f vol2camRotGPU = vol2cam.matrix;
+
+    UMat volPoseGpu = Mat(pose.matrix).getUMat(ACCESS_READ);
+    UMat invPoseGpu = Mat(pose.inv().matrix).getUMat(ACCESS_READ);
+
+    UMat hashesGpu = Mat(hashTable.hashes, false).getUMat(ACCESS_READ);
+    UMat hashDataGpu = Mat(hashTable.data, false).getUMat(ACCESS_READ);
+
+    k.args(
+        ocl::KernelArg::PtrReadOnly(hashesGpu),
+        ocl::KernelArg::PtrReadOnly(hashDataGpu),
+        ocl::KernelArg::WriteOnlyNoSize(points),
+        ocl::KernelArg::WriteOnlyNoSize(normals),
+        frameSize,
+        ocl::KernelArg::ReadOnly(volUnitsData),
+        cam2volRotGPU,
+        vol2camRotGPU,
+        float(volume.truncateThreshold),
+        finv.val, cxy.val,
+        boxMin.val, boxMax.val,
+        tstep,
+        voxelSize,
+        voxelSizeInv,
+        volumeUnitSize,
+        volume.truncDist,
+        volumeUnitDegree,
+        volStrides
+    );
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)frameSize.width;
+    globalSize[1] = (size_t)frameSize.height;
+
+    if (!k.run(2, globalSize, NULL, true))
+        throw std::runtime_error("Failed to run kernel");
+}
+
+void HashTSDFVolumeGPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if (_points.needed())
+    {
+        //TODO: remove it when it works w/o CPU code
+        volUnitsData.copyTo(volUnitsDataCopy);
+        //TODO: remove it when it works w/o CPU code
+        //TODO: enable it when it's on GPU
+        //UMat hashDataGpu(hashMap.capacity, 1, CV_32SC4);
+        //Mat(hashMap.data, false).copyTo(hashDataGpu);
+
+        std::vector<std::vector<ptype>> pVecs, nVecs;
+
+        Range _fetchRange(0, hashTable.last);
+
+        const int nstripes = -1;
+
+        const HashTSDFVolumeGPU& volume(*this);
+        bool needNormals(_normals.needed());
+        Mutex mutex;
+
+        auto _HashFetchPointsNormalsInvoker = [&](const Range& range)
+        {
+            std::vector<ptype> points, normals;
+            for (int row = range.start; row < range.end; row++)
+            {
+                cv::Vec4i idx4 = hashTable.data[row];
+                cv::Vec3i idx(idx4[0], idx4[1], idx4[2]);
+
+                Point3f base_point = volume.volumeUnitIdxToVolume(idx);
+
+                std::vector<ptype> localPoints;
+                std::vector<ptype> localNormals;
+                for (int x = 0; x < volume.volumeUnitResolution; x++)
+                    for (int y = 0; y < volume.volumeUnitResolution; y++)
+                        for (int z = 0; z < volume.volumeUnitResolution; z++)
+                        {
+                            cv::Vec3i voxelIdx(x, y, z);
+                            TsdfVoxel voxel = new_at(voxelIdx, row);
+
+                            if (voxel.tsdf != -128 && voxel.weight != 0)
+                            {
+                                Point3f point = base_point + volume.voxelCoordToVolume(voxelIdx);
+
+                                localPoints.push_back(toPtype(point));
+                                if (needNormals)
+                                {
+                                    Point3f normal = volume.getNormalVoxel(point);
+                                    localNormals.push_back(toPtype(normal));
+                                }
+                            }
+                        }
+
+                AutoLock al(mutex);
+                pVecs.push_back(localPoints);
+                nVecs.push_back(localNormals);
+            }
+        };
+
+        parallel_for_(_fetchRange, _HashFetchPointsNormalsInvoker, nstripes);
+
+        std::vector<ptype> points, normals;
+        for (size_t i = 0; i < pVecs.size(); i++)
+        {
+            points.insert(points.end(), pVecs[i].begin(), pVecs[i].end());
+            normals.insert(normals.end(), nVecs[i].begin(), nVecs[i].end());
+        }
+
+        _points.create((int)points.size(), 1, POINT_TYPE);
+        if (!points.empty())
+            Mat((int)points.size(), 1, POINT_TYPE, &points[0]).copyTo(_points.getMat());
+
+        if (_normals.needed())
+        {
+            _normals.create((int)normals.size(), 1, POINT_TYPE);
+            if (!normals.empty())
+                Mat((int)normals.size(), 1, POINT_TYPE, &normals[0]).copyTo(_normals.getMat());
+        }
+    }
+}
+
+void HashTSDFVolumeGPU::fetchNormals(InputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if (_normals.needed())
+    {
+        //TODO: remove it when it works w/o CPU code
+        volUnitsData.copyTo(volUnitsDataCopy);
+
+        Points points = _points.getMat();
+        CV_Assert(points.type() == POINT_TYPE);
+        _normals.createSameSize(_points, _points.type());
+        Normals normals = _normals.getMat();
+        const HashTSDFVolumeGPU& _volume = *this;
+        auto HashPushNormals             = [&](const ptype& point, const int* position) {
+            const HashTSDFVolumeGPU& volume(_volume);
+            Affine3f invPose(volume.pose.inv());
+            Point3f p = fromPtype(point);
+            Point3f n = nan3;
+            if (!isNaN(p))
+            {
+                Point3f voxelPoint = invPose * p;
+                n                  = volume.pose.rotation() * volume.getNormalVoxel(voxelPoint);
+            }
+            normals(position[0], position[1]) = toPtype(n);
+        };
+        points.forEach(HashPushNormals);
+    }
+
+}
+
+int HashTSDFVolumeGPU::getVisibleBlocks(int currFrameId, int frameThreshold) const
+{
+    Mat cpuIndices = lastVisibleIndices.getMat(ACCESS_READ);
+
+    int numVisibleBlocks = 0;
+    //! TODO: Iterate over map parallely?
+    for (int i = 0; i < hashTable.last; i++)
+    {
+        if (cpuIndices.at<int>(i) > (currFrameId - frameThreshold))
+            numVisibleBlocks++;
+    }
+    return numVisibleBlocks;
+}
+
+#endif
+
+//template<typename T>
+Ptr<HashTSDFVolume> makeHashTSDFVolume(const VolumeParams& _params)
+{
+    Mat pm = _params.pose;
+    CV_Assert(pm.size() == Size(4, 4));
+    CV_Assert(pm.type() == CV_32F);
+    Matx44f pose = pm;
+#ifdef HAVE_OPENCL
+    if (ocl::useOpenCL())
+        return makePtr<HashTSDFVolumeGPU>(_params.voxelSize, pose, _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
+                                          _params.depthTruncThreshold, _params.unitResolution);
+#endif
+    return makePtr<HashTSDFVolumeCPU>(_params.voxelSize, pose, _params.raycastStepFactor, _params.tsdfTruncDist, _params.maxWeight,
+                                      _params.depthTruncThreshold, _params.unitResolution);
+}
+
+//template<typename T>
+Ptr<HashTSDFVolume> makeHashTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
+    int _maxWeight, float truncateThreshold, int volumeUnitResolution)
+{
+#ifdef HAVE_OPENCL
+    if (ocl::useOpenCL())
+        return makePtr<HashTSDFVolumeGPU>(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, truncateThreshold,
+            volumeUnitResolution);
+#endif
+    return makePtr<HashTSDFVolumeCPU>(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, truncateThreshold,
+        volumeUnitResolution);
+}
+
+}  // namespace cv
diff --git a/modules/3d/src/rgbd/hash_tsdf.hpp b/modules/3d/src/rgbd/hash_tsdf.hpp
new file mode 100644
index 0000000000..6e8ce5f966
--- /dev/null
+++ b/modules/3d/src/rgbd/hash_tsdf.hpp
@@ -0,0 +1,47 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_HASH_TSDF_HPP
+#define OPENCV_3D_HASH_TSDF_HPP
+
+#include "../precomp.hpp"
+#include "tsdf_functions.hpp"
+
+namespace cv
+{
+
+class HashTSDFVolume : public Volume
+{
+   public:
+    // dimension in voxels, size in meters
+    //! Use fixed volume cuboid
+    HashTSDFVolume(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                   int _maxWeight, float _truncateThreshold, int _volumeUnitRes,
+                   bool zFirstMemOrder = true);
+
+    virtual ~HashTSDFVolume() = default;
+
+    virtual int getVisibleBlocks(int currFrameId, int frameThreshold) const override = 0;
+    virtual size_t getTotalVolumeUnits() const override = 0;
+
+   public:
+    int maxWeight;
+    float truncDist;
+    float truncateThreshold;
+    int volumeUnitResolution;
+    int volumeUnitDegree;
+    float volumeUnitSize;
+    bool zFirstMemOrder;
+    Vec4i volStrides;
+};
+
+//template<typename T>
+Ptr<HashTSDFVolume> makeHashTSDFVolume(const VolumeParams& _volumeParams);
+//template<typename T>
+Ptr<HashTSDFVolume> makeHashTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                                       int _maxWeight, float truncateThreshold, int volumeUnitResolution = 16);
+
+}  // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/src/rgbd/kinfu_frame.cpp b/modules/3d/src/rgbd/kinfu_frame.cpp
new file mode 100644
index 0000000000..8c9f7418d1
--- /dev/null
+++ b/modules/3d/src/rgbd/kinfu_frame.cpp
@@ -0,0 +1,782 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#include "../precomp.hpp"
+#include "utils.hpp"
+#include "opencl_kernels_3d.hpp"
+
+namespace cv {
+
+static void computePointsNormals(const cv::Intr, float depthFactor, const Depth, Points, Normals );
+void computePointsNormalsColors(const Intr, const Intr, float, const Depth, const Colors, Points, Normals, Colors);
+static Depth pyrDownBilateral(const Depth depth, float sigma);
+static void pyrDownPointsNormals(const Points p, const Normals n, Points& pdown, Normals& ndown);
+
+template<int p>
+inline float specPow(float x)
+{
+    if(p % 2 == 0)
+    {
+        float v = specPow<p/2>(x);
+        return v*v;
+    }
+    else
+    {
+        float v = specPow<(p-1)/2>(x);
+        return v*v*x;
+    }
+}
+
+template<>
+inline float specPow<0>(float /*x*/)
+{
+    return 1.f;
+}
+
+template<>
+inline float specPow<1>(float x)
+{
+    return x;
+}
+
+
+struct RenderInvoker : ParallelLoopBody
+{
+    RenderInvoker(const Points& _points, const Normals& _normals, Mat_<Vec4b>& _img, Vec3f _lightPt, Size _sz) :
+        ParallelLoopBody(),
+        points(_points),
+        normals(_normals),
+        img(_img),
+        lightPt(_lightPt),
+        sz(_sz)
+    { }
+
+    virtual void operator ()(const Range& range) const override
+    {
+        for(int y = range.start; y < range.end; y++)
+        {
+            Vec4b* imgRow = img[y];
+            const ptype* ptsRow = points[y];
+            const ptype* nrmRow = normals[y];
+
+            for(int x = 0; x < sz.width; x++)
+            {
+                Point3f p = fromPtype(ptsRow[x]);
+                Point3f n = fromPtype(nrmRow[x]);
+
+                Vec4b color;
+
+                if(isNaN(p))
+                {
+                    color = Vec4b(0, 32, 0, 0);
+                }
+                else
+                {
+                    const float Ka = 0.3f;  //ambient coeff
+                    const float Kd = 0.5f;  //diffuse coeff
+                    const float Ks = 0.2f;  //specular coeff
+                    const int   sp = 20;  //specular power
+
+                    const float Ax = 1.f;   //ambient color,  can be RGB
+                    const float Dx = 1.f;   //diffuse color,  can be RGB
+                    const float Sx = 1.f;   //specular color, can be RGB
+                    const float Lx = 1.f;   //light color
+
+                    Point3f l = normalize(lightPt - Vec3f(p));
+                    Point3f v = normalize(-Vec3f(p));
+                    Point3f r = normalize(Vec3f(2.f*n*n.dot(l) - l));
+
+                    uchar ix = (uchar)((Ax*Ka*Dx + Lx*Kd*Dx*max(0.f, n.dot(l)) +
+                                        Lx*Ks*Sx*specPow<sp>(max(0.f, r.dot(v))))*255.f);
+                    color = Vec4b(ix, ix, ix, 0);
+                }
+
+                imgRow[x] = color;
+            }
+        }
+    }
+
+    const Points& points;
+    const Normals& normals;
+    Mat_<Vec4b>& img;
+    Vec3f lightPt;
+    Size sz;
+};
+
+struct RenderColorInvoker : ParallelLoopBody
+{
+    RenderColorInvoker(const Points& _points, const Colors& _colors, Mat_<Vec4b>& _img, Size _sz) :
+        ParallelLoopBody(),
+        points(_points),
+        colors(_colors),
+        img(_img),
+        sz(_sz)
+    { }
+
+    virtual void operator ()(const Range& range) const override
+    {
+        for(int y = range.start; y < range.end; y++)
+        {
+            Vec4b* imgRow = img[y];
+            const ptype* ptsRow = points[y];
+            const ptype* clrRow = colors[y];
+
+            for(int x = 0; x < sz.width; x++)
+            {
+                Point3f p = fromPtype(ptsRow[x]);
+                Point3f c = fromPtype(clrRow[x]);
+                Vec4b color;
+
+                if(isNaN(p) || isNaN(c))
+                {
+                    color = Vec4b(0, 32, 0, 0);
+                }
+                else
+                {
+                    color = Vec4b((uchar)c.x, (uchar)c.y, (uchar)c.z, (uchar)0);
+                }
+
+                imgRow[x] = color;
+            }
+        }
+    }
+
+    const Points& points;
+    const Colors& colors;
+    Mat_<Vec4b>& img;
+    Size sz;
+};
+
+
+void pyrDownPointsNormals(const Points p, const Normals n, Points &pdown, Normals &ndown)
+{
+    CV_TRACE_FUNCTION();
+
+    for(int y = 0; y < pdown.rows; y++)
+    {
+        ptype* ptsRow = pdown[y];
+        ptype* nrmRow = ndown[y];
+        const ptype* pUpRow0 = p[2*y];
+        const ptype* pUpRow1 = p[2*y+1];
+        const ptype* nUpRow0 = n[2*y];
+        const ptype* nUpRow1 = n[2*y+1];
+        for(int x = 0; x < pdown.cols; x++)
+        {
+            Point3f point = nan3, normal = nan3;
+
+            Point3f d00 = fromPtype(pUpRow0[2*x]);
+            Point3f d01 = fromPtype(pUpRow0[2*x+1]);
+            Point3f d10 = fromPtype(pUpRow1[2*x]);
+            Point3f d11 = fromPtype(pUpRow1[2*x+1]);
+
+            if(!(isNaN(d00) || isNaN(d01) || isNaN(d10) || isNaN(d11)))
+            {
+                point = (d00 + d01 + d10 + d11)*0.25f;
+
+                Point3f n00 = fromPtype(nUpRow0[2*x]);
+                Point3f n01 = fromPtype(nUpRow0[2*x+1]);
+                Point3f n10 = fromPtype(nUpRow1[2*x]);
+                Point3f n11 = fromPtype(nUpRow1[2*x+1]);
+
+                normal = (n00 + n01 + n10 + n11)*0.25f;
+            }
+
+            ptsRow[x] = toPtype(point);
+            nrmRow[x] = toPtype(normal);
+        }
+    }
+}
+
+struct PyrDownBilateralInvoker : ParallelLoopBody
+{
+    PyrDownBilateralInvoker(const Depth& _depth, Depth& _depthDown, float _sigma) :
+        ParallelLoopBody(),
+        depth(_depth),
+        depthDown(_depthDown),
+        sigma(_sigma)
+    { }
+
+    virtual void operator ()(const Range& range) const override
+    {
+        float sigma3 = sigma*3;
+        const int D = 5;
+
+        for(int y = range.start; y < range.end; y++)
+        {
+            depthType* downRow = depthDown[y];
+            const depthType* srcCenterRow = depth[2*y];
+
+            for(int x = 0; x < depthDown.cols; x++)
+            {
+                depthType center = srcCenterRow[2*x];
+
+                int sx = max(0, 2*x - D/2), ex = min(2*x - D/2 + D, depth.cols-1);
+                int sy = max(0, 2*y - D/2), ey = min(2*y - D/2 + D, depth.rows-1);
+
+                depthType sum = 0;
+                int count = 0;
+
+                for(int iy = sy; iy < ey; iy++)
+                {
+                    const depthType* srcRow = depth[iy];
+                    for(int ix = sx; ix < ex; ix++)
+                    {
+                        depthType val = srcRow[ix];
+                        if(abs(val - center) < sigma3)
+                        {
+                            sum += val; count ++;
+                        }
+                    }
+                }
+
+                downRow[x] = (count == 0) ? 0 : sum / count;
+            }
+        }
+    }
+
+    const Depth& depth;
+    Depth& depthDown;
+    float sigma;
+};
+
+
+Depth pyrDownBilateral(const Depth depth, float sigma)
+{
+    CV_TRACE_FUNCTION();
+
+    Depth depthDown(depth.rows/2, depth.cols/2);
+
+    PyrDownBilateralInvoker pdi(depth, depthDown, sigma);
+    Range range(0, depthDown.rows);
+    const int nstripes = -1;
+    parallel_for_(range, pdi, nstripes);
+
+    return depthDown;
+}
+
+struct ComputePointsNormalsInvoker : ParallelLoopBody
+{
+    ComputePointsNormalsInvoker(const Depth& _depth, Points& _points, Normals& _normals,
+                                const Intr::Reprojector& _reproj, float _dfac) :
+        ParallelLoopBody(),
+        depth(_depth),
+        points(_points),
+        normals(_normals),
+        reproj(_reproj),
+        dfac(_dfac)
+    { }
+
+    virtual void operator ()(const Range& range) const override
+    {
+        for(int y = range.start; y < range.end; y++)
+        {
+            const depthType* depthRow0 = depth[y];
+            const depthType* depthRow1 = (y < depth.rows - 1) ? depth[y + 1] : 0;
+            ptype    *ptsRow = points[y];
+            ptype   *normRow = normals[y];
+
+            for(int x = 0; x < depth.cols; x++)
+            {
+                depthType d00 = depthRow0[x];
+                depthType z00 = d00*dfac;
+                Point3f v00 = reproj(Point3f((float)x, (float)y, z00));
+
+                Point3f p = nan3, n = nan3;
+
+                if(x < depth.cols - 1 && y < depth.rows - 1)
+                {
+                    depthType d01 = depthRow0[x+1];
+                    depthType d10 = depthRow1[x];
+
+                    depthType z01 = d01*dfac;
+                    depthType z10 = d10*dfac;
+
+                    // before it was
+                    //if(z00*z01*z10 != 0)
+                    if(z00 != 0 && z01 != 0 && z10 != 0)
+                    {
+                        Point3f v01 = reproj(Point3f((float)(x+1), (float)(y+0), z01));
+                        Point3f v10 = reproj(Point3f((float)(x+0), (float)(y+1), z10));
+
+                        cv::Vec3f vec = (v01-v00).cross(v10-v00);
+                        n = -normalize(vec);
+                        p = v00;
+                    }
+                }
+
+                ptsRow[x] = toPtype(p);
+                normRow[x] = toPtype(n);
+            }
+        }
+    }
+
+    const Depth& depth;
+    Points& points;
+    Normals& normals;
+    const Intr::Reprojector& reproj;
+    float dfac;
+};
+
+
+void computePointsNormals(const Intr intr, float depthFactor, const Depth depth,
+                          Points points, Normals normals)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(!points.empty() && !normals.empty());
+    CV_Assert(depth.size() == points.size());
+    CV_Assert(depth.size() == normals.size());
+
+    // conversion to meters
+    // before it was:
+    //float dfac = 0.001f/depthFactor;
+    float dfac = 1.f/depthFactor;
+
+    Intr::Reprojector reproj = intr.makeReprojector();
+
+    ComputePointsNormalsInvoker ci(depth, points, normals, reproj, dfac);
+    Range range(0, depth.rows);
+    const int nstripes = -1;
+    parallel_for_(range, ci, nstripes);
+}
+
+
+///////// GPU implementation /////////
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_renderPointsNormals(const UMat points, const UMat normals, UMat image, Vec3f lightLoc);
+static bool ocl_makeFrameFromDepth(const UMat depth, OutputArrayOfArrays points, OutputArrayOfArrays normals,
+                                   const Intr intr, int levels, float depthFactor,
+                                   float sigmaDepth, float sigmaSpatial, int kernelSize,
+                                   float truncateThreshold);
+static bool ocl_buildPyramidPointsNormals(const UMat points, const UMat normals,
+                                          OutputArrayOfArrays pyrPoints, OutputArrayOfArrays pyrNormals,
+                                          int levels);
+
+static bool computePointsNormalsGpu(const Intr intr, float depthFactor, const UMat& depth, UMat& points, UMat& normals);
+static bool pyrDownBilateralGpu(const UMat& depth, UMat& depthDown, float sigma);
+static bool customBilateralFilterGpu(const UMat src, UMat& dst, int kernelSize, float sigmaDepth, float sigmaSpatial);
+static bool pyrDownPointsNormalsGpu(const UMat p, const UMat n, UMat &pdown, UMat &ndown);
+
+
+bool computePointsNormalsGpu(const Intr intr, float depthFactor, const UMat& depth,
+                             UMat& points, UMat& normals)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(!points.empty() && !normals.empty());
+    CV_Assert(depth.size() == points.size());
+    CV_Assert(depth.size() == normals.size());
+    CV_Assert(depth.type() == DEPTH_TYPE);
+    CV_Assert(points.type()  == POINT_TYPE);
+    CV_Assert(normals.type() == POINT_TYPE);
+
+    // conversion to meters
+    float dfac = 1.f/depthFactor;
+
+    Intr::Reprojector reproj = intr.makeReprojector();
+
+    cv::String errorStr;
+    cv::String name = "computePointsNormals";
+    ocl::ProgramSource source = ocl::_3d::kinfu_frame_oclsrc;
+    cv::String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if(k.empty())
+        return false;
+
+    Vec2f fxyinv(reproj.fxinv, reproj.fyinv), cxy(reproj.cx, reproj.cy);
+
+    k.args(ocl::KernelArg::WriteOnlyNoSize(points),
+           ocl::KernelArg::WriteOnlyNoSize(normals),
+           ocl::KernelArg::ReadOnly(depth),
+           fxyinv.val,
+           cxy.val,
+           dfac);
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)depth.cols;
+    globalSize[1] = (size_t)depth.rows;
+
+    return k.run(2, globalSize, NULL, true);
+}
+
+
+bool pyrDownBilateralGpu(const UMat& depth, UMat& depthDown, float sigma)
+{
+    CV_TRACE_FUNCTION();
+
+    depthDown.create(depth.rows/2, depth.cols/2, DEPTH_TYPE);
+
+    cv::String errorStr;
+    cv::String name = "pyrDownBilateral";
+    ocl::ProgramSource source = ocl::_3d::kinfu_frame_oclsrc;
+    cv::String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if(k.empty())
+        return false;
+
+    k.args(ocl::KernelArg::ReadOnly(depth),
+           ocl::KernelArg::WriteOnly(depthDown),
+           sigma);
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)depthDown.cols;
+    globalSize[1] = (size_t)depthDown.rows;
+
+    return k.run(2, globalSize, NULL, true);
+}
+
+//TODO: remove it when OpenCV's bilateral processes 32f on GPU
+bool customBilateralFilterGpu(const UMat src /* udepth */, UMat& dst /* smooth */,
+                              int kernelSize, float sigmaDepth, float sigmaSpatial)
+{
+    CV_TRACE_FUNCTION();
+
+    Size frameSize = src.size();
+
+    CV_Assert(frameSize.area() > 0);
+    CV_Assert(src.type() == DEPTH_TYPE);
+
+    dst.create(frameSize, DEPTH_TYPE);
+
+    cv::String errorStr;
+    cv::String name = "customBilateral";
+    ocl::ProgramSource source = ocl::_3d::kinfu_frame_oclsrc;
+    cv::String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if(k.empty())
+        return false;
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(src),
+           ocl::KernelArg::WriteOnlyNoSize(dst),
+           frameSize,
+           kernelSize,
+           0.5f / (sigmaSpatial * sigmaSpatial),
+           0.5f / (sigmaDepth * sigmaDepth));
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)src.cols;
+    globalSize[1] = (size_t)src.rows;
+
+    return k.run(2, globalSize, NULL, true);
+}
+
+
+bool pyrDownPointsNormalsGpu(const UMat p, const UMat n, UMat &pdown, UMat &ndown)
+{
+    CV_TRACE_FUNCTION();
+
+    cv::String errorStr;
+    cv::String name = "pyrDownPointsNormals";
+    ocl::ProgramSource source = ocl::_3d::kinfu_frame_oclsrc;
+    cv::String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if(k.empty())
+        return false;
+
+    Size downSize = pdown.size();
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(p),
+           ocl::KernelArg::ReadOnlyNoSize(n),
+           ocl::KernelArg::WriteOnlyNoSize(pdown),
+           ocl::KernelArg::WriteOnlyNoSize(ndown),
+           downSize);
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)pdown.cols;
+    globalSize[1] = (size_t)pdown.rows;
+
+    return k.run(2, globalSize, NULL, true);
+}
+
+
+static bool ocl_renderPointsNormals(const UMat points, const UMat normals,
+                                    UMat img, Vec3f lightLoc)
+{
+    CV_TRACE_FUNCTION();
+
+    cv::String errorStr;
+    cv::String name = "render";
+    ocl::ProgramSource source = ocl::_3d::kinfu_frame_oclsrc;
+    cv::String options = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if(k.empty())
+        return false;
+
+    Vec4f lightPt(lightLoc[0], lightLoc[1], lightLoc[2]);
+    Size frameSize = points.size();
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(points),
+           ocl::KernelArg::ReadOnlyNoSize(normals),
+           ocl::KernelArg::WriteOnlyNoSize(img),
+           frameSize,
+           lightPt.val);
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)points.cols;
+    globalSize[1] = (size_t)points.rows;
+
+    return k.run(2, globalSize, NULL, true);
+}
+
+
+static bool ocl_makeFrameFromDepth(const UMat depth, OutputArrayOfArrays points, OutputArrayOfArrays normals,
+                                   const Intr intr, int levels, float depthFactor,
+                                   float sigmaDepth, float sigmaSpatial, int kernelSize,
+                                   float truncateThreshold)
+{
+    CV_TRACE_FUNCTION();
+
+    // looks like OpenCV's bilateral filter works the same as KinFu's
+    UMat smooth;
+    //TODO: fix that
+    // until 32f isn't implemented in OpenCV in OpenCL, we should use our workarounds
+    //bilateralFilter(udepth, smooth, kernelSize, sigmaDepth*depthFactor, sigmaSpatial);
+    if(!customBilateralFilterGpu(depth, smooth, kernelSize, sigmaDepth*depthFactor, sigmaSpatial))
+        return false;
+
+    // depth truncation can be used in some scenes
+    UMat depthThreshold;
+    if(truncateThreshold > 0.f)
+        threshold(smooth, depthThreshold, truncateThreshold * depthFactor, 0.0, THRESH_TOZERO_INV);
+    else
+        depthThreshold = smooth;
+
+    UMat scaled = depthThreshold;
+    Size sz = smooth.size();
+    points.create(levels, 1, POINT_TYPE);
+    normals.create(levels, 1, POINT_TYPE);
+    for(int i = 0; i < levels; i++)
+    {
+        UMat& p = points.getUMatRef(i);
+        UMat& n = normals.getUMatRef(i);
+        p.create(sz, POINT_TYPE);
+        n.create(sz, POINT_TYPE);
+
+        if(!computePointsNormalsGpu(intr.scale(i), depthFactor, scaled, p, n))
+            return false;
+
+        if(i < levels - 1)
+        {
+            sz.width /= 2, sz.height /= 2;
+            UMat halfDepth(sz, DEPTH_TYPE);
+            pyrDownBilateralGpu(scaled, halfDepth, sigmaDepth*depthFactor);
+            scaled = halfDepth;
+        }
+    }
+
+    return true;
+}
+
+
+static bool ocl_buildPyramidPointsNormals(const UMat points, const UMat normals,
+                                          OutputArrayOfArrays pyrPoints, OutputArrayOfArrays pyrNormals,
+                                          int levels)
+{
+    CV_TRACE_FUNCTION();
+
+    pyrPoints .create(levels, 1, POINT_TYPE);
+    pyrNormals.create(levels, 1, POINT_TYPE);
+
+    pyrPoints .getUMatRef(0) = points;
+    pyrNormals.getUMatRef(0) = normals;
+
+    Size sz = points.size();
+    for(int i = 1; i < levels; i++)
+    {
+        UMat p1 = pyrPoints .getUMat(i-1);
+        UMat n1 = pyrNormals.getUMat(i-1);
+
+        sz.width /= 2; sz.height /= 2;
+        UMat& p0 = pyrPoints .getUMatRef(i);
+        UMat& n0 = pyrNormals.getUMatRef(i);
+        p0.create(sz, POINT_TYPE);
+        n0.create(sz, POINT_TYPE);
+
+        if(!pyrDownPointsNormalsGpu(p1, n1, p0, n0))
+            return false;
+    }
+
+    return true;
+}
+
+#endif
+
+namespace detail {
+
+void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray image, cv::Vec3f lightLoc)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_points.size().area() > 0);
+    CV_Assert(_points.size() == _normals.size());
+
+    Size sz = _points.size();
+    image.create(sz, CV_8UC4);
+
+    CV_OCL_RUN(_points.isUMat() && _normals.isUMat() && image.isUMat(),
+               ocl_renderPointsNormals(_points.getUMat(),
+                                       _normals.getUMat(),
+                                       image.getUMat(), lightLoc))
+
+    Points  points  = _points.getMat();
+    Normals normals = _normals.getMat();
+
+    Mat_<Vec4b> img = image.getMat();
+
+    RenderInvoker ri(points, normals, img, lightLoc, sz);
+    Range range(0, sz.height);
+    const int nstripes = -1;
+    parallel_for_(range, ri, nstripes);
+}
+
+void renderPointsNormalsColors(InputArray _points, InputArray _normals, InputArray _colors, OutputArray image)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_points.size().area() > 0);
+    CV_Assert(_points.size() == _normals.size());
+
+    Size sz = _points.size();
+    image.create(sz, CV_8UC4);
+
+    Points  points  = _points.getMat();
+    Normals normals = _normals.getMat();
+    Colors colors = _colors.getMat();
+
+    Mat_<Vec4b> img = image.getMat();
+
+    RenderColorInvoker ri(points, colors, img, sz);
+    Range range(0, sz.height);
+    const int nstripes = -1;
+    parallel_for_(range, ri, nstripes);
+}
+
+} // namespace detail
+
+void makeFrameFromDepth(InputArray _depth,
+                        OutputArray pyrPoints, OutputArray pyrNormals,
+                        const Matx33f _intr, int levels, float depthFactor,
+                        float sigmaDepth, float sigmaSpatial, int kernelSize,
+                        float truncateThreshold)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+
+    Intr intr(_intr);
+    CV_OCL_RUN(_depth.isUMat() && pyrPoints.isUMatVector() && pyrNormals.isUMatVector(),
+               ocl_makeFrameFromDepth(_depth.getUMat(), pyrPoints, pyrNormals,
+                                      intr, levels, depthFactor,
+                                      sigmaDepth, sigmaSpatial, kernelSize,
+                                      truncateThreshold));
+
+    int kp = pyrPoints.kind(), kn = pyrNormals.kind();
+    // There can be UMats in the container (when OpenCL is off)
+    CV_Assert(kp == _InputArray::STD_ARRAY_MAT || kp == _InputArray::STD_VECTOR_MAT || kp == _InputArray::STD_VECTOR_UMAT);
+    CV_Assert(kn == _InputArray::STD_ARRAY_MAT || kn == _InputArray::STD_VECTOR_MAT || kp == _InputArray::STD_VECTOR_UMAT);
+
+    Depth depth = _depth.getMat();
+
+    // looks like OpenCV's bilateral filter works the same as KinFu's
+    Depth smooth;
+    Depth depthNoNans = depth.clone();
+    patchNaNs(depthNoNans);
+    bilateralFilter(depthNoNans, smooth, kernelSize, sigmaDepth * depthFactor, sigmaSpatial);
+
+    // depth truncation can be used in some scenes
+    Depth depthThreshold;
+    if(truncateThreshold > 0.f)
+        threshold(smooth, depthThreshold, truncateThreshold * depthFactor, 0.0, THRESH_TOZERO_INV);
+    else
+        depthThreshold = smooth;
+
+    // we don't need depth pyramid outside this method
+    // if we do, the code is to be refactored
+
+    Depth scaled = depthThreshold;
+    Size sz = smooth.size();
+    pyrPoints.create(levels, 1, POINT_TYPE);
+    pyrNormals.create(levels, 1, POINT_TYPE);
+    for(int i = 0; i < levels; i++)
+    {
+        pyrPoints .create(sz, POINT_TYPE, i);
+        pyrNormals.create(sz, POINT_TYPE, i);
+
+        // There can be UMats in the container (when OpenCL is off),
+        // getMatRef() is not applicable
+        Points  p = pyrPoints. getMat(i);
+        Normals n = pyrNormals.getMat(i);
+
+        computePointsNormals(intr.scale(i), depthFactor, scaled, p, n);
+
+        if(i < levels - 1)
+        {
+            sz.width /= 2; sz.height /= 2;
+            scaled = pyrDownBilateral(scaled, sigmaDepth*depthFactor);
+        }
+    }
+}
+
+
+void buildPyramidPointsNormals(InputArray _points, InputArray _normals,
+                               OutputArrayOfArrays pyrPoints, OutputArrayOfArrays pyrNormals,
+                               int levels)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_points.type() == POINT_TYPE);
+    CV_Assert(_points.type() == _normals.type());
+    CV_Assert(_points.size() == _normals.size());
+
+    CV_OCL_RUN(_points.isUMat() && _normals.isUMat() &&
+               pyrPoints.isUMatVector() && pyrNormals.isUMatVector(),
+               ocl_buildPyramidPointsNormals(_points.getUMat(), _normals.getUMat(),
+                                             pyrPoints, pyrNormals,
+                                             levels));
+
+    int kp = pyrPoints.kind(), kn = pyrNormals.kind();
+    CV_Assert(kp == _InputArray::STD_ARRAY_MAT || kp == _InputArray::STD_VECTOR_MAT);
+    CV_Assert(kn == _InputArray::STD_ARRAY_MAT || kn == _InputArray::STD_VECTOR_MAT);
+
+    Mat p0 = _points.getMat(), n0 = _normals.getMat();
+
+    pyrPoints .create(levels, 1, POINT_TYPE);
+    pyrNormals.create(levels, 1, POINT_TYPE);
+
+    pyrPoints .getMatRef(0) = p0;
+    pyrNormals.getMatRef(0) = n0;
+
+    Size sz = _points.size();
+    for(int i = 1; i < levels; i++)
+    {
+        Points  p1 = pyrPoints .getMat(i-1);
+        Normals n1 = pyrNormals.getMat(i-1);
+
+        sz.width /= 2; sz.height /= 2;
+
+        pyrPoints .create(sz, POINT_TYPE, i);
+        pyrNormals.create(sz, POINT_TYPE, i);
+        Points  pd = pyrPoints. getMatRef(i);
+        Normals nd = pyrNormals.getMatRef(i);
+
+        pyrDownPointsNormals(p1, n1, pd, nd);
+    }
+}
+
+} // namespace cv
diff --git a/modules/3d/src/rgbd/normal.cpp b/modules/3d/src/rgbd/normal.cpp
new file mode 100644
index 0000000000..8340935a1a
--- /dev/null
+++ b/modules/3d/src/rgbd/normal.cpp
@@ -0,0 +1,777 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../precomp.hpp"
+
+namespace cv
+{
+
+/** Just compute the norm of a vector
+ * @param vec a vector of size 3 and any type T
+ * @return
+ */
+template<typename T>
+T inline norm_vec(const Vec<T, 3>& vec)
+{
+    return std::sqrt(vec[0] * vec[0] + vec[1] * vec[1] + vec[2] * vec[2]);
+}
+
+/** Given 3d points, compute their distance to the origin
+ * @param points
+ * @return
+ */
+template<typename T>
+Mat_<T> computeRadius(const Mat& points)
+{
+    typedef Vec<T, 3> PointT;
+
+    // Compute the
+    Size size(points.cols, points.rows);
+    Mat_<T> r(size);
+    if (points.isContinuous())
+        size = Size(points.cols * points.rows, 1);
+    for (int y = 0; y < size.height; ++y)
+    {
+        const PointT* point = points.ptr < PointT >(y), * point_end = points.ptr < PointT >(y) + size.width;
+        T* row = r[y];
+        for (; point != point_end; ++point, ++row)
+            *row = norm_vec(*point);
+    }
+
+    return r;
+}
+
+// Compute theta and phi according to equation 3 of
+// ``Fast and Accurate Computation of Surface Normals from Range Images``
+// by H. Badino, D. Huber, Y. Park and T. Kanade
+template<typename T>
+void computeThetaPhi(int rows, int cols, const Matx<T, 3, 3>& K, Mat& cos_theta, Mat& sin_theta,
+                     Mat& cos_phi, Mat& sin_phi)
+{
+    // Create some bogus coordinates
+    Mat depth_image = K(0, 0) * Mat_<T> ::ones(rows, cols);
+    Mat points3d;
+    depthTo3d(depth_image, Mat(K), points3d);
+
+    typedef Vec<T, 3> Vec3T;
+
+    cos_theta = Mat_<T>(rows, cols);
+    sin_theta = Mat_<T>(rows, cols);
+    cos_phi = Mat_<T>(rows, cols);
+    sin_phi = Mat_<T>(rows, cols);
+    Mat r = computeRadius<T>(points3d);
+    for (int y = 0; y < rows; ++y)
+    {
+        T* row_cos_theta = cos_theta.ptr <T>(y), * row_sin_theta = sin_theta.ptr <T>(y);
+        T* row_cos_phi = cos_phi.ptr <T>(y), * row_sin_phi = sin_phi.ptr <T>(y);
+        const Vec3T* row_points = points3d.ptr <Vec3T>(y),
+               * row_points_end = points3d.ptr <Vec3T>(y) + points3d.cols;
+        const T* row_r = r.ptr < T >(y);
+        for (; row_points < row_points_end;
+            ++row_cos_theta, ++row_sin_theta, ++row_cos_phi, ++row_sin_phi, ++row_points, ++row_r)
+        {
+            // In the paper, z goes away from the camera, y goes down, x goes right
+            // OpenCV has the same conventions
+            // Theta goes from z to x (and actually goes from -pi/2 to pi/2, phi goes from z to y
+            float theta = (float)std::atan2(row_points->val[0], row_points->val[2]);
+            *row_cos_theta = std::cos(theta);
+            *row_sin_theta = std::sin(theta);
+            float phi = (float)std::asin(row_points->val[1] / (*row_r));
+            *row_cos_phi = std::cos(phi);
+            *row_sin_phi = std::sin(phi);
+        }
+    }
+}
+
+/** Modify normals to make sure they point towards the camera
+ * @param normals
+ */
+template<typename T>
+inline void signNormal(const Vec<T, 3>& normal_in, Vec<T, 3>& normal_out)
+{
+    Vec<T, 3> res;
+    if (normal_in[2] > 0)
+        res = -normal_in / norm_vec(normal_in);
+    else
+        res = normal_in / norm_vec(normal_in);
+
+    normal_out[0] = res[0];
+    normal_out[1] = res[1];
+    normal_out[2] = res[2];
+}
+
+/** Modify normals to make sure they point towards the camera
+ * @param normals
+ */
+template<typename T>
+inline void signNormal(T a, T b, T c, Vec<T, 3>& normal)
+{
+    T norm = 1 / std::sqrt(a * a + b * b + c * c);
+    if (c > 0)
+    {
+        normal[0] = -a * norm;
+        normal[1] = -b * norm;
+        normal[2] = -c * norm;
+    }
+    else
+    {
+        normal[0] = a * norm;
+        normal[1] = b * norm;
+        normal[2] = c * norm;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+class RgbdNormalsImpl : public RgbdNormals
+{
+public:
+    static const int dtype = cv::traits::Depth<T>::value;
+
+    RgbdNormalsImpl(int _rows, int _cols, int _windowSize, const Mat& _K, RgbdNormals::RgbdNormalsMethod _method) :
+        rows(_rows),
+        cols(_cols),
+        windowSize(_windowSize),
+        method(_method),
+        cacheIsDirty(true)
+    {
+        CV_Assert(_K.cols == 3 && _K.rows == 3);
+
+        _K.convertTo(K, dtype);
+        _K.copyTo(K_ori);
+    }
+
+    virtual ~RgbdNormalsImpl() CV_OVERRIDE
+    { }
+
+    virtual int getDepth() const CV_OVERRIDE
+    {
+        return dtype;
+    }
+    virtual int getRows() const CV_OVERRIDE
+    {
+        return rows;
+    }
+    virtual void setRows(int val) CV_OVERRIDE
+    {
+        rows = val; cacheIsDirty = true;
+    }
+    virtual int getCols() const CV_OVERRIDE
+    {
+        return cols;
+    }
+    virtual void setCols(int val) CV_OVERRIDE
+    {
+        cols = val; cacheIsDirty = true;
+    }
+    virtual int getWindowSize() const CV_OVERRIDE
+    {
+        return windowSize;
+    }
+    virtual void setWindowSize(int val) CV_OVERRIDE
+    {
+        windowSize = val; cacheIsDirty = true;
+    }
+    virtual void getK(OutputArray val) const CV_OVERRIDE
+    {
+        K.copyTo(val);
+    }
+    virtual void setK(InputArray val) CV_OVERRIDE
+    {
+        K = val.getMat(); cacheIsDirty = true;
+    }
+    virtual RgbdNormalsMethod getMethod() const CV_OVERRIDE
+    {
+        return method;
+    }
+    virtual void setMethod(RgbdNormalsMethod val) CV_OVERRIDE
+    {
+        method = val; cacheIsDirty = true;
+    }
+
+    // Helper functions for apply()
+    virtual void assertOnBadArg(const Mat& points3d_ori) const = 0;
+    virtual void calcRadiusAnd3d(const Mat& points3d_ori, Mat& points3d, Mat& radius) const
+    {
+        // Make the points have the right depth
+        if (points3d_ori.depth() == dtype)
+            points3d = points3d_ori;
+        else
+            points3d_ori.convertTo(points3d, dtype);
+
+        // Compute the distance to the points
+        radius = computeRadius<T>(points3d);
+    }
+    virtual void compute(const Mat& in, Mat& normals) const = 0;
+
+    /** Given a set of 3d points in a depth image, compute the normals at each point
+     * @param points3d_in depth a float depth image. Or it can be rows x cols x 3 is they are 3d points
+     * @param normals a rows x cols x 3 matrix
+     */
+    virtual void apply(InputArray points3d_in, OutputArray normals_out) const CV_OVERRIDE
+    {
+        Mat points3d_ori = points3d_in.getMat();
+
+        CV_Assert(points3d_ori.dims == 2);
+
+        // Either we have 3d points or a depth image
+        assertOnBadArg(points3d_ori);
+
+        // Initialize the pimpl
+        cache();
+
+        // Precompute something for RGBD_NORMALS_METHOD_SRI and RGBD_NORMALS_METHOD_FALS
+        Mat points3d, radius;
+        calcRadiusAnd3d(points3d_ori, points3d, radius);
+
+        // Get the normals
+        normals_out.create(points3d_ori.size(), CV_MAKETYPE(dtype, 3));
+        if (points3d_in.empty())
+            return;
+
+        Mat normals = normals_out.getMat();
+        if ((method == RGBD_NORMALS_METHOD_FALS) || (method == RGBD_NORMALS_METHOD_SRI))
+        {
+            compute(radius, normals);
+        }
+        else // LINEMOD
+        {
+            compute(points3d_ori, normals);
+        }
+    }
+
+    int rows, cols;
+    Mat K, K_ori;
+    int windowSize;
+    RgbdNormalsMethod method;
+    mutable bool cacheIsDirty;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Given a set of 3d points in a depth image, compute the normals at each point
+ * using the FALS method described in
+ * ``Fast and Accurate Computation of Surface Normals from Range Images``
+ * by H. Badino, D. Huber, Y. Park and T. Kanade
+ */
+template<typename T>
+class FALS : public RgbdNormalsImpl<T>
+{
+public:
+    typedef Matx<T, 3, 3> Mat33T;
+    typedef Vec<T, 9> Vec9T;
+    typedef Vec<T, 3> Vec3T;
+
+    FALS(int _rows, int _cols, int _windowSize, const Mat& _K) :
+        RgbdNormalsImpl<T>(_rows, _cols, _windowSize, _K, RgbdNormals::RGBD_NORMALS_METHOD_FALS)
+    { }
+    virtual ~FALS() CV_OVERRIDE
+    { }
+
+    /** Compute cached data
+     */
+    virtual void cache() const CV_OVERRIDE
+    {
+        if (!this->cacheIsDirty)
+            return;
+
+        // Compute theta and phi according to equation 3
+        Mat cos_theta, sin_theta, cos_phi, sin_phi;
+        computeThetaPhi<T>(this->rows, this->cols, this->K, cos_theta, sin_theta, cos_phi, sin_phi);
+
+        // Compute all the v_i for every points
+        std::vector<Mat> channels(3);
+        channels[0] = sin_theta.mul(cos_phi);
+        channels[1] = sin_phi;
+        channels[2] = cos_theta.mul(cos_phi);
+        merge(channels, V_);
+
+        // Compute M
+        Mat_<Vec9T> M(this->rows, this->cols);
+        Mat33T VVt;
+        const Vec3T* vec = V_[0];
+        Vec9T* M_ptr = M[0], * M_ptr_end = M_ptr + this->rows * this->cols;
+        for (; M_ptr != M_ptr_end; ++vec, ++M_ptr)
+        {
+            VVt = (*vec) * vec->t();
+            *M_ptr = Vec9T(VVt.val);
+        }
+
+        boxFilter(M, M, M.depth(), Size(this->windowSize, this->windowSize), Point(-1, -1), false);
+
+        // Compute M's inverse
+        Mat33T M_inv;
+        M_inv_.create(this->rows, this->cols);
+        Vec9T* M_inv_ptr = M_inv_[0];
+        for (M_ptr = &M(0); M_ptr != M_ptr_end; ++M_inv_ptr, ++M_ptr)
+        {
+            // We have a semi-definite matrix
+            invert(Mat33T(M_ptr->val), M_inv, DECOMP_CHOLESKY);
+            *M_inv_ptr = Vec9T(M_inv.val);
+        }
+
+        this->cacheIsDirty = false;
+    }
+
+    /** Compute the normals
+     * @param r
+     * @return
+     */
+    virtual void compute(const Mat& r, Mat& normals) const CV_OVERRIDE
+    {
+        // Compute B
+        Mat_<Vec3T> B(this->rows, this->cols);
+
+        const T* row_r = r.ptr < T >(0), * row_r_end = row_r + this->rows * this->cols;
+        const Vec3T* row_V = V_[0];
+        Vec3T* row_B = B[0];
+        for (; row_r != row_r_end; ++row_r, ++row_B, ++row_V)
+        {
+            Vec3T val = (*row_V) / (*row_r);
+            if (cvIsInf(val[0]) || cvIsNaN(val[0]) ||
+                cvIsInf(val[1]) || cvIsNaN(val[1]) ||
+                cvIsInf(val[2]) || cvIsNaN(val[2]))
+                *row_B = Vec3T();
+            else
+                *row_B = val;
+        }
+
+        // Apply a box filter to B
+        boxFilter(B, B, B.depth(), Size(this->windowSize, this->windowSize), Point(-1, -1), false);
+
+        // compute the Minv*B products
+        row_r = r.ptr < T >(0);
+        const Vec3T* B_vec = B[0];
+        const Mat33T* M_inv = reinterpret_cast<const Mat33T*>(M_inv_.ptr(0));
+        Vec3T* normal = normals.ptr<Vec3T>(0);
+        for (; row_r != row_r_end; ++row_r, ++B_vec, ++normal, ++M_inv)
+            if (cvIsNaN(*row_r))
+            {
+                (*normal)[0] = *row_r;
+                (*normal)[1] = *row_r;
+                (*normal)[2] = *row_r;
+            }
+            else
+            {
+                Mat33T Mr = *M_inv;
+                Vec3T Br = *B_vec;
+                Vec3T MBr(Mr(0, 0) * Br[0] + Mr(0, 1) * Br[1] + Mr(0, 2) * Br[2],
+                          Mr(1, 0) * Br[0] + Mr(1, 1) * Br[1] + Mr(1, 2) * Br[2],
+                          Mr(2, 0) * Br[0] + Mr(2, 1) * Br[1] + Mr(2, 2) * Br[2]);
+                signNormal(MBr, *normal);
+            }
+    }
+
+    virtual void assertOnBadArg(const Mat& points3d_ori) const CV_OVERRIDE
+    {
+        CV_Assert(points3d_ori.channels() == 3);
+        CV_Assert(points3d_ori.depth() == CV_32F || points3d_ori.depth() == CV_64F);
+    }
+
+    // Cached data
+    mutable Mat_<Vec3T> V_;
+    mutable Mat_<Vec9T> M_inv_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Function that multiplies K_inv by a vector. It is just meant to speed up the product as we know
+ * that K_inv is upper triangular and K_inv(2,2)=1
+ * @param K_inv
+ * @param a
+ * @param b
+ * @param c
+ * @param res
+ */
+template<typename T, typename U>
+void multiply_by_K_inv(const Matx<T, 3, 3>& K_inv, U a, U b, U c, Vec<T, 3>& res)
+{
+    res[0] = (T)(K_inv(0, 0) * a + K_inv(0, 1) * b + K_inv(0, 2) * c);
+    res[1] = (T)(K_inv(1, 1) * b + K_inv(1, 2) * c);
+    res[2] = (T)c;
+}
+
+/** Given a depth image, compute the normals as detailed in the LINEMOD paper
+ * ``Gradient Response Maps for Real-Time Detection of Texture-Less Objects``
+ * by S. Hinterstoisser, C. Cagniart, S. Ilic, P. Sturm, N. Navab, P. Fua, and V. Lepetit
+ */
+template<typename T>
+class LINEMOD : public RgbdNormalsImpl<T>
+{
+public:
+    typedef Vec<T, 3> Vec3T;
+    typedef Matx<T, 3, 3> Mat33T;
+
+    LINEMOD(int _rows, int _cols, int _windowSize, const Mat& _K) :
+        RgbdNormalsImpl<T>(_rows, _cols, _windowSize, _K, RgbdNormals::RGBD_NORMALS_METHOD_LINEMOD)
+    { }
+
+    /** Compute cached data
+     */
+    virtual void cache() const CV_OVERRIDE
+    { }
+
+    /** Compute the normals
+     * @param r
+     * @param normals the output normals
+     */
+    virtual void compute(const Mat& points3d, Mat& normals) const CV_OVERRIDE
+    {
+        // Only focus on the depth image for LINEMOD
+        Mat depth_in;
+        if (points3d.channels() == 3)
+        {
+            std::vector<Mat> channels;
+            split(points3d, channels);
+            depth_in = channels[2];
+        }
+        else
+            depth_in = points3d;
+
+        switch (depth_in.depth())
+        {
+        case CV_16U:
+        {
+            const Mat_<unsigned short>& d(depth_in);
+            computeImpl<unsigned short, long>(d, normals);
+            break;
+        }
+        case CV_32F:
+        {
+            const Mat_<float>& d(depth_in);
+            computeImpl<float, float>(d, normals);
+            break;
+        }
+        case CV_64F:
+        {
+            const Mat_<double>& d(depth_in);
+            computeImpl<double, double>(d, normals);
+            break;
+        }
+        }
+    }
+
+    /** Compute the normals
+     * @param r
+     * @return
+     */
+    template<typename DepthDepth, typename ContainerDepth>
+    Mat computeImpl(const Mat_<DepthDepth>& depthIn, Mat& normals) const
+    {
+        const int r = 5; // used to be 7
+        const int sample_step = r;
+        const int square_size = ((2 * r / sample_step) + 1);
+        long offsets[square_size * square_size];
+        long offsets_x[square_size * square_size];
+        long offsets_y[square_size * square_size];
+        long offsets_x_x[square_size * square_size];
+        long offsets_x_y[square_size * square_size];
+        long offsets_y_y[square_size * square_size];
+        for (int j = -r, index = 0; j <= r; j += sample_step)
+            for (int i = -r; i <= r; i += sample_step, ++index)
+            {
+                offsets_x[index] = i;
+                offsets_y[index] = j;
+                offsets_x_x[index] = i * i;
+                offsets_x_y[index] = i * j;
+                offsets_y_y[index] = j * j;
+                offsets[index] = j * this->cols + i;
+            }
+
+        // Define K_inv by hand, just for higher accuracy
+        Mat33T K_inv = Matx<T, 3, 3>::eye(), kmat;
+        this->K.copyTo(kmat);
+        K_inv(0, 0) = 1.0f / kmat(0, 0);
+        K_inv(0, 1) = -kmat(0, 1) / (kmat(0, 0) * kmat(1, 1));
+        K_inv(0, 2) = (kmat(0, 1) * kmat(1, 2) - kmat(0, 2) * kmat(1, 1)) / (kmat(0, 0) * kmat(1, 1));
+        K_inv(1, 1) = 1 / kmat(1, 1);
+        K_inv(1, 2) = -kmat(1, 2) / kmat(1, 1);
+
+        Vec3T X1_minus_X, X2_minus_X;
+
+        ContainerDepth difference_threshold = 50;
+        normals.setTo(std::numeric_limits<DepthDepth>::quiet_NaN());
+        for (int y = r; y < this->rows - r - 1; ++y)
+        {
+            const DepthDepth* p_line = reinterpret_cast<const DepthDepth*>(depthIn.ptr(y, r));
+            Vec3T* normal = normals.ptr<Vec3T>(y, r);
+
+            for (int x = r; x < this->cols - r - 1; ++x)
+            {
+                DepthDepth d = p_line[0];
+
+                // accum
+                long A[4];
+                A[0] = A[1] = A[2] = A[3] = 0;
+                ContainerDepth b[2];
+                b[0] = b[1] = 0;
+                for (unsigned int i = 0; i < square_size * square_size; ++i) {
+                    // We need to cast to ContainerDepth in case we have unsigned DepthDepth
+                    ContainerDepth delta = ContainerDepth(p_line[offsets[i]]) - ContainerDepth(d);
+                    if (std::abs(delta) > difference_threshold)
+                        continue;
+
+                    A[0] += offsets_x_x[i];
+                    A[1] += offsets_x_y[i];
+                    A[3] += offsets_y_y[i];
+                    b[0] += offsets_x[i] * delta;
+                    b[1] += offsets_y[i] * delta;
+                }
+
+                // solve for the optimal gradient D of equation (8)
+                long det = A[0] * A[3] - A[1] * A[1];
+                // We should divide the following two by det, but instead, we multiply
+                // X1_minus_X and X2_minus_X by det (which does not matter as we normalize the normals)
+                // Therefore, no division is done: this is only for speedup
+                ContainerDepth dx = (A[3] * b[0] - A[1] * b[1]);
+                ContainerDepth dy = (-A[1] * b[0] + A[0] * b[1]);
+
+                // Compute the dot product
+                //Vec3T X = K_inv * Vec3T(x, y, 1) * depth(y, x);
+                //Vec3T X1 = K_inv * Vec3T(x + 1, y, 1) * (depth(y, x) + dx);
+                //Vec3T X2 = K_inv * Vec3T(x, y + 1, 1) * (depth(y, x) + dy);
+                //Vec3T nor = (X1 - X).cross(X2 - X);
+                multiply_by_K_inv(K_inv, d * det + (x + 1) * dx, y * dx, dx, X1_minus_X);
+                multiply_by_K_inv(K_inv, x * dy, d * det + (y + 1) * dy, dy, X2_minus_X);
+                Vec3T nor = X1_minus_X.cross(X2_minus_X);
+                signNormal(nor, *normal);
+
+                ++p_line;
+                ++normal;
+            }
+        }
+
+        return normals;
+    }
+
+    virtual void assertOnBadArg(const Mat& points3d_ori) const CV_OVERRIDE
+    {
+        CV_Assert(((points3d_ori.channels() == 3) && (points3d_ori.depth() == CV_32F || points3d_ori.depth() == CV_64F)) ||
+                  ((points3d_ori.channels() == 1) && (points3d_ori.depth() == CV_16U || points3d_ori.depth() == CV_32F || points3d_ori.depth() == CV_64F)));
+    }
+
+    virtual void calcRadiusAnd3d(const Mat& /*points3d_ori*/, Mat& /*points3d*/, Mat& /*radius*/) const CV_OVERRIDE
+    { }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Given a set of 3d points in a depth image, compute the normals at each point
+ * using the SRI method described in
+ * ``Fast and Accurate Computation of Surface Normals from Range Images``
+ * by H. Badino, D. Huber, Y. Park and T. Kanade
+ */
+template<typename T>
+class SRI : public RgbdNormalsImpl<T>
+{
+public:
+    typedef Matx<T, 3, 3> Mat33T;
+    typedef Vec<T, 9> Vec9T;
+    typedef Vec<T, 3> Vec3T;
+
+    SRI(int _rows, int _cols, int _windowSize, const Mat& _K) :
+        RgbdNormalsImpl<T>(_rows, _cols, _windowSize, _K, RgbdNormals::RGBD_NORMALS_METHOD_SRI),
+        phi_step_(0),
+        theta_step_(0)
+    { }
+
+    /** Compute cached data
+     */
+    virtual void cache() const CV_OVERRIDE
+    {
+        if (!this->cacheIsDirty)
+            return;
+
+        Mat_<T> cos_theta, sin_theta, cos_phi, sin_phi;
+        computeThetaPhi<T>(this->rows, this->cols, this->K, cos_theta, sin_theta, cos_phi, sin_phi);
+
+        // Create the derivative kernels
+        getDerivKernels(kx_dx_, ky_dx_, 1, 0, this->windowSize, true, this->dtype);
+        getDerivKernels(kx_dy_, ky_dy_, 0, 1, this->windowSize, true, this->dtype);
+
+        // Get the mapping function for SRI
+        float min_theta = (float)std::asin(sin_theta(0, 0)), max_theta = (float)std::asin(sin_theta(0, this->cols - 1));
+        float min_phi = (float)std::asin(sin_phi(0, this->cols / 2 - 1)), max_phi = (float)std::asin(sin_phi(this->rows - 1, this->cols / 2 - 1));
+
+        std::vector<Point3f> points3d(this->cols * this->rows);
+        R_hat_.create(this->rows, this->cols);
+        phi_step_ = float(max_phi - min_phi) / (this->rows - 1);
+        theta_step_ = float(max_theta - min_theta) / (this->cols - 1);
+        for (int phi_int = 0, k = 0; phi_int < this->rows; ++phi_int)
+        {
+            float phi = min_phi + phi_int * phi_step_;
+            for (int theta_int = 0; theta_int < this->cols; ++theta_int, ++k)
+            {
+                float theta = min_theta + theta_int * theta_step_;
+                // Store the 3d point to project it later
+                points3d[k] = Point3f(std::sin(theta) * std::cos(phi), std::sin(phi), std::cos(theta) * std::cos(phi));
+
+                // Cache the rotation matrix and negate it
+                Mat_<T> mat =
+                    (Mat_ < T >(3, 3) << 0, 1, 0, 0, 0, 1, 1, 0, 0) *
+                    ((Mat_ < T >(3, 3) << std::cos(theta), -std::sin(theta), 0, std::sin(theta), std::cos(theta), 0, 0, 0, 1)) *
+                    ((Mat_ < T >(3, 3) << std::cos(phi), 0, -std::sin(phi), 0, 1, 0, std::sin(phi), 0, std::cos(phi)));
+                for (unsigned char i = 0; i < 3; ++i)
+                    mat(i, 1) = mat(i, 1) / std::cos(phi);
+                // The second part of the matrix is never explained in the paper ... but look at the wikipedia normal article
+                mat(0, 0) = mat(0, 0) - 2 * std::cos(phi) * std::sin(theta);
+                mat(1, 0) = mat(1, 0) - 2 * std::sin(phi);
+                mat(2, 0) = mat(2, 0) - 2 * std::cos(phi) * std::cos(theta);
+
+                R_hat_(phi_int, theta_int) = Vec9T((T*)(mat.data));
+            }
+        }
+
+        map_.create(this->rows, this->cols);
+        projectPoints(points3d, Mat(3, 1, CV_32FC1, Scalar::all(0.0f)), Mat(3, 1, CV_32FC1, Scalar::all(0.0f)), this->K, Mat(), map_);
+        map_ = map_.reshape(2, this->rows);
+        convertMaps(map_, Mat(), xy_, fxy_, CV_16SC2);
+
+        //map for converting from Spherical coordinate space to Euclidean space
+        euclideanMap_.create(this->rows, this->cols);
+        Matx<T, 3, 3> km(this->K);
+        float invFx = (float)(1.0f / km(0, 0)), cx = (float)(km(0, 2));
+        double invFy = 1.0f / (km(1, 1)), cy = km(1, 2);
+        for (int i = 0; i < this->rows; i++)
+        {
+            float y = (float)((i - cy) * invFy);
+            for (int j = 0; j < this->cols; j++)
+            {
+                float x = (j - cx) * invFx;
+                float theta = std::atan(x);
+                float phi = std::asin(y / std::sqrt(x * x + y * y + 1.0f));
+
+                euclideanMap_(i, j) = Vec2f((theta - min_theta) / theta_step_, (phi - min_phi) / phi_step_);
+            }
+        }
+        //convert map to 2 maps in short format for increasing speed in remap function
+        convertMaps(euclideanMap_, Mat(), invxy_, invfxy_, CV_16SC2);
+
+        // Update the kernels: the steps are due to the fact that derivatives will be computed on a grid where
+        // the step is not 1. Only need to do it on one dimension as it computes derivatives in only one direction
+        kx_dx_ /= theta_step_;
+        ky_dy_ /= phi_step_;
+
+        this->cacheIsDirty = false;
+    }
+
+    /** Compute the normals
+     * @param r
+     * @return
+     */
+    virtual void compute(const Mat& in, Mat& normals_out) const CV_OVERRIDE
+    {
+        const Mat_<T>& r_non_interp = in;
+
+        // Interpolate the radial image to make derivatives meaningful
+        Mat_<T> r;
+        // higher quality remapping does not help here
+        remap(r_non_interp, r, xy_, fxy_, INTER_LINEAR);
+
+        // Compute the derivatives with respect to theta and phi
+        // TODO add bilateral filtering (as done in kinfu)
+        Mat_<T> r_theta, r_phi;
+        sepFilter2D(r, r_theta, r.depth(), kx_dx_, ky_dx_);
+        //current OpenCV version sometimes corrupts r matrix after second call of sepFilter2D
+        //it depends on resolution, be careful
+        sepFilter2D(r, r_phi, r.depth(), kx_dy_, ky_dy_);
+
+        // Fill the result matrix
+        Mat_<Vec3T> normals(this->rows, this->cols);
+
+        const T* r_theta_ptr = r_theta[0], * r_theta_ptr_end = r_theta_ptr + this->rows * this->cols;
+        const T* r_phi_ptr = r_phi[0];
+        const Mat33T* R = reinterpret_cast<const Mat33T*>(R_hat_[0]);
+        const T* r_ptr = r[0];
+        Vec3T* normal = normals[0];
+        for (; r_theta_ptr != r_theta_ptr_end; ++r_theta_ptr, ++r_phi_ptr, ++R, ++r_ptr, ++normal)
+        {
+            if (cvIsNaN(*r_ptr))
+            {
+                (*normal)[0] = *r_ptr;
+                (*normal)[1] = *r_ptr;
+                (*normal)[2] = *r_ptr;
+            }
+            else
+            {
+                T r_theta_over_r = (*r_theta_ptr) / (*r_ptr);
+                T r_phi_over_r = (*r_phi_ptr) / (*r_ptr);
+                // R(1,1) is 0
+                signNormal((*R)(0, 0) + (*R)(0, 1) * r_theta_over_r + (*R)(0, 2) * r_phi_over_r,
+                    (*R)(1, 0) + (*R)(1, 2) * r_phi_over_r,
+                    (*R)(2, 0) + (*R)(2, 1) * r_theta_over_r + (*R)(2, 2) * r_phi_over_r, *normal);
+            }
+        }
+
+        remap(normals, normals_out, invxy_, invfxy_, INTER_LINEAR);
+        normal = normals_out.ptr<Vec3T>(0);
+        Vec3T* normal_end = normal + this->rows * this->cols;
+        for (; normal != normal_end; ++normal)
+            signNormal((*normal)[0], (*normal)[1], (*normal)[2], *normal);
+    }
+
+    virtual void assertOnBadArg(const Mat& points3d_ori) const CV_OVERRIDE
+    {
+        CV_Assert(((points3d_ori.channels() == 3) && (points3d_ori.depth() == CV_32F || points3d_ori.depth() == CV_64F)));
+    }
+
+    // Cached data
+    /** Stores R */
+    mutable Mat_<Vec9T> R_hat_;
+    mutable float phi_step_, theta_step_;
+
+    /** Derivative kernels */
+    mutable Mat kx_dx_, ky_dx_, kx_dy_, ky_dy_;
+    /** mapping function to get an SRI image */
+    mutable Mat_<Vec2f> map_;
+    mutable Mat xy_, fxy_;
+
+    mutable Mat_<Vec2f> euclideanMap_;
+    mutable Mat invxy_, invfxy_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+Ptr<RgbdNormals> RgbdNormals::create(int rows, int cols, int depth, InputArray K, int windowSize, RgbdNormalsMethod method)
+{
+    CV_Assert(rows > 0 && cols > 0 && (depth == CV_32F || depth == CV_64F));
+    CV_Assert(windowSize == 1 || windowSize == 3 || windowSize == 5 || windowSize == 7);
+    CV_Assert(K.cols() == 3 && K.rows() == 3 && (K.depth() == CV_32F || K.depth() == CV_64F));
+
+    Mat mK = K.getMat();
+    CV_Assert(method == RGBD_NORMALS_METHOD_FALS || method == RGBD_NORMALS_METHOD_LINEMOD || method == RGBD_NORMALS_METHOD_SRI);
+    Ptr<RgbdNormals> ptr;
+    switch (method)
+    {
+    case (RGBD_NORMALS_METHOD_FALS):
+    {
+        if (depth == CV_32F)
+            ptr = makePtr<FALS<float> >(rows, cols, windowSize, mK);
+        else
+            ptr = makePtr<FALS<double>>(rows, cols, windowSize, mK);
+        break;
+    }
+    case (RGBD_NORMALS_METHOD_LINEMOD):
+    {
+        if (depth == CV_32F)
+            ptr = makePtr<LINEMOD<float> >(rows, cols, windowSize, mK);
+        else
+            ptr = makePtr<LINEMOD<double>>(rows, cols, windowSize, mK);
+        break;
+    }
+    case RGBD_NORMALS_METHOD_SRI:
+    {
+        if (depth == CV_32F)
+            ptr = makePtr<SRI<float> >(rows, cols, windowSize, mK);
+        else
+            ptr = makePtr<SRI<double>>(rows, cols, windowSize, mK);
+        break;
+    }
+    }
+
+    return ptr;
+}
+
+} // namespace cv
diff --git a/modules/3d/src/rgbd/odometry.cpp b/modules/3d/src/rgbd/odometry.cpp
new file mode 100644
index 0000000000..37ba96f21e
--- /dev/null
+++ b/modules/3d/src/rgbd/odometry.cpp
@@ -0,0 +1,2476 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// FastICPOdometry is based on kinfu-remake code
+// Copyright(c) 2012, Anatoly Baksheev
+// All rights reserved.
+
+#include "../precomp.hpp"
+#include "utils.hpp"
+#include "fast_icp.hpp"
+
+namespace cv
+{
+
+enum
+{
+    RGBD_ODOMETRY = 1,
+    ICP_ODOMETRY = 2,
+    MERGED_ODOMETRY = RGBD_ODOMETRY + ICP_ODOMETRY
+};
+
+static const int sobelSize = 3;
+static const double sobelScale = 1./8.;
+static const int normalWinSize = 5;
+static const RgbdNormals::RgbdNormalsMethod normalMethod = RgbdNormals::RGBD_NORMALS_METHOD_FALS;
+
+static
+void buildPyramidCameraMatrix(const Matx33f& cameraMatrix, int levels, std::vector<Matx33f>& pyramidCameraMatrix)
+{
+    pyramidCameraMatrix.resize(levels);
+
+    for(int i = 0; i < levels; i++)
+    {
+        Matx33f levelCameraMatrix = (i == 0) ? cameraMatrix : 0.5f * pyramidCameraMatrix[i-1];
+        levelCameraMatrix(2, 2) = 1.0;
+        pyramidCameraMatrix[i] = levelCameraMatrix;
+    }
+}
+
+static inline
+void checkImage(InputArray image)
+{
+    if(image.empty())
+        CV_Error(Error::StsBadSize, "Image is empty.");
+    if(image.type() != CV_8UC1)
+        CV_Error(Error::StsBadSize, "Image type has to be CV_8UC1.");
+}
+
+static inline
+void checkDepth(InputArray depth, const Size& imageSize)
+{
+    if(depth.empty())
+        CV_Error(Error::StsBadSize, "Depth is empty.");
+    if(depth.size() != imageSize)
+        CV_Error(Error::StsBadSize, "Depth has to have the size equal to the image size.");
+    if(depth.type() != CV_32FC1)
+        CV_Error(Error::StsBadSize, "Depth type has to be CV_32FC1.");
+}
+
+static inline
+void checkMask(InputArray mask, const Size& imageSize)
+{
+    if(!mask.empty())
+    {
+        if(mask.size() != imageSize)
+            CV_Error(Error::StsBadSize, "Mask has to have the size equal to the image size.");
+        if(mask.type() != CV_8UC1)
+            CV_Error(Error::StsBadSize, "Mask type has to be CV_8UC1.");
+    }
+}
+
+static inline
+void checkNormals(InputArray normals, const Size& depthSize)
+{
+    if(normals.size() != depthSize)
+        CV_Error(Error::StsBadSize, "Normals has to have the size equal to the depth size.");
+    if(normals.type() != CV_32FC3)
+        CV_Error(Error::StsBadSize, "Normals type has to be CV_32FC3.");
+}
+
+static
+void preparePyramidImage(InputArray image, InputOutputArrayOfArrays pyramidImage, size_t levelCount)
+{
+    if(!pyramidImage.empty())
+    {
+        size_t nLevels = pyramidImage.size(-1).width;
+        if(nLevels < levelCount)
+            CV_Error(Error::StsBadSize, "Levels count of pyramidImage has to be equal or less than size of iterCounts.");
+
+        CV_Assert(pyramidImage.size(0) == image.size());
+        for(size_t i = 0; i < nLevels; i++)
+            CV_Assert(pyramidImage.type((int)i) == image.type());
+    }
+    else
+        buildPyramid(image, pyramidImage, (int)levelCount - 1);
+}
+
+static
+void preparePyramidDepth(InputArray depth, InputOutputArrayOfArrays pyramidDepth, size_t levelCount)
+{
+    if(!pyramidDepth.empty())
+    {
+        size_t nLevels = pyramidDepth.size(-1).width;
+        if(nLevels < levelCount)
+            CV_Error(Error::StsBadSize, "Levels count of pyramidDepth has to be equal or less than size of iterCounts.");
+
+        CV_Assert(pyramidDepth.size(0) == depth.size());
+        for(size_t i = 0; i < nLevels; i++)
+            CV_Assert(pyramidDepth.type((int)i) == depth.type());
+    }
+    else
+        buildPyramid(depth, pyramidDepth, (int)levelCount - 1);
+}
+
+template<typename TMat>
+static TMat getTMat(InputArray, int = -1);
+
+template<>
+Mat getTMat<Mat>(InputArray a, int i)
+{
+    return a.getMat(i);
+}
+
+template<>
+UMat getTMat<UMat>(InputArray a, int i)
+{
+    return a.getUMat(i);
+}
+
+template<typename TMat>
+static TMat& getTMatRef(InputOutputArray, int = -1);
+
+template<>
+Mat& getTMatRef<Mat>(InputOutputArray a, int i)
+{
+    return a.getMatRef(i);
+}
+
+//TODO: uncomment it when it's in use
+//template<>
+//UMat& getTMatRef<UMat>(InputOutputArray a, int i)
+//{
+//    return a.getUMatRef(i);
+//}
+
+template<typename TMat>
+static
+void preparePyramidMask(InputArray mask, InputArrayOfArrays pyramidDepth, float minDepth, float maxDepth,
+                        InputArrayOfArrays pyramidNormal,
+                        InputOutputArrayOfArrays pyramidMask)
+{
+    minDepth = std::max(0.f, minDepth);
+
+    int nLevels = pyramidDepth.size(-1).width;
+    if(!pyramidMask.empty())
+    {
+        if(pyramidMask.size(-1).width != nLevels)
+            CV_Error(Error::StsBadSize, "Levels count of pyramidMask has to be equal to size of pyramidDepth.");
+
+        for(int i = 0; i < pyramidMask.size(-1).width; i++)
+        {
+            CV_Assert(pyramidMask.size(i) == pyramidDepth.size(i));
+            CV_Assert(pyramidMask.type(i) == CV_8UC1);
+        }
+    }
+    else
+    {
+        TMat validMask;
+        if(mask.empty())
+            validMask = TMat(pyramidDepth.size(0), CV_8UC1, Scalar(255));
+        else
+            validMask = getTMat<TMat>(mask, -1).clone();
+
+        buildPyramid(validMask, pyramidMask, nLevels - 1);
+
+        for(int i = 0; i < pyramidMask.size(-1).width; i++)
+        {
+            TMat levelDepth = getTMat<TMat>(pyramidDepth, i).clone();
+            patchNaNs(levelDepth, 0);
+
+            TMat& levelMask = getTMatRef<TMat>(pyramidMask, i);
+            TMat gtmin, ltmax, tmpMask;
+            cv::compare(levelDepth, Scalar(minDepth), gtmin, CMP_GT);
+            cv::compare(levelDepth, Scalar(maxDepth), ltmax, CMP_LT);
+            cv::bitwise_and(gtmin, ltmax, tmpMask);
+            cv::bitwise_and(levelMask, tmpMask, levelMask);
+
+            if(!pyramidNormal.empty())
+            {
+                CV_Assert(pyramidNormal.type(i) == CV_32FC3);
+                CV_Assert(pyramidNormal.size(i) == pyramidDepth.size(i));
+                TMat levelNormal = getTMat<TMat>(pyramidNormal, i).clone();
+
+                TMat validNormalMask;
+                // NaN check
+                cv::compare(levelNormal, levelNormal, validNormalMask, CMP_EQ);
+                CV_Assert(validNormalMask.type() == CV_8UC3);
+
+                std::vector<TMat> channelMasks;
+                split(validNormalMask, channelMasks);
+                TMat tmpChMask;
+                cv::bitwise_and(channelMasks[0], channelMasks[1], tmpChMask);
+                cv::bitwise_and(channelMasks[2], tmpChMask, validNormalMask);
+                cv::bitwise_and(levelMask, validNormalMask, levelMask);
+            }
+        }
+    }
+}
+
+template<typename TMat>
+static
+void preparePyramidCloud(InputArrayOfArrays pyramidDepth, const Matx33f& cameraMatrix, InputOutputArrayOfArrays pyramidCloud)
+{
+    size_t depthSize = pyramidDepth.size(-1).width;
+    size_t cloudSize = pyramidCloud.size(-1).width;
+    if(!pyramidCloud.empty())
+    {
+        if(cloudSize != depthSize)
+            CV_Error(Error::StsBadSize, "Incorrect size of pyramidCloud.");
+
+        for(size_t i = 0; i < depthSize; i++)
+        {
+            CV_Assert(pyramidCloud.size((int)i) == pyramidDepth.size((int)i));
+            CV_Assert(pyramidCloud.type((int)i) == CV_32FC3);
+        }
+    }
+    else
+    {
+        std::vector<Matx33f> pyramidCameraMatrix;
+        buildPyramidCameraMatrix(cameraMatrix, (int)depthSize, pyramidCameraMatrix);
+
+        pyramidCloud.create((int)depthSize, 1, CV_32FC3, -1);
+        for(size_t i = 0; i < depthSize; i++)
+        {
+            TMat cloud;
+            depthTo3d(getTMat<TMat>(pyramidDepth, (int)i), pyramidCameraMatrix[i], cloud);
+            getTMatRef<TMat>(pyramidCloud, (int)i) = cloud;
+        }
+    }
+}
+
+template<typename TMat>
+static
+void preparePyramidSobel(InputArrayOfArrays pyramidImage, int dx, int dy, InputOutputArrayOfArrays pyramidSobel)
+{
+    size_t imgLevels = pyramidImage.size(-1).width;
+    size_t sobelLvls = pyramidSobel.size(-1).width;
+    if(!pyramidSobel.empty())
+    {
+        if(sobelLvls != imgLevels)
+            CV_Error(Error::StsBadSize, "Incorrect size of pyramidSobel.");
+
+        for(size_t i = 0; i < sobelLvls; i++)
+        {
+            CV_Assert(pyramidSobel.size((int)i) == pyramidImage.size((int)i));
+            CV_Assert(pyramidSobel.type((int)i) == CV_16SC1);
+        }
+    }
+    else
+    {
+        pyramidSobel.create((int)imgLevels, 1, CV_16SC1, -1);
+        for(size_t i = 0; i < imgLevels; i++)
+        {
+            Sobel(getTMat<TMat>(pyramidImage, (int)i), getTMatRef<TMat>(pyramidSobel, (int)i), CV_16S, dx, dy, sobelSize);
+        }
+    }
+}
+
+static
+void randomSubsetOfMask(InputOutputArray _mask, float part)
+{
+    const int minPointsCount = 1000; // minimum point count (we can process them fast)
+    const int nonzeros = countNonZero(_mask);
+    const int needCount = std::max(minPointsCount, int(_mask.total() * part));
+    if(needCount < nonzeros)
+    {
+        RNG rng;
+        Mat mask = _mask.getMat();
+        Mat subset(mask.size(), CV_8UC1, Scalar(0));
+
+        int subsetSize = 0;
+        while(subsetSize < needCount)
+        {
+            int y = rng(mask.rows);
+            int x = rng(mask.cols);
+            if(mask.at<uchar>(y,x))
+            {
+                subset.at<uchar>(y,x) = 255;
+                mask.at<uchar>(y,x) = 0;
+                subsetSize++;
+            }
+        }
+        _mask.assign(subset);
+    }
+}
+
+static
+void preparePyramidTexturedMask(InputArrayOfArrays pyramid_dI_dx, InputArrayOfArrays pyramid_dI_dy,
+                                InputArray minGradMagnitudes, InputArrayOfArrays pyramidMask, double maxPointsPart,
+                                InputOutputArrayOfArrays pyramidTexturedMask)
+{
+    size_t didxLevels = pyramid_dI_dx.size(-1).width;
+    size_t texLevels = pyramidTexturedMask.size(-1).width;
+    if(!pyramidTexturedMask.empty())
+    {
+        if(texLevels != didxLevels)
+            CV_Error(Error::StsBadSize, "Incorrect size of pyramidTexturedMask.");
+
+        for(size_t i = 0; i < texLevels; i++)
+        {
+            CV_Assert(pyramidTexturedMask.size((int)i) == pyramid_dI_dx.size((int)i));
+            CV_Assert(pyramidTexturedMask.type((int)i) == CV_8UC1);
+        }
+    }
+    else
+    {
+        CV_Assert(minGradMagnitudes.type() == CV_32F);
+        Mat_<float> mgMags = minGradMagnitudes.getMat();
+
+        const float sobelScale2_inv = 1.f / (float)(sobelScale * sobelScale);
+        pyramidTexturedMask.create((int)didxLevels, 1, CV_8UC1, -1);
+        for(size_t i = 0; i < didxLevels; i++)
+        {
+            const float minScaledGradMagnitude2 = mgMags((int)i) * mgMags((int)i) * sobelScale2_inv;
+            const Mat& dIdx = pyramid_dI_dx.getMat((int)i);
+            const Mat& dIdy = pyramid_dI_dy.getMat((int)i);
+
+            Mat texturedMask(dIdx.size(), CV_8UC1, Scalar(0));
+
+            for(int y = 0; y < dIdx.rows; y++)
+            {
+                const short *dIdx_row = dIdx.ptr<short>(y);
+                const short *dIdy_row = dIdy.ptr<short>(y);
+                uchar *texturedMask_row = texturedMask.ptr<uchar>(y);
+                for(int x = 0; x < dIdx.cols; x++)
+                {
+                    float magnitude2 = static_cast<float>(dIdx_row[x] * dIdx_row[x] + dIdy_row[x] * dIdy_row[x]);
+                    if(magnitude2 >= minScaledGradMagnitude2)
+                        texturedMask_row[x] = 255;
+                }
+            }
+            Mat texMask = texturedMask & pyramidMask.getMat((int)i);
+
+            randomSubsetOfMask(texMask, (float)maxPointsPart);
+            pyramidTexturedMask.getMatRef((int)i) = texMask;
+        }
+    }
+}
+
+static
+void preparePyramidNormals(InputArray normals, InputArrayOfArrays pyramidDepth, InputOutputArrayOfArrays pyramidNormals)
+{
+    size_t depthLevels = pyramidDepth.size(-1).width;
+    size_t normalsLevels = pyramidNormals.size(-1).width;
+    if(!pyramidNormals.empty())
+    {
+        if(normalsLevels != depthLevels)
+            CV_Error(Error::StsBadSize, "Incorrect size of pyramidNormals.");
+
+        for(size_t i = 0; i < normalsLevels; i++)
+        {
+            CV_Assert(pyramidNormals.size((int)i) == pyramidDepth.size((int)i));
+            CV_Assert(pyramidNormals.type((int)i) == CV_32FC3);
+        }
+    }
+    else
+    {
+        buildPyramid(normals, pyramidNormals, (int)depthLevels - 1);
+        // renormalize normals
+        for(size_t i = 1; i < depthLevels; i++)
+        {
+            Mat& currNormals = pyramidNormals.getMatRef((int)i);
+            for(int y = 0; y < currNormals.rows; y++)
+            {
+                Point3f* normals_row = currNormals.ptr<Point3f>(y);
+                for(int x = 0; x < currNormals.cols; x++)
+                {
+                    double nrm = norm(normals_row[x]);
+                    normals_row[x] *= 1./nrm;
+                }
+            }
+        }
+    }
+}
+
+static
+void preparePyramidNormalsMask(InputArray pyramidNormals, InputArray pyramidMask, double maxPointsPart,
+                               InputOutputArrayOfArrays /*std::vector<Mat>&*/ pyramidNormalsMask)
+{
+    size_t maskLevels = pyramidMask.size(-1).width;
+    size_t norMaskLevels = pyramidNormalsMask.size(-1).width;
+    if(!pyramidNormalsMask.empty())
+    {
+        if(norMaskLevels != maskLevels)
+            CV_Error(Error::StsBadSize, "Incorrect size of pyramidNormalsMask.");
+
+        for(size_t i = 0; i < norMaskLevels; i++)
+        {
+            CV_Assert(pyramidNormalsMask.size((int)i) == pyramidMask.size((int)i));
+            CV_Assert(pyramidNormalsMask.type((int)i) == pyramidMask.type((int)i));
+        }
+    }
+    else
+    {
+        pyramidNormalsMask.create((int)maskLevels, 1, CV_8U, -1);
+        for(size_t i = 0; i < maskLevels; i++)
+        {
+            Mat& normalsMask = pyramidNormalsMask.getMatRef((int)i);
+            normalsMask = pyramidMask.getMat((int)i).clone();
+
+            const Mat normals = pyramidNormals.getMat((int)i);
+            for(int y = 0; y < normalsMask.rows; y++)
+            {
+                const Vec3f *normals_row = normals.ptr<Vec3f>(y);
+                uchar *normalsMask_row = normalsMask.ptr<uchar>(y);
+                for(int x = 0; x < normalsMask.cols; x++)
+                {
+                    Vec3f n = normals_row[x];
+                    if(cvIsNaN(n[0]))
+                    {
+                        CV_DbgAssert(cvIsNaN(n[1]) && cvIsNaN(n[2]));
+                        normalsMask_row[x] = 0;
+                    }
+                }
+            }
+            randomSubsetOfMask(normalsMask, (float)maxPointsPart);
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////
+
+static
+void computeProjectiveMatrix(const Mat& ksi, Mat& Rt)
+{
+    CV_Assert(ksi.size() == Size(1,6) && ksi.type() == CV_64FC1);
+
+    const double* ksi_ptr = ksi.ptr<const double>();
+    // 0.5 multiplication is here because (dual) quaternions keep half an angle/twist inside
+    Matx44d matdq = (DualQuatd(0, ksi_ptr[0], ksi_ptr[1], ksi_ptr[2],
+                               0, ksi_ptr[3], ksi_ptr[4], ksi_ptr[5])*0.5).exp().toMat(QUAT_ASSUME_UNIT);
+
+    matdq.copyTo(Rt);
+}
+
+static
+void computeCorresps(const Matx33f& _K, const Matx33f& _K_inv, const Mat& Rt,
+                     const Mat& depth0, const Mat& validMask0,
+                     const Mat& depth1, const Mat& selectMask1, float maxDepthDiff,
+                     Mat& _corresps)
+{
+    CV_Assert(Rt.type() == CV_64FC1);
+
+    Mat corresps(depth1.size(), CV_16SC2, Scalar::all(-1));
+
+    Matx33d K(_K), K_inv(_K_inv);
+    Rect r(0, 0, depth1.cols, depth1.rows);
+    Mat Kt = Rt(Rect(3,0,1,3)).clone();
+    Kt = K * Kt;
+    const double * Kt_ptr = Kt.ptr<const double>();
+
+    AutoBuffer<float> buf(3 * (depth1.cols + depth1.rows));
+    float *KRK_inv0_u1 = buf.data();
+    float *KRK_inv1_v1_plus_KRK_inv2 = KRK_inv0_u1 + depth1.cols;
+    float *KRK_inv3_u1 = KRK_inv1_v1_plus_KRK_inv2 + depth1.rows;
+    float *KRK_inv4_v1_plus_KRK_inv5 = KRK_inv3_u1 + depth1.cols;
+    float *KRK_inv6_u1 = KRK_inv4_v1_plus_KRK_inv5 + depth1.rows;
+    float *KRK_inv7_v1_plus_KRK_inv8 = KRK_inv6_u1 + depth1.cols;
+    {
+        Mat R = Rt(Rect(0,0,3,3)).clone();
+
+        Mat KRK_inv = K * R * K_inv;
+        const double * KRK_inv_ptr = KRK_inv.ptr<const double>();
+        for(int u1 = 0; u1 < depth1.cols; u1++)
+        {
+            KRK_inv0_u1[u1] = (float)(KRK_inv_ptr[0] * u1);
+            KRK_inv3_u1[u1] = (float)(KRK_inv_ptr[3] * u1);
+            KRK_inv6_u1[u1] = (float)(KRK_inv_ptr[6] * u1);
+        }
+
+        for(int v1 = 0; v1 < depth1.rows; v1++)
+        {
+            KRK_inv1_v1_plus_KRK_inv2[v1] = (float)(KRK_inv_ptr[1] * v1 + KRK_inv_ptr[2]);
+            KRK_inv4_v1_plus_KRK_inv5[v1] = (float)(KRK_inv_ptr[4] * v1 + KRK_inv_ptr[5]);
+            KRK_inv7_v1_plus_KRK_inv8[v1] = (float)(KRK_inv_ptr[7] * v1 + KRK_inv_ptr[8]);
+        }
+    }
+
+    int correspCount = 0;
+    for(int v1 = 0; v1 < depth1.rows; v1++)
+    {
+        const float *depth1_row = depth1.ptr<float>(v1);
+        const uchar *mask1_row = selectMask1.ptr<uchar>(v1);
+        for(int u1 = 0; u1 < depth1.cols; u1++)
+        {
+            float d1 = depth1_row[u1];
+            if(mask1_row[u1])
+            {
+                CV_DbgAssert(!cvIsNaN(d1));
+                float transformed_d1 = static_cast<float>(d1 * (KRK_inv6_u1[u1] + KRK_inv7_v1_plus_KRK_inv8[v1]) +
+                                                          Kt_ptr[2]);
+                if(transformed_d1 > 0)
+                {
+                    float transformed_d1_inv = 1.f / transformed_d1;
+                    int u0 = cvRound(transformed_d1_inv * (d1 * (KRK_inv0_u1[u1] + KRK_inv1_v1_plus_KRK_inv2[v1]) +
+                                                           Kt_ptr[0]));
+                    int v0 = cvRound(transformed_d1_inv * (d1 * (KRK_inv3_u1[u1] + KRK_inv4_v1_plus_KRK_inv5[v1]) +
+                                                           Kt_ptr[1]));
+
+                    if(r.contains(Point(u0,v0)))
+                    {
+                        float d0 = depth0.at<float>(v0,u0);
+                        if(validMask0.at<uchar>(v0, u0) && std::abs(transformed_d1 - d0) <= maxDepthDiff)
+                        {
+                            CV_DbgAssert(!cvIsNaN(d0));
+                            Vec2s& c = corresps.at<Vec2s>(v0,u0);
+                            if(c[0] != -1)
+                            {
+                                int exist_u1 = c[0], exist_v1 = c[1];
+
+                                float exist_d1 = (float)(depth1.at<float>(exist_v1,exist_u1) *
+                                    (KRK_inv6_u1[exist_u1] + KRK_inv7_v1_plus_KRK_inv8[exist_v1]) + Kt_ptr[2]);
+
+                                if(transformed_d1 > exist_d1)
+                                    continue;
+                            }
+                            else
+                                correspCount++;
+
+                            c = Vec2s((short)u1, (short)v1);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    _corresps.create(correspCount, 1, CV_32SC4);
+    Vec4i * corresps_ptr = _corresps.ptr<Vec4i>();
+    for(int v0 = 0, i = 0; v0 < corresps.rows; v0++)
+    {
+        const Vec2s* corresps_row = corresps.ptr<Vec2s>(v0);
+        for(int u0 = 0; u0 < corresps.cols; u0++)
+        {
+            const Vec2s& c = corresps_row[u0];
+            if(c[0] != -1)
+                corresps_ptr[i++] = Vec4i(u0,v0,c[0],c[1]);
+        }
+    }
+}
+
+static inline
+void calcRgbdEquationCoeffs(double* C, double dIdx, double dIdy, const Point3f& p3d, double fx, double fy)
+{
+    double invz  = 1. / p3d.z,
+           v0 = dIdx * fx * invz,
+           v1 = dIdy * fy * invz,
+           v2 = -(v0 * p3d.x + v1 * p3d.y) * invz;
+
+    C[0] = -p3d.z * v1 + p3d.y * v2;
+    C[1] =  p3d.z * v0 - p3d.x * v2;
+    C[2] = -p3d.y * v0 + p3d.x * v1;
+    C[3] = v0;
+    C[4] = v1;
+    C[5] = v2;
+}
+
+static inline
+void calcRgbdEquationCoeffsRotation(double* C, double dIdx, double dIdy, const Point3f& p3d, double fx, double fy)
+{
+    double invz  = 1. / p3d.z,
+           v0 = dIdx * fx * invz,
+           v1 = dIdy * fy * invz,
+           v2 = -(v0 * p3d.x + v1 * p3d.y) * invz;
+    C[0] = -p3d.z * v1 + p3d.y * v2;
+    C[1] =  p3d.z * v0 - p3d.x * v2;
+    C[2] = -p3d.y * v0 + p3d.x * v1;
+}
+
+static inline
+void calcRgbdEquationCoeffsTranslation(double* C, double dIdx, double dIdy, const Point3f& p3d, double fx, double fy)
+{
+    double invz  = 1. / p3d.z,
+           v0 = dIdx * fx * invz,
+           v1 = dIdy * fy * invz,
+           v2 = -(v0 * p3d.x + v1 * p3d.y) * invz;
+    C[0] = v0;
+    C[1] = v1;
+    C[2] = v2;
+}
+
+typedef
+void (*CalcRgbdEquationCoeffsPtr)(double*, double, double, const Point3f&, double, double);
+
+static inline
+void calcICPEquationCoeffs(double* C, const Point3f& p0, const Vec3f& n1)
+{
+    C[0] = -p0.z * n1[1] + p0.y * n1[2];
+    C[1] =  p0.z * n1[0] - p0.x * n1[2];
+    C[2] = -p0.y * n1[0] + p0.x * n1[1];
+    C[3] = n1[0];
+    C[4] = n1[1];
+    C[5] = n1[2];
+}
+
+static inline
+void calcICPEquationCoeffsRotation(double* C, const Point3f& p0, const Vec3f& n1)
+{
+    C[0] = -p0.z * n1[1] + p0.y * n1[2];
+    C[1] =  p0.z * n1[0] - p0.x * n1[2];
+    C[2] = -p0.y * n1[0] + p0.x * n1[1];
+}
+
+static inline
+void calcICPEquationCoeffsTranslation(double* C, const Point3f& /*p0*/, const Vec3f& n1)
+{
+    C[0] = n1[0];
+    C[1] = n1[1];
+    C[2] = n1[2];
+}
+
+typedef
+void (*CalcICPEquationCoeffsPtr)(double*, const Point3f&, const Vec3f&);
+
+static
+void calcRgbdLsmMatrices(const Mat& image0, const Mat& cloud0, const Mat& Rt,
+               const Mat& image1, const Mat& dI_dx1, const Mat& dI_dy1,
+               const Mat& corresps, double fx, double fy, double sobelScaleIn,
+               Mat& AtA, Mat& AtB, CalcRgbdEquationCoeffsPtr func, int transformDim)
+{
+    AtA = Mat(transformDim, transformDim, CV_64FC1, Scalar(0));
+    AtB = Mat(transformDim, 1, CV_64FC1, Scalar(0));
+    double* AtB_ptr = AtB.ptr<double>();
+
+    const int correspsCount = corresps.rows;
+
+    CV_Assert(Rt.type() == CV_64FC1);
+    const double * Rt_ptr = Rt.ptr<const double>();
+
+    AutoBuffer<float> diffs(correspsCount);
+    float* diffs_ptr = diffs.data();
+
+    const Vec4i* corresps_ptr = corresps.ptr<Vec4i>();
+
+    double sigma = 0;
+    for(int correspIndex = 0; correspIndex < corresps.rows; correspIndex++)
+    {
+         const Vec4i& c = corresps_ptr[correspIndex];
+         int u0 = c[0], v0 = c[1];
+         int u1 = c[2], v1 = c[3];
+
+         diffs_ptr[correspIndex] = static_cast<float>(static_cast<int>(image0.at<uchar>(v0,u0)) -
+                                                      static_cast<int>(image1.at<uchar>(v1,u1)));
+         sigma += diffs_ptr[correspIndex] * diffs_ptr[correspIndex];
+    }
+    sigma = std::sqrt(sigma/correspsCount);
+
+    std::vector<double> A_buf(transformDim);
+    double* A_ptr = &A_buf[0];
+
+    for(int correspIndex = 0; correspIndex < corresps.rows; correspIndex++)
+    {
+         const Vec4i& c = corresps_ptr[correspIndex];
+         int u0 = c[0], v0 = c[1];
+         int u1 = c[2], v1 = c[3];
+
+         double w = sigma + std::abs(diffs_ptr[correspIndex]);
+         w = w > DBL_EPSILON ? 1./w : 1.;
+
+         double w_sobelScale = w * sobelScaleIn;
+
+         const Point3f& p0 = cloud0.at<Point3f>(v0,u0);
+         Point3f tp0;
+         tp0.x = (float)(p0.x * Rt_ptr[0] + p0.y * Rt_ptr[1] + p0.z * Rt_ptr[2] + Rt_ptr[3]);
+         tp0.y = (float)(p0.x * Rt_ptr[4] + p0.y * Rt_ptr[5] + p0.z * Rt_ptr[6] + Rt_ptr[7]);
+         tp0.z = (float)(p0.x * Rt_ptr[8] + p0.y * Rt_ptr[9] + p0.z * Rt_ptr[10] + Rt_ptr[11]);
+
+         func(A_ptr,
+              w_sobelScale * dI_dx1.at<short int>(v1,u1),
+              w_sobelScale * dI_dy1.at<short int>(v1,u1),
+              tp0, fx, fy);
+
+        for(int y = 0; y < transformDim; y++)
+        {
+            double* AtA_ptr = AtA.ptr<double>(y);
+            for(int x = y; x < transformDim; x++)
+                AtA_ptr[x] += A_ptr[y] * A_ptr[x];
+
+            AtB_ptr[y] += A_ptr[y] * w * diffs_ptr[correspIndex];
+        }
+    }
+
+    for(int y = 0; y < transformDim; y++)
+        for(int x = y+1; x < transformDim; x++)
+            AtA.at<double>(x,y) = AtA.at<double>(y,x);
+}
+
+static
+void calcICPLsmMatrices(const Mat& cloud0, const Mat& Rt,
+                        const Mat& cloud1, const Mat& normals1,
+                        const Mat& corresps,
+                        Mat& AtA, Mat& AtB, CalcICPEquationCoeffsPtr func, int transformDim)
+{
+    AtA = Mat(transformDim, transformDim, CV_64FC1, Scalar(0));
+    AtB = Mat(transformDim, 1, CV_64FC1, Scalar(0));
+    double* AtB_ptr = AtB.ptr<double>();
+
+    const int correspsCount = corresps.rows;
+
+    CV_Assert(Rt.type() == CV_64FC1);
+    const double * Rt_ptr = Rt.ptr<const double>();
+
+    AutoBuffer<float> diffs(correspsCount);
+    float * diffs_ptr = diffs.data();
+
+    AutoBuffer<Point3f> transformedPoints0(correspsCount);
+    Point3f * tps0_ptr = transformedPoints0.data();
+
+    const Vec4i* corresps_ptr = corresps.ptr<Vec4i>();
+
+    double sigma = 0;
+    for(int correspIndex = 0; correspIndex < corresps.rows; correspIndex++)
+    {
+        const Vec4i& c = corresps_ptr[correspIndex];
+        int u0 = c[0], v0 = c[1];
+        int u1 = c[2], v1 = c[3];
+
+        const Point3f& p0 = cloud0.at<Point3f>(v0,u0);
+        Point3f tp0;
+        tp0.x = (float)(p0.x * Rt_ptr[0] + p0.y * Rt_ptr[1] + p0.z * Rt_ptr[2] + Rt_ptr[3]);
+        tp0.y = (float)(p0.x * Rt_ptr[4] + p0.y * Rt_ptr[5] + p0.z * Rt_ptr[6] + Rt_ptr[7]);
+        tp0.z = (float)(p0.x * Rt_ptr[8] + p0.y * Rt_ptr[9] + p0.z * Rt_ptr[10] + Rt_ptr[11]);
+
+        Vec3f n1 = normals1.at<Vec3f>(v1, u1);
+        Point3f v = cloud1.at<Point3f>(v1,u1) - tp0;
+
+        tps0_ptr[correspIndex] = tp0;
+        diffs_ptr[correspIndex] = n1[0] * v.x + n1[1] * v.y + n1[2] * v.z;
+        sigma += diffs_ptr[correspIndex] * diffs_ptr[correspIndex];
+    }
+
+    sigma = std::sqrt(sigma/correspsCount);
+
+    std::vector<double> A_buf(transformDim);
+    double* A_ptr = &A_buf[0];
+    for(int correspIndex = 0; correspIndex < corresps.rows; correspIndex++)
+    {
+        const Vec4i& c = corresps_ptr[correspIndex];
+        int u1 = c[2], v1 = c[3];
+
+        double w = sigma + std::abs(diffs_ptr[correspIndex]);
+        w = w > DBL_EPSILON ? 1./w : 1.;
+
+        func(A_ptr, tps0_ptr[correspIndex], normals1.at<Vec3f>(v1, u1) * w);
+
+        for(int y = 0; y < transformDim; y++)
+        {
+            double* AtA_ptr = AtA.ptr<double>(y);
+            for(int x = y; x < transformDim; x++)
+                AtA_ptr[x] += A_ptr[y] * A_ptr[x];
+
+            AtB_ptr[y] += A_ptr[y] * w * diffs_ptr[correspIndex];
+        }
+    }
+
+    for(int y = 0; y < transformDim; y++)
+        for(int x = y+1; x < transformDim; x++)
+            AtA.at<double>(x,y) = AtA.at<double>(y,x);
+}
+
+static
+bool solveSystem(const Mat& AtA, const Mat& AtB, double detThreshold, Mat& x)
+{
+    double det = determinant(AtA);
+
+    if(fabs (det) < detThreshold || cvIsNaN(det) || cvIsInf(det))
+        return false;
+
+    solve(AtA, AtB, x, DECOMP_CHOLESKY);
+
+    return true;
+}
+
+static
+bool testDeltaTransformation(const Mat& deltaRt, double maxTranslation, double maxRotation)
+{
+    double translation = norm(deltaRt(Rect(3, 0, 1, 3)));
+
+    Mat rvec;
+    Rodrigues(deltaRt(Rect(0,0,3,3)), rvec);
+
+    double rotation = norm(rvec) * 180. / CV_PI;
+
+    return translation <= maxTranslation && rotation <= maxRotation;
+}
+
+static
+bool RGBDICPOdometryImpl(OutputArray _Rt, const Mat& initRt,
+                         const Ptr<OdometryFrame>& srcFrame,
+                         const Ptr<OdometryFrame>& dstFrame,
+                         const Matx33f& cameraMatrix,
+                         float maxDepthDiff, const std::vector<int>& iterCounts,
+                         double maxTranslation, double maxRotation,
+                         int method, int transfromType)
+{
+    int transformDim = -1;
+    CalcRgbdEquationCoeffsPtr rgbdEquationFuncPtr = 0;
+    CalcICPEquationCoeffsPtr icpEquationFuncPtr = 0;
+    switch(transfromType)
+    {
+    case Odometry::RIGID_BODY_MOTION:
+        transformDim = 6;
+        rgbdEquationFuncPtr = calcRgbdEquationCoeffs;
+        icpEquationFuncPtr = calcICPEquationCoeffs;
+        break;
+    case Odometry::ROTATION:
+        transformDim = 3;
+        rgbdEquationFuncPtr = calcRgbdEquationCoeffsRotation;
+        icpEquationFuncPtr = calcICPEquationCoeffsRotation;
+        break;
+    case Odometry::TRANSLATION:
+        transformDim = 3;
+        rgbdEquationFuncPtr = calcRgbdEquationCoeffsTranslation;
+        icpEquationFuncPtr = calcICPEquationCoeffsTranslation;
+        break;
+    default:
+        CV_Error(Error::StsBadArg, "Incorrect transformation type");
+    }
+
+    const int minOverdetermScale = 20;
+    const int minCorrespsCount = minOverdetermScale * transformDim;
+
+    std::vector<Matx33f> pyramidCameraMatrix;
+    buildPyramidCameraMatrix(cameraMatrix, (int)iterCounts.size(), pyramidCameraMatrix);
+
+    Mat resultRt = initRt.empty() ? Mat::eye(4,4,CV_64FC1) : initRt.clone();
+    Mat currRt, ksi;
+
+    bool isOk = false;
+    for(int level = (int)iterCounts.size() - 1; level >= 0; level--)
+    {
+        const Matx33f& levelCameraMatrix = pyramidCameraMatrix[level];
+        const Matx33f& levelCameraMatrix_inv = levelCameraMatrix.inv(DECOMP_SVD);
+        const Mat srcLevelDepth, dstLevelDepth;
+        srcFrame->getPyramidAt(srcLevelDepth, OdometryFrame::PYR_DEPTH, level);
+        dstFrame->getPyramidAt(dstLevelDepth, OdometryFrame::PYR_DEPTH, level);
+
+        const double fx = levelCameraMatrix(0, 0);
+        const double fy = levelCameraMatrix(1, 1);
+        const double determinantThreshold = 1e-6;
+
+        Mat AtA_rgbd, AtB_rgbd, AtA_icp, AtB_icp;
+        Mat corresps_rgbd, corresps_icp;
+
+        // Run transformation search on current level iteratively.
+        for(int iter = 0; iter < iterCounts[level]; iter ++)
+        {
+            Mat resultRt_inv = resultRt.inv(DECOMP_SVD);
+
+            const Mat pyramidMask;
+            srcFrame->getPyramidAt(pyramidMask, OdometryFrame::PYR_MASK, level);
+
+            if(method & RGBD_ODOMETRY)
+            {
+                const Mat pyramidTexturedMask;
+                dstFrame->getPyramidAt(pyramidTexturedMask, OdometryFrame::PYR_TEXMASK, level);
+                computeCorresps(levelCameraMatrix, levelCameraMatrix_inv, resultRt_inv,
+                                srcLevelDepth, pyramidMask, dstLevelDepth, pyramidTexturedMask,
+                                maxDepthDiff, corresps_rgbd);
+            }
+
+            if(method & ICP_ODOMETRY)
+            {
+                const Mat pyramidNormalsMask;
+                dstFrame->getPyramidAt(pyramidNormalsMask, OdometryFrame::PYR_NORMMASK, level);
+                computeCorresps(levelCameraMatrix, levelCameraMatrix_inv, resultRt_inv,
+                                srcLevelDepth, pyramidMask, dstLevelDepth, pyramidNormalsMask,
+                                maxDepthDiff, corresps_icp);
+            }
+
+            if(corresps_rgbd.rows < minCorrespsCount && corresps_icp.rows < minCorrespsCount)
+                break;
+
+            const Mat srcPyrCloud;
+            srcFrame->getPyramidAt(srcPyrCloud, OdometryFrame::PYR_CLOUD, level);
+
+
+            Mat AtA(transformDim, transformDim, CV_64FC1, Scalar(0)), AtB(transformDim, 1, CV_64FC1, Scalar(0));
+            if(corresps_rgbd.rows >= minCorrespsCount)
+            {
+                const Mat srcPyrImage, dstPyrImage, dstPyrIdx, dstPyrIdy;
+                srcFrame->getPyramidAt(srcPyrImage, OdometryFrame::PYR_IMAGE, level);
+                dstFrame->getPyramidAt(dstPyrImage, OdometryFrame::PYR_IMAGE, level);
+                dstFrame->getPyramidAt(dstPyrIdx, OdometryFrame::PYR_DIX, level);
+                dstFrame->getPyramidAt(dstPyrIdy, OdometryFrame::PYR_DIY, level);
+                calcRgbdLsmMatrices(srcPyrImage, srcPyrCloud, resultRt, dstPyrImage, dstPyrIdx, dstPyrIdy,
+                                    corresps_rgbd, fx, fy, sobelScale,
+                                    AtA_rgbd, AtB_rgbd, rgbdEquationFuncPtr, transformDim);
+
+                AtA += AtA_rgbd;
+                AtB += AtB_rgbd;
+            }
+            if(corresps_icp.rows >= minCorrespsCount)
+            {
+                const Mat dstPyrCloud, dstPyrNormals;
+                dstFrame->getPyramidAt(dstPyrCloud, OdometryFrame::PYR_CLOUD, level);
+                dstFrame->getPyramidAt(dstPyrNormals, OdometryFrame::PYR_NORM, level);
+                calcICPLsmMatrices(srcPyrCloud, resultRt, dstPyrCloud, dstPyrNormals,
+                                   corresps_icp, AtA_icp, AtB_icp, icpEquationFuncPtr, transformDim);
+                AtA += AtA_icp;
+                AtB += AtB_icp;
+            }
+
+            bool solutionExist = solveSystem(AtA, AtB, determinantThreshold, ksi);
+            if(!solutionExist)
+                break;
+
+            if(transfromType == Odometry::ROTATION)
+            {
+                Mat tmp(6, 1, CV_64FC1, Scalar(0));
+                ksi.copyTo(tmp.rowRange(0,3));
+                ksi = tmp;
+            }
+            else if(transfromType == Odometry::TRANSLATION)
+            {
+                Mat tmp(6, 1, CV_64FC1, Scalar(0));
+                ksi.copyTo(tmp.rowRange(3,6));
+                ksi = tmp;
+            }
+
+            computeProjectiveMatrix(ksi, currRt);
+            resultRt = currRt * resultRt;
+            isOk = true;
+        }
+    }
+    _Rt.create(resultRt.size(), resultRt.type());
+    Mat Rt = _Rt.getMat();
+    resultRt.copyTo(Rt);
+
+    if(isOk)
+    {
+        Mat deltaRt;
+        if(initRt.empty())
+            deltaRt = resultRt;
+        else
+            deltaRt = resultRt * initRt.inv(DECOMP_SVD);
+
+        isOk = testDeltaTransformation(deltaRt, maxTranslation, maxRotation);
+    }
+
+    return isOk;
+}
+
+//
+
+template<class ImageElemType>
+static void
+warpFrameImpl(InputArray _image, InputArray depth, InputArray _mask,
+              const Mat& Rt, const Mat& cameraMatrix, const Mat& distCoeff,
+              OutputArray _warpedImage, OutputArray warpedDepth, OutputArray warpedMask)
+{
+    CV_Assert(_image.size() == depth.size());
+
+    Mat cloud;
+    depthTo3d(depth, cameraMatrix, cloud);
+
+    std::vector<Point2f> points2d;
+    Mat transformedCloud;
+    perspectiveTransform(cloud, transformedCloud, Rt);
+    projectPoints(transformedCloud.reshape(3, 1), Mat::eye(3, 3, CV_64FC1), Mat::zeros(3, 1, CV_64FC1), cameraMatrix,
+                  distCoeff, points2d);
+
+    Mat image = _image.getMat();
+    Size sz = _image.size();
+    Mat mask = _mask.getMat();
+    _warpedImage.create(sz, image.type());
+    Mat warpedImage = _warpedImage.getMat();
+
+    Mat zBuffer(sz, CV_32FC1, std::numeric_limits<float>::max());
+    const Rect rect = Rect(Point(), sz);
+
+    for (int y = 0; y < sz.height; y++)
+    {
+        //const Point3f* cloud_row = cloud.ptr<Point3f>(y);
+        const Point3f* transformedCloud_row = transformedCloud.ptr<Point3f>(y);
+        const Point2f* points2d_row = &points2d[y*sz.width];
+        const ImageElemType* image_row = image.ptr<ImageElemType>(y);
+        const uchar* mask_row = mask.empty() ? 0 : mask.ptr<uchar>(y);
+        for (int x = 0; x < sz.width; x++)
+        {
+            const float transformed_z = transformedCloud_row[x].z;
+            const Point2i p2d = points2d_row[x];
+            if((!mask_row || mask_row[x]) && transformed_z > 0 && rect.contains(p2d) && /*!cvIsNaN(cloud_row[x].z) && */zBuffer.at<float>(p2d) > transformed_z)
+            {
+                warpedImage.at<ImageElemType>(p2d) = image_row[x];
+                zBuffer.at<float>(p2d) = transformed_z;
+            }
+        }
+    }
+
+    if(warpedMask.needed())
+        Mat(zBuffer != std::numeric_limits<float>::max()).copyTo(warpedMask);
+
+   if(warpedDepth.needed())
+    {
+        zBuffer.setTo(std::numeric_limits<float>::quiet_NaN(), zBuffer == std::numeric_limits<float>::max());
+        zBuffer.copyTo(warpedDepth);
+    }
+}
+
+void warpFrame(InputArray image, InputArray depth, InputArray mask,
+               InputArray Rt, InputArray cameraMatrix, InputArray distCoeff,
+               OutputArray warpedImage, OutputArray warpedDepth, OutputArray warpedMask)
+{
+    if (image.type() == CV_8UC1)
+        warpFrameImpl<uchar>(image, depth, mask, Rt.getMat(), cameraMatrix.getMat(), distCoeff.getMat(), warpedImage, warpedDepth, warpedMask);
+    else if (image.type() == CV_8UC3)
+        warpFrameImpl<Point3_<uchar> >(image, depth, mask, Rt.getMat(), cameraMatrix.getMat(), distCoeff.getMat(), warpedImage, warpedDepth, warpedMask);
+    else
+        CV_Error(Error::StsBadArg, "Image has to be type of CV_8UC1 or CV_8UC3");
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename TMat>
+struct OdometryFrameImpl : public OdometryFrame
+{
+    OdometryFrameImpl() : OdometryFrame(), image(), depth(), mask(), normals(), pyramids(N_PYRAMIDS) { }
+    OdometryFrameImpl(InputArray _image, InputArray _depth, InputArray _mask = noArray(), InputArray _normals = noArray(), int _ID = -1);
+    virtual ~OdometryFrameImpl() { }
+
+    virtual void setImage(InputArray  _image) CV_OVERRIDE
+    {
+        image = getTMat<TMat>(_image);
+    }
+    virtual void getImage(OutputArray _image) CV_OVERRIDE
+    {
+        _image.assign(image);
+    }
+    virtual void setDepth(InputArray  _depth) CV_OVERRIDE
+    {
+        depth = getTMat<TMat>(_depth);
+    }
+    virtual void getDepth(OutputArray _depth) CV_OVERRIDE
+    {
+        _depth.assign(depth);
+    }
+    virtual void setMask(InputArray  _mask) CV_OVERRIDE
+    {
+        mask = getTMat<TMat>(_mask);
+    }
+    virtual void getMask(OutputArray _mask) CV_OVERRIDE
+    {
+        _mask.assign(mask);
+    }
+    virtual void setNormals(InputArray  _normals) CV_OVERRIDE
+    {
+        normals = getTMat<TMat>(_normals);
+    }
+    virtual void getNormals(OutputArray _normals) CV_OVERRIDE
+    {
+        _normals.assign(normals);
+    }
+
+    virtual void setPyramidLevels(size_t _nLevels) CV_OVERRIDE
+    {
+        for (auto& p : pyramids)
+        {
+            p.resize(_nLevels, TMat());
+        }
+    }
+
+    virtual size_t getPyramidLevels(OdometryFramePyramidType what = PYR_IMAGE) CV_OVERRIDE
+    {
+        if (what < N_PYRAMIDS)
+            return pyramids[what].size();
+        else
+            return 0;
+    }
+
+    virtual void setPyramidAt(InputArray  _pyrImage, OdometryFramePyramidType pyrType, size_t level) CV_OVERRIDE
+    {
+        TMat img = getTMat<TMat>(_pyrImage);
+        pyramids[pyrType][level] = img;
+    }
+
+    virtual void getPyramidAt(OutputArray _pyrImage, OdometryFramePyramidType pyrType, size_t level) CV_OVERRIDE
+    {
+        TMat img;
+        img = pyramids[pyrType][level];
+        _pyrImage.assign(img);
+    }
+
+    TMat image;
+    TMat depth;
+    TMat mask;
+    TMat normals;
+
+    std::vector< std::vector<TMat> > pyramids;
+};
+
+template<>
+OdometryFrameImpl<Mat>::OdometryFrameImpl(InputArray _image, InputArray _depth, InputArray _mask, InputArray _normals, int _ID) :
+    OdometryFrame(),
+    image(_image.getMat()), depth(_depth.getMat()), mask(_mask.getMat()), normals(_normals.getMat()),
+    pyramids(N_PYRAMIDS)
+{
+    ID = _ID;
+}
+
+template<>
+OdometryFrameImpl<UMat>::OdometryFrameImpl(InputArray _image, InputArray _depth, InputArray _mask, InputArray _normals, int _ID) :
+    OdometryFrame(),
+    image(_image.getUMat()), depth(_depth.getUMat()), mask(_mask.getUMat()), normals(_normals.getUMat()),
+    pyramids(N_PYRAMIDS)
+{
+    ID = _ID;
+}
+
+
+Ptr<OdometryFrame> OdometryFrame::create(InputArray _image, InputArray _depth, InputArray _mask, InputArray _normals, int _ID)
+{
+    bool allEmpty = _image.empty() && _depth.empty() && _mask.empty() && _normals.empty();
+    bool useOcl = (_image.isUMat() || _image.empty()) &&
+                  (_depth.isUMat() || _depth.empty()) &&
+                  (_mask.isUMat()  || _mask.empty()) &&
+                  (_normals.isUMat() || _normals.empty());
+    if (useOcl && !allEmpty)
+        return makePtr<OdometryFrameImpl<UMat>>(_image, _depth, _mask, _normals, _ID);
+    else
+        return makePtr<OdometryFrameImpl<Mat>> (_image, _depth, _mask, _normals, _ID);
+}
+
+
+class OdometryImpl : public Odometry
+{
+public:
+    // initialized outside of a class
+    static const float defaultMaxTranslation;
+    static const float defaultMaxRotation;
+    static const float defaultMinGradientMagnitude;
+    static const std::vector<int> defaultIterCounts;
+    static const cv::Matx33f defaultCameraMatrix;
+    static const float defaultMinDepth;
+    static const float defaultMaxDepth;
+    static const float defaultMaxDepthDiff;
+    static const float defaultMaxPointsPart;
+
+    OdometryImpl(InputArray _cameraMatrix = noArray(),
+                 InputArray _iterCounts = noArray(),
+                 InputArray _minGradientMagnitudes = noArray(),
+                 Odometry::OdometryTransformType _transformType = Odometry::RIGID_BODY_MOTION)
+    {
+        setTransformType(_transformType);
+        setMaxTranslation(defaultMaxTranslation);
+        setMaxRotation(defaultMaxRotation);
+        setCameraMatrix(_cameraMatrix);
+        setIterationCounts(_iterCounts);
+        setMinGradientMagnitudes(_minGradientMagnitudes);
+    }
+
+    virtual ~OdometryImpl() { }
+
+    virtual bool compute(InputArray srcImage, InputArray srcDepth, InputArray srcMask, InputArray dstImage, InputArray dstDepth,
+                         InputArray dstMask, OutputArray Rt, InputArray initRt = noArray()) const CV_OVERRIDE
+    {
+        Ptr<OdometryFrame> srcFrame = makeOdometryFrame(srcImage, srcDepth, srcMask);
+        Ptr<OdometryFrame> dstFrame = makeOdometryFrame(dstImage, dstDepth, dstMask);
+
+        return compute(srcFrame, dstFrame, Rt, initRt);
+    }
+
+    virtual bool compute(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame,
+                         OutputArray Rt, InputArray initRt) const CV_OVERRIDE
+    {
+        Size srcSize = prepareFrameCache(srcFrame, OdometryFrame::CACHE_SRC);
+        Size dstSize = prepareFrameCache(dstFrame, OdometryFrame::CACHE_DST);
+
+        if (srcSize != dstSize)
+            CV_Error(Error::StsBadSize, "srcFrame and dstFrame have to have the same size (resolution).");
+
+        return computeImpl(srcFrame, dstFrame, Rt, initRt);
+    }
+
+    virtual Size prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType /*cacheType*/) const CV_OVERRIDE
+    {
+        if (!frame)
+            CV_Error(Error::StsBadArg, "Null frame pointer.");
+
+        return Size();
+    }
+
+    virtual void getCameraMatrix(OutputArray val) const CV_OVERRIDE
+    {
+        cameraMatrix.copyTo(val);
+    }
+    virtual void setCameraMatrix(InputArray val) CV_OVERRIDE
+    {
+        if (val.empty())
+        {
+            cameraMatrix = defaultCameraMatrix;
+        }
+        else
+        {
+            CV_Assert(val.rows() == 3 && val.cols() == 3 && val.channels() == 1);
+            CV_Assert(val.type() == CV_32F);
+            val.copyTo(cameraMatrix);
+        }
+    }
+
+    virtual Odometry::OdometryTransformType getTransformType() const CV_OVERRIDE
+    {
+        return transformType;
+    }
+    virtual void setTransformType(Odometry::OdometryTransformType val) CV_OVERRIDE
+    {
+        transformType = val;
+    }
+
+    virtual void getIterationCounts(OutputArray val) const CV_OVERRIDE
+    {
+        val.create((int)iterCounts.size(), 1, CV_32S);
+        Mat(iterCounts).copyTo(val);
+    }
+    virtual void setIterationCounts(InputArray val) CV_OVERRIDE
+    {
+        if (val.empty())
+        {
+            iterCounts = defaultIterCounts;
+        }
+        else
+        {
+            CV_Assert(val.type() == CV_32SC1);
+            CV_Assert(val.rows() == 1 || val.cols() == 1);
+            iterCounts.resize(val.rows() * val.cols());
+            val.copyTo(iterCounts);
+
+            minGradientMagnitudes.resize(iterCounts.size(), defaultMinGradientMagnitude);
+        }
+    }
+
+    virtual void getMinGradientMagnitudes(OutputArray val) const CV_OVERRIDE
+    {
+        val.create((int)minGradientMagnitudes.size(), 1, CV_32F);
+        Mat(minGradientMagnitudes).copyTo(val);
+    }
+    virtual void setMinGradientMagnitudes(InputArray val) CV_OVERRIDE
+    {
+        if (val.empty())
+        {
+            minGradientMagnitudes = std::vector<float>(iterCounts.size(), defaultMinGradientMagnitude);
+        }
+        else
+        {
+            CV_Assert(val.type() == CV_32FC1);
+            CV_Assert(val.rows() == 1 || val.cols() == 1);
+            size_t valSize = val.rows() * val.cols();
+            CV_Assert(valSize == iterCounts.size());
+            minGradientMagnitudes.resize(valSize);
+            val.copyTo(minGradientMagnitudes);
+        }
+    }
+
+    /** Get max allowed translation in meters.
+    Found delta transform is considered successful only if the translation is in given limits. */
+    virtual double getMaxTranslation() const CV_OVERRIDE
+    {
+        return maxTranslation;
+    }
+    /** Set max allowed translation in meters.
+    * Found delta transform is considered successful only if the translation is in given limits. */
+    virtual void setMaxTranslation(double val) CV_OVERRIDE
+    {
+        maxTranslation = val;
+    }
+    /** Get max allowed rotation in degrees.
+    * Found delta transform is considered successful only if the rotation is in given limits. */
+    virtual double getMaxRotation() const CV_OVERRIDE
+    {
+        return maxRotation;
+    }
+    /** Set max allowed rotation in degrees.
+    * Found delta transform is considered successful only if the rotation is in given limits. */
+    virtual void setMaxRotation(double val) CV_OVERRIDE
+    {
+        maxRotation = val;
+    }
+
+    virtual bool computeImpl(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame, OutputArray Rt,
+                             InputArray initRt) const = 0;
+
+    double maxTranslation, maxRotation;
+
+    Odometry::OdometryTransformType transformType;
+
+    Matx33f cameraMatrix;
+    std::vector<int> iterCounts;
+    std::vector<float> minGradientMagnitudes;
+};
+
+const float OdometryImpl::defaultMaxTranslation = 0.15f;
+const float OdometryImpl::defaultMaxRotation = 15.f;
+const float OdometryImpl::defaultMinGradientMagnitude = 10.f;
+const std::vector<int> OdometryImpl::defaultIterCounts = { 7, 7, 7, 10 };
+const cv::Matx33f OdometryImpl::defaultCameraMatrix = { /* fx, 0, cx*/ 0, 0, 0, /* 0, fy, cy */ 0, 0, 0, /**/ 0, 0, 0 };
+const float OdometryImpl::defaultMinDepth = 0.f;
+const float OdometryImpl::defaultMaxDepth = 4.f;
+const float OdometryImpl::defaultMaxDepthDiff = 0.07f;
+const float OdometryImpl::defaultMaxPointsPart = 0.07f;
+
+//
+
+// Public Odometry classes are pure abstract, therefore a sin of multiple inheritance should be forgiven
+class RgbdOdometryImpl : public OdometryImpl, public RgbdOdometry
+{
+public:
+    /** Constructor.
+     * @param cameraMatrix Camera matrix
+     * @param minDepth Pixels with depth less than minDepth will not be used (in meters)
+     * @param maxDepth Pixels with depth larger than maxDepth will not be used (in meters)
+     * @param maxDepthDiff Correspondences between pixels of two given frames will be filtered out
+     *                     if their depth difference is larger than maxDepthDiff (in meters)
+     * @param iterCounts Count of iterations on each pyramid level.
+     * @param minGradientMagnitudes For each pyramid level the pixels will be filtered out
+     *                              if they have gradient magnitude less than minGradientMagnitudes[level].
+     * @param maxPointsPart The method uses a random pixels subset of size frameWidth x frameHeight x pointsPart
+     * @param transformType Class of transformation
+     */
+    RgbdOdometryImpl(InputArray _cameraMatrix = noArray(),
+                     float _minDepth = defaultMinDepth,
+                     float _maxDepth = defaultMaxDepth,
+                     float _maxDepthDiff = defaultMaxDepthDiff,
+                     InputArray _iterCounts = noArray(),
+                     InputArray _minGradientMagnitudes = noArray(),
+                     float _maxPointsPart = defaultMaxPointsPart,
+                     Odometry::OdometryTransformType _transformType = Odometry::RIGID_BODY_MOTION) :
+        OdometryImpl(_cameraMatrix, _iterCounts, _minGradientMagnitudes, _transformType)
+    {
+        setMinDepth(_minDepth);
+        setMaxDepth(_maxDepth);
+        setMaxDepthDiff(_maxDepthDiff);
+        setMaxPointsPart(_maxPointsPart);
+    }
+
+    virtual ~RgbdOdometryImpl() { }
+
+    virtual Size prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const CV_OVERRIDE;
+
+    virtual Ptr<OdometryFrame> makeOdometryFrame(InputArray image, InputArray depth, InputArray mask) const CV_OVERRIDE;
+
+    virtual double getMinDepth() const CV_OVERRIDE
+    {
+        return minDepth;
+    }
+    virtual void setMinDepth(double val) CV_OVERRIDE
+    {
+        minDepth = val;
+    }
+    virtual double getMaxDepth() const CV_OVERRIDE
+    {
+        return maxDepth;
+    }
+    virtual void setMaxDepth(double val) CV_OVERRIDE
+    {
+        maxDepth = val;
+    }
+    virtual double getMaxDepthDiff() const CV_OVERRIDE
+    {
+        return maxDepthDiff;
+    }
+    virtual void setMaxDepthDiff(double val) CV_OVERRIDE
+    {
+        maxDepthDiff = val;
+    }
+    virtual double getMaxPointsPart() const CV_OVERRIDE
+    {
+        return maxPointsPart;
+    }
+    virtual void setMaxPointsPart(double val) CV_OVERRIDE
+    {
+        CV_Assert(val > 0. && val <= 1.);
+        maxPointsPart = val;
+    }
+
+    virtual void getIterationCounts(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getIterationCounts(val);
+    }
+    virtual void setIterationCounts(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setIterationCounts(val);
+    }
+    virtual void getMinGradientMagnitudes(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getMinGradientMagnitudes(val);
+    }
+    virtual void setMinGradientMagnitudes(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setMinGradientMagnitudes(val);
+    }
+    virtual void getCameraMatrix(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getCameraMatrix(val);
+    }
+    virtual void setCameraMatrix(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setCameraMatrix(val);
+    }
+    virtual Odometry::OdometryTransformType getTransformType() const CV_OVERRIDE
+    {
+        return OdometryImpl::getTransformType();
+    }
+    virtual void setTransformType(Odometry::OdometryTransformType val) CV_OVERRIDE
+    {
+        OdometryImpl::setTransformType(val);
+    }
+    virtual double getMaxTranslation() const CV_OVERRIDE
+    {
+        return OdometryImpl::getMaxTranslation();
+    }
+    virtual void setMaxTranslation(double val) CV_OVERRIDE
+    {
+        OdometryImpl::setMaxTranslation(val);
+    }
+    virtual double getMaxRotation() const CV_OVERRIDE
+    {
+        return OdometryImpl::getMaxRotation();
+    }
+    virtual void setMaxRotation(double val) CV_OVERRIDE
+    {
+        OdometryImpl::setMaxRotation(val);
+    }
+
+    virtual bool compute(InputArray srcImage, InputArray srcDepth, InputArray srcMask, InputArray dstImage, InputArray dstDepth,
+                         InputArray dstMask, OutputArray Rt, InputArray initRt = noArray()) const CV_OVERRIDE
+    {
+        return OdometryImpl::compute(srcImage, srcDepth, srcMask, dstImage, dstDepth, dstMask, Rt, initRt);
+    }
+
+    virtual bool compute(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame,
+                         OutputArray Rt, InputArray initRt) const CV_OVERRIDE
+    {
+        return OdometryImpl::compute(srcFrame, dstFrame, Rt, initRt);
+    }
+
+protected:
+
+    virtual bool computeImpl(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame, OutputArray Rt,
+                             InputArray initRt) const CV_OVERRIDE;
+
+    // Some params have commented desired type. It's due to AlgorithmInfo::addParams does not support it now.
+    /*float*/
+    double minDepth, maxDepth, maxDepthDiff;
+
+    double maxPointsPart;
+};
+
+Ptr<RgbdOdometry> RgbdOdometry::create(InputArray _cameraMatrix, float _minDepth, float _maxDepth,
+                                       float _maxDepthDiff, InputArray _iterCounts,
+                                       InputArray _minGradientMagnitudes, float _maxPointsPart,
+                                       Odometry::OdometryTransformType _transformType)
+{
+    return makePtr<RgbdOdometryImpl>(_cameraMatrix, _minDepth, _maxDepth, _maxDepthDiff, _iterCounts, _minGradientMagnitudes, _maxPointsPart, _transformType);
+}
+
+Ptr<OdometryFrame> RgbdOdometryImpl::makeOdometryFrame(InputArray _image, InputArray _depth, InputArray _mask) const
+{
+    // Can get rid of getMat() calls as soon as this Odometry algorithm supports UMats
+    return OdometryFrame::create(_image.getMat(), _depth.getMat(), _mask.getMat());
+}
+
+
+Size RgbdOdometryImpl::prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const
+{
+    OdometryImpl::prepareFrameCache(frame, cacheType);
+
+    // Can be transformed into template argument in the future
+    // when this algorithm supports OCL UMats too
+    typedef Mat TMat;
+
+    if (frame.dynamicCast<OdometryFrameImpl<UMat>>())
+        CV_Error(cv::Error::Code::StsBadArg, "RgbdOdometry does not support UMats yet");
+
+    TMat image;
+    frame->getImage(image);
+    if(image.empty())
+    {
+        if (frame->getPyramidLevels(OdometryFrame::PYR_IMAGE) > 0)
+        {
+            TMat pyr0;
+            frame->getPyramidAt(pyr0, OdometryFrame::PYR_IMAGE, 0);
+            frame->setImage(pyr0);
+        }
+        else
+            CV_Error(Error::StsBadSize, "Image or pyramidImage have to be set.");
+    }
+    checkImage(image);
+
+    TMat depth;
+    frame->getDepth(depth);
+    if(depth.empty())
+    {
+        if (frame->getPyramidLevels(OdometryFrame::PYR_DEPTH) > 0)
+        {
+            TMat pyr0;
+            frame->getPyramidAt(pyr0, OdometryFrame::PYR_DEPTH, 0);
+            frame->setDepth(pyr0);
+        }
+        else if(frame->getPyramidLevels(OdometryFrame::PYR_CLOUD) > 0)
+        {
+            TMat cloud;
+            frame->getPyramidAt(cloud, OdometryFrame::PYR_CLOUD, 0);
+            std::vector<TMat> xyz;
+            split(cloud, xyz);
+            frame->setDepth(xyz[2]);
+        }
+        else
+            CV_Error(Error::StsBadSize, "Depth or pyramidDepth or pyramidCloud have to be set.");
+    }
+    checkDepth(depth, image.size());
+
+    TMat mask;
+    frame->getMask(mask);
+    if (mask.empty() && frame->getPyramidLevels(OdometryFrame::PYR_MASK) > 0)
+    {
+        TMat pyr0;
+        frame->getPyramidAt(pyr0, OdometryFrame::PYR_MASK, 0);
+        frame->setMask(pyr0);
+    }
+    checkMask(mask, image.size());
+
+    auto tframe = frame.dynamicCast<OdometryFrameImpl<TMat>>();
+    preparePyramidImage(image, tframe->pyramids[OdometryFrame::PYR_IMAGE], iterCounts.size());
+
+    preparePyramidDepth(depth, tframe->pyramids[OdometryFrame::PYR_DEPTH], iterCounts.size());
+
+    preparePyramidMask<TMat>(mask, tframe->pyramids[OdometryFrame::PYR_DEPTH], (float)minDepth, (float)maxDepth,
+                             tframe->pyramids[OdometryFrame::PYR_NORM], tframe->pyramids[OdometryFrame::PYR_MASK]);
+
+    if(cacheType & OdometryFrame::CACHE_SRC)
+        preparePyramidCloud<TMat>(tframe->pyramids[OdometryFrame::PYR_DEPTH], cameraMatrix, tframe->pyramids[OdometryFrame::PYR_CLOUD]);
+
+    if(cacheType & OdometryFrame::CACHE_DST)
+    {
+        preparePyramidSobel<TMat>(tframe->pyramids[OdometryFrame::PYR_IMAGE], 1, 0, tframe->pyramids[OdometryFrame::PYR_DIX]);
+        preparePyramidSobel<TMat>(tframe->pyramids[OdometryFrame::PYR_IMAGE], 0, 1, tframe->pyramids[OdometryFrame::PYR_DIY]);
+        preparePyramidTexturedMask(tframe->pyramids[OdometryFrame::PYR_DIX], tframe->pyramids[OdometryFrame::PYR_DIY], minGradientMagnitudes,
+                                   tframe->pyramids[OdometryFrame::PYR_MASK], maxPointsPart, tframe->pyramids[OdometryFrame::PYR_TEXMASK]);
+    }
+
+    return image.size();
+}
+
+
+bool RgbdOdometryImpl::computeImpl(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame, OutputArray Rt, InputArray initRt) const
+{
+    return RGBDICPOdometryImpl(Rt, initRt.getMat(), srcFrame, dstFrame, cameraMatrix, (float)maxDepthDiff, iterCounts, maxTranslation, maxRotation, RGBD_ODOMETRY, transformType);
+}
+
+//
+
+// Public Odometry classes are pure abstract, therefore a sin of multiple inheritance should be forgiven
+class ICPOdometryImpl : public OdometryImpl, public ICPOdometry
+{
+public:
+
+    /** Constructor.
+     * @param cameraMatrix Camera matrix
+     * @param minDepth Pixels with depth less than minDepth will not be used
+     * @param maxDepth Pixels with depth larger than maxDepth will not be used
+     * @param maxDepthDiff Correspondences between pixels of two given frames will be filtered out
+     *                     if their depth difference is larger than maxDepthDiff
+     * @param maxPointsPart The method uses a random pixels subset of size frameWidth x frameHeight x pointsPart
+     * @param iterCounts Count of iterations on each pyramid level.
+     * @param transformType Class of trasformation
+     */
+    ICPOdometryImpl(InputArray _cameraMatrix = noArray(),
+                    float _minDepth = defaultMinDepth,
+                    float _maxDepth = defaultMaxDepth,
+                    float _maxDepthDiff = defaultMaxDepthDiff,
+                    float _maxPointsPart = defaultMaxPointsPart,
+                    InputArray _iterCounts = noArray(),
+                    Odometry::OdometryTransformType _transformType = Odometry::RIGID_BODY_MOTION) :
+        OdometryImpl(_cameraMatrix, _iterCounts, noArray(), _transformType)
+    {
+        setMinDepth(_minDepth);
+        setMaxDepth(_maxDepth);
+        setMaxDepthDiff(_maxDepthDiff);
+        setMaxPointsPart(_maxPointsPart);
+    }
+
+    virtual Size prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const CV_OVERRIDE;
+
+    virtual Ptr<OdometryFrame> makeOdometryFrame(InputArray image, InputArray depth, InputArray mask) const CV_OVERRIDE;
+
+    virtual double getMinDepth() const CV_OVERRIDE
+    {
+        return minDepth;
+    }
+    virtual void setMinDepth(double val) CV_OVERRIDE
+    {
+        minDepth = val;
+    }
+    virtual double getMaxDepth() const CV_OVERRIDE
+    {
+        return maxDepth;
+    }
+    virtual void setMaxDepth(double val) CV_OVERRIDE
+    {
+        maxDepth = val;
+    }
+    virtual double getMaxDepthDiff() const CV_OVERRIDE
+    {
+        return maxDepthDiff;
+    }
+    virtual void setMaxDepthDiff(double val) CV_OVERRIDE
+    {
+        maxDepthDiff = val;
+    }
+    virtual double getMaxPointsPart() const CV_OVERRIDE
+    {
+        return maxPointsPart;
+    }
+    virtual void setMaxPointsPart(double val) CV_OVERRIDE
+    {
+        CV_Assert(val > 0. && val <= 1.);
+        maxPointsPart = val;
+    }
+
+    virtual void getIterationCounts(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getIterationCounts(val);
+    }
+    virtual void setIterationCounts(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setIterationCounts(val);
+    }
+    virtual void getMinGradientMagnitudes(OutputArray /*val*/) const CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, "This Odometry class does not use minGradientMagnitudes");
+    }
+    virtual void setMinGradientMagnitudes(InputArray /*val*/) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, "This Odometry class does not use minGradientMagnitudes");
+    }
+    virtual void getCameraMatrix(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getCameraMatrix(val);
+    }
+    virtual void setCameraMatrix(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setCameraMatrix(val);
+    }
+    virtual Odometry::OdometryTransformType getTransformType() const CV_OVERRIDE
+    {
+        return OdometryImpl::getTransformType();
+    }
+    virtual void setTransformType(Odometry::OdometryTransformType val) CV_OVERRIDE
+    {
+        OdometryImpl::setTransformType(val);
+    }
+    virtual double getMaxTranslation() const CV_OVERRIDE
+    {
+        return OdometryImpl::getMaxTranslation();
+    }
+    virtual void setMaxTranslation(double val) CV_OVERRIDE
+    {
+        OdometryImpl::setMaxTranslation(val);
+    }
+    virtual double getMaxRotation() const CV_OVERRIDE
+    {
+        return OdometryImpl::getMaxRotation();
+    }
+    virtual void setMaxRotation(double val) CV_OVERRIDE
+    {
+        OdometryImpl::setMaxRotation(val);
+    }
+
+    virtual Ptr<RgbdNormals> getNormalsComputer() const CV_OVERRIDE
+    {
+        return normalsComputer;
+    }
+
+    virtual bool compute(InputArray srcImage, InputArray srcDepth, InputArray srcMask, InputArray dstImage, InputArray dstDepth,
+                         InputArray dstMask, OutputArray Rt, InputArray initRt = noArray()) const CV_OVERRIDE
+    {
+        return OdometryImpl::compute(srcImage, srcDepth, srcMask, dstImage, dstDepth, dstMask, Rt, initRt);
+    }
+
+    virtual bool compute(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame,
+                         OutputArray Rt, InputArray initRt) const CV_OVERRIDE
+    {
+        return OdometryImpl::compute(srcFrame, dstFrame, Rt, initRt);
+    }
+
+protected:
+
+    virtual bool computeImpl(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame, OutputArray Rt,
+                             InputArray initRt) const CV_OVERRIDE;
+
+    // Some params have commented desired type. It's due to AlgorithmInfo::addParams does not support it now.
+    /*float*/
+    double minDepth, maxDepth, maxDepthDiff;
+    /*float*/
+    double maxPointsPart;
+
+    mutable Ptr<RgbdNormals> normalsComputer;
+};
+
+Ptr<ICPOdometry> ICPOdometry::create(InputArray _cameraMatrix, float _minDepth, float _maxDepth,
+                                     float _maxDepthDiff, float _maxPointsPart, InputArray _iterCounts,
+                                     Odometry::OdometryTransformType _transformType)
+{
+    return makePtr<ICPOdometryImpl>(_cameraMatrix, _minDepth, _maxDepth, _maxDepthDiff, _maxPointsPart, _iterCounts, _transformType);
+}
+
+Ptr<OdometryFrame> ICPOdometryImpl::makeOdometryFrame(InputArray _image, InputArray _depth, InputArray _mask) const
+{
+    // Can get rid of getMat() calls as soon as this Odometry algorithm supports UMats
+    return OdometryFrame::create(_image.getMat(), _depth.getMat(), _mask.getMat());
+}
+
+
+Size ICPOdometryImpl::prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const
+{
+    OdometryImpl::prepareFrameCache(frame, cacheType);
+
+    // Can be transformed into template argument in the future
+    // when this algorithm supports OCL UMats too
+    typedef Mat TMat;
+
+    if (frame.dynamicCast<OdometryFrameImpl<UMat>>())
+        CV_Error(cv::Error::Code::StsBadArg, "ICPOdometry does not support UMats yet");
+
+    TMat depth;
+    frame->getDepth(depth);
+    if(depth.empty())
+    {
+        if (frame->getPyramidLevels(OdometryFrame::PYR_DEPTH))
+        {
+            TMat pyr0;
+            frame->getPyramidAt(pyr0, OdometryFrame::PYR_DEPTH, 0);
+            frame->setDepth(pyr0);
+        }
+        else if(frame->getPyramidLevels(OdometryFrame::PYR_CLOUD))
+        {
+            TMat cloud;
+            frame->getPyramidAt(cloud, OdometryFrame::PYR_CLOUD, 0);
+            std::vector<TMat> xyz;
+            split(cloud, xyz);
+            frame->setDepth(xyz[2]);
+        }
+        else
+            CV_Error(Error::StsBadSize, "Depth or pyramidDepth or pyramidCloud have to be set.");
+    }
+    checkDepth(depth, depth.size());
+
+    TMat mask;
+    frame->getMask(mask);
+    if (mask.empty() && frame->getPyramidLevels(OdometryFrame::PYR_MASK))
+    {
+        Mat m0;
+        frame->getPyramidAt(m0, OdometryFrame::PYR_MASK, 0);
+        frame->setMask(m0);
+    }
+    checkMask(mask, depth.size());
+
+    auto tframe = frame.dynamicCast<OdometryFrameImpl<TMat>>();
+    preparePyramidDepth(depth, tframe->pyramids[OdometryFrame::PYR_DEPTH], iterCounts.size());
+
+    preparePyramidCloud<TMat>(tframe->pyramids[OdometryFrame::PYR_DEPTH], cameraMatrix, tframe->pyramids[OdometryFrame::PYR_CLOUD]);
+
+    if(cacheType & OdometryFrame::CACHE_DST)
+    {
+        TMat normals;
+        frame->getNormals(normals);
+        if(normals.empty())
+        {
+            if (frame->getPyramidLevels(OdometryFrame::PYR_NORM))
+            {
+                TMat n0;
+                frame->getPyramidAt(n0, OdometryFrame::PYR_NORM, 0);
+                frame->setNormals(n0);
+            }
+            else
+            {
+                Matx33f K;
+                if (!normalsComputer.empty())
+                    normalsComputer->getK(K);
+                if(normalsComputer.empty() ||
+                   normalsComputer->getRows() != depth.rows ||
+                   normalsComputer->getCols() != depth.cols ||
+                   norm(K, cameraMatrix) > FLT_EPSILON)
+                    normalsComputer = RgbdNormals::create(depth.rows,
+                                                          depth.cols,
+                                                          depth.depth(),
+                                                          cameraMatrix,
+                                                          normalWinSize,
+                                                          normalMethod);
+                TMat c0;
+                frame->getPyramidAt(c0, OdometryFrame::PYR_CLOUD, 0);
+                normalsComputer->apply(c0, normals);
+                frame->setNormals(normals);
+            }
+        }
+        checkNormals(normals, depth.size());
+
+        preparePyramidNormals(normals, tframe->pyramids[OdometryFrame::PYR_DEPTH], tframe->pyramids[OdometryFrame::PYR_NORM]);
+
+        preparePyramidMask<TMat>(mask, tframe->pyramids[OdometryFrame::PYR_DEPTH], (float)minDepth, (float)maxDepth,
+                                 tframe->pyramids[OdometryFrame::PYR_NORM], tframe->pyramids[OdometryFrame::PYR_MASK]);
+
+        preparePyramidNormalsMask(tframe->pyramids[OdometryFrame::PYR_NORM], tframe->pyramids[OdometryFrame::PYR_MASK],
+                                  maxPointsPart, tframe->pyramids[OdometryFrame::PYR_NORMMASK]);
+    }
+    else
+        preparePyramidMask<TMat>(mask, tframe->pyramids[OdometryFrame::PYR_DEPTH], (float)minDepth, (float)maxDepth,
+                                 tframe->pyramids[OdometryFrame::PYR_NORM], tframe->pyramids[OdometryFrame::PYR_MASK]);
+
+    return depth.size();
+}
+
+
+bool ICPOdometryImpl::computeImpl(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame, OutputArray Rt, InputArray initRt) const
+{
+    return RGBDICPOdometryImpl(Rt, initRt.getMat(), srcFrame, dstFrame, cameraMatrix, (float)maxDepthDiff, iterCounts, maxTranslation, maxRotation, ICP_ODOMETRY, transformType);
+}
+
+//
+
+// Public Odometry classes are pure abstract, therefore a sin of multiple inheritance should be forgiven
+class RgbdICPOdometryImpl : public OdometryImpl, public RgbdICPOdometry
+{
+public:
+    /** Constructor.
+     * @param cameraMatrix Camera matrix
+     * @param minDepth Pixels with depth less than minDepth will not be used
+     * @param maxDepth Pixels with depth larger than maxDepth will not be used
+     * @param maxDepthDiff Correspondences between pixels of two given frames will be filtered out
+     *                     if their depth difference is larger than maxDepthDiff
+     * @param maxPointsPart The method uses a random pixels subset of size frameWidth x frameHeight x pointsPart
+     * @param iterCounts Count of iterations on each pyramid level.
+     * @param minGradientMagnitudes For each pyramid level the pixels will be filtered out
+     *                              if they have gradient magnitude less than minGradientMagnitudes[level].
+     * @param transformType Class of trasformation
+     */
+    RgbdICPOdometryImpl(InputArray _cameraMatrix = noArray(),
+                        float _minDepth = defaultMinDepth,
+                        float _maxDepth = defaultMaxDepth,
+                        float _maxDepthDiff = defaultMaxDepthDiff,
+                        float _maxPointsPart = defaultMaxPointsPart,
+                        InputArray _iterCounts = noArray(),
+                        InputArray _minGradientMagnitudes = noArray(),
+                        Odometry::OdometryTransformType _transformType = Odometry::RIGID_BODY_MOTION) :
+        OdometryImpl(_cameraMatrix, _iterCounts, _minGradientMagnitudes, _transformType)
+    {
+        setMinDepth(_minDepth);
+        setMaxDepth(_maxDepth);
+        setMaxDepthDiff(_maxDepthDiff);
+        setMaxPointsPart(_maxPointsPart);
+    }
+
+    virtual Size prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const CV_OVERRIDE;
+
+    virtual Ptr<OdometryFrame> makeOdometryFrame(InputArray image, InputArray depth, InputArray mask) const CV_OVERRIDE;
+
+    virtual double getMinDepth() const CV_OVERRIDE
+    {
+        return minDepth;
+    }
+    virtual void setMinDepth(double val) CV_OVERRIDE
+    {
+        minDepth = val;
+    }
+    virtual double getMaxDepth() const CV_OVERRIDE
+    {
+        return maxDepth;
+    }
+    virtual void setMaxDepth(double val) CV_OVERRIDE
+    {
+        maxDepth = val;
+    }
+    virtual double getMaxDepthDiff() const CV_OVERRIDE
+    {
+        return maxDepthDiff;
+    }
+    virtual void setMaxDepthDiff(double val) CV_OVERRIDE
+    {
+        maxDepthDiff = val;
+    }
+    virtual double getMaxPointsPart() const CV_OVERRIDE
+    {
+        return maxPointsPart;
+    }
+    virtual void setMaxPointsPart(double val) CV_OVERRIDE
+    {
+        CV_Assert(val > 0. && val <= 1.);
+        maxPointsPart = val;
+    }
+
+    virtual void getIterationCounts(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getIterationCounts(val);
+    }
+    virtual void setIterationCounts(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setIterationCounts(val);
+    }
+    virtual void getMinGradientMagnitudes(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getMinGradientMagnitudes(val);
+    }
+    virtual void setMinGradientMagnitudes(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setMinGradientMagnitudes(val);
+    }
+    virtual void getCameraMatrix(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getCameraMatrix(val);
+    }
+    virtual void setCameraMatrix(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setCameraMatrix(val);
+    }
+    virtual Odometry::OdometryTransformType getTransformType() const CV_OVERRIDE
+    {
+        return OdometryImpl::getTransformType();
+    }
+    virtual void setTransformType(Odometry::OdometryTransformType val) CV_OVERRIDE
+    {
+        OdometryImpl::setTransformType(val);
+    }
+    virtual double getMaxTranslation() const CV_OVERRIDE
+    {
+        return OdometryImpl::getMaxTranslation();
+    }
+    virtual void setMaxTranslation(double val) CV_OVERRIDE
+    {
+        OdometryImpl::setMaxTranslation(val);
+    }
+    virtual double getMaxRotation() const CV_OVERRIDE
+    {
+        return OdometryImpl::getMaxRotation();
+    }
+    virtual void setMaxRotation(double val) CV_OVERRIDE
+    {
+        OdometryImpl::setMaxRotation(val);
+    }
+
+    virtual Ptr<RgbdNormals> getNormalsComputer() const CV_OVERRIDE
+    {
+        return normalsComputer;
+    }
+
+    virtual bool compute(InputArray srcImage, InputArray srcDepth, InputArray srcMask, InputArray dstImage, InputArray dstDepth,
+                         InputArray dstMask, OutputArray Rt, InputArray initRt = noArray()) const CV_OVERRIDE
+    {
+        return OdometryImpl::compute(srcImage, srcDepth, srcMask, dstImage, dstDepth, dstMask, Rt, initRt);
+    }
+
+    virtual bool compute(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame,
+                         OutputArray Rt, InputArray initRt) const CV_OVERRIDE
+    {
+        return OdometryImpl::compute(srcFrame, dstFrame, Rt, initRt);
+    }
+
+protected:
+
+    virtual bool computeImpl(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame, OutputArray Rt,
+                             InputArray initRt) const CV_OVERRIDE;
+
+    // Some params have commented desired type. It's due to AlgorithmInfo::addParams does not support it now.
+    /*float*/
+    double minDepth, maxDepth, maxDepthDiff;
+    /*float*/
+    double maxPointsPart;
+
+    mutable Ptr<RgbdNormals> normalsComputer;
+};
+
+
+Ptr<RgbdICPOdometry> RgbdICPOdometry::create(InputArray _cameraMatrix, float _minDepth, float _maxDepth,
+                                             float _maxDepthDiff, float _maxPointsPart, InputArray _iterCounts,
+                                             InputArray _minGradientMagnitudes,
+                                             Odometry::OdometryTransformType _transformType)
+{
+    return makePtr<RgbdICPOdometryImpl>(_cameraMatrix, _minDepth, _maxDepth, _maxDepthDiff, _maxPointsPart, _iterCounts, _minGradientMagnitudes, _transformType);
+}
+
+Ptr<OdometryFrame> RgbdICPOdometryImpl::makeOdometryFrame(InputArray _image, InputArray _depth, InputArray _mask) const
+{
+    // Can get rid of getMat() calls as soon as this Odometry algorithm supports UMats
+    return OdometryFrame::create(_image.getMat(), _depth.getMat(), _mask.getMat());
+}
+
+
+Size RgbdICPOdometryImpl::prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const
+{
+    OdometryImpl::prepareFrameCache(frame, cacheType);
+
+    // Can be transformed into template argument in the future
+    // when this algorithm supports OCL UMats too
+    typedef Mat TMat;
+
+    if (frame.dynamicCast<OdometryFrameImpl<UMat>>())
+        CV_Error(cv::Error::Code::StsBadArg, "RgbdICPOdometry does not support UMats yet");
+
+    TMat image;
+    frame->getImage(image);
+    if(image.empty())
+    {
+        if (frame->getPyramidLevels(OdometryFrame::PYR_IMAGE))
+        {
+            TMat p0;
+            frame->getPyramidAt(p0, OdometryFrame::PYR_IMAGE, 0);
+            frame->setImage(p0);
+        }
+        else
+            CV_Error(Error::StsBadSize, "Image or pyramidImage have to be set.");
+    }
+    checkImage(image);
+
+    TMat depth;
+    frame->getDepth(depth);
+    if (depth.empty())
+    {
+        if (frame->getPyramidLevels(OdometryFrame::PYR_DEPTH))
+        {
+            TMat d0;
+            frame->getPyramidAt(d0, OdometryFrame::PYR_DEPTH, 0);
+            frame->setDepth(d0);
+        }
+        else if(frame->getPyramidLevels(OdometryFrame::PYR_CLOUD))
+        {
+            TMat cloud;
+            frame->getPyramidAt(cloud, OdometryFrame::PYR_CLOUD, 0);
+            std::vector<TMat> xyz;
+            split(cloud, xyz);
+            frame->setDepth(xyz[2]);
+        }
+        else
+            CV_Error(Error::StsBadSize, "Depth or pyramidDepth or pyramidCloud have to be set.");
+    }
+    checkDepth(depth, image.size());
+
+    TMat mask;
+    frame->getMask(mask);
+    if(mask.empty() && frame->getPyramidLevels(OdometryFrame::PYR_MASK))
+    {
+        TMat m0;
+        frame->getPyramidAt(m0, OdometryFrame::PYR_MASK, 0);
+        frame->setMask(m0);
+    }
+    checkMask(mask, image.size());
+
+    auto tframe = frame.dynamicCast<OdometryFrameImpl<TMat>>();
+    preparePyramidImage(image, tframe->pyramids[OdometryFrame::PYR_IMAGE], iterCounts.size());
+
+    preparePyramidDepth(depth, tframe->pyramids[OdometryFrame::PYR_DEPTH], iterCounts.size());
+
+    preparePyramidCloud<TMat>(tframe->pyramids[OdometryFrame::PYR_DEPTH], cameraMatrix, tframe->pyramids[OdometryFrame::PYR_CLOUD]);
+
+    if(cacheType & OdometryFrame::CACHE_DST)
+    {
+        TMat normals;
+        frame->getNormals(normals);
+        if (normals.empty())
+        {
+            if (frame->getPyramidLevels(OdometryFrame::PYR_NORM))
+            {
+                TMat n0;
+                frame->getPyramidAt(n0, OdometryFrame::PYR_NORM, 0);
+                frame->setNormals(n0);
+            }
+            else
+            {
+                Matx33f K;
+                if (!normalsComputer.empty())
+                    normalsComputer->getK(K);
+                if(normalsComputer.empty() ||
+                   normalsComputer->getRows() != depth.rows ||
+                   normalsComputer->getCols() != depth.cols ||
+                   norm(K, cameraMatrix) > FLT_EPSILON)
+                    normalsComputer = RgbdNormals::create(depth.rows,
+                                                          depth.cols,
+                                                          depth.depth(),
+                                                          cameraMatrix,
+                                                          normalWinSize,
+                                                          normalMethod);
+
+                TMat c0;
+                frame->getPyramidAt(c0, OdometryFrame::PYR_CLOUD, 0);
+                normalsComputer->apply(c0, normals);
+                frame->setNormals(normals);
+            }
+        }
+        checkNormals(normals, depth.size());
+
+        preparePyramidNormals(normals, tframe->pyramids[OdometryFrame::PYR_DEPTH], tframe->pyramids[OdometryFrame::PYR_NORM]);
+
+        preparePyramidMask<TMat>(mask, tframe->pyramids[OdometryFrame::PYR_DEPTH], (float)minDepth, (float)maxDepth,
+                                 tframe->pyramids[OdometryFrame::PYR_NORM], tframe->pyramids[OdometryFrame::PYR_MASK]);
+
+        preparePyramidSobel<TMat>(tframe->pyramids[OdometryFrame::PYR_IMAGE], 1, 0, tframe->pyramids[OdometryFrame::PYR_DIX]);
+        preparePyramidSobel<TMat>(tframe->pyramids[OdometryFrame::PYR_IMAGE], 0, 1, tframe->pyramids[OdometryFrame::PYR_DIY]);
+        preparePyramidTexturedMask(tframe->pyramids[OdometryFrame::PYR_DIX], tframe->pyramids[OdometryFrame::PYR_DIY],
+                                   minGradientMagnitudes, tframe->pyramids[OdometryFrame::PYR_MASK],
+                                   maxPointsPart, tframe->pyramids[OdometryFrame::PYR_TEXMASK]);
+
+        preparePyramidNormalsMask(tframe->pyramids[OdometryFrame::PYR_NORM], tframe->pyramids[OdometryFrame::PYR_MASK],
+                                  maxPointsPart, tframe->pyramids[OdometryFrame::PYR_NORMMASK]);
+    }
+    else
+        preparePyramidMask<TMat>(mask, tframe->pyramids[OdometryFrame::PYR_DEPTH], (float)minDepth, (float)maxDepth,
+                                 tframe->pyramids[OdometryFrame::PYR_NORM], tframe->pyramids[OdometryFrame::PYR_MASK]);
+
+    return image.size();
+}
+
+
+bool RgbdICPOdometryImpl::computeImpl(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame, OutputArray Rt, InputArray initRt) const
+{
+    return RGBDICPOdometryImpl(Rt, initRt.getMat(), srcFrame, dstFrame, cameraMatrix, (float)maxDepthDiff, iterCounts,  maxTranslation, maxRotation, MERGED_ODOMETRY, transformType);
+}
+
+//
+
+// Public Odometry classes are pure abstract, therefore a sin of multiple inheritance should be forgiven
+class FastICPOdometryImpl : public OdometryImpl, public FastICPOdometry
+{
+public:
+    /** Creates FastICPOdometry object
+     * @param cameraMatrix Camera matrix
+     * @param maxDistDiff Correspondences between pixels of two given frames will be filtered out
+     *                     if their depth difference is larger than maxDepthDiff
+     * @param angleThreshold Correspondence will be filtered out
+     *                     if an angle between their normals is bigger than threshold
+     * @param sigmaDepth Depth sigma in meters for bilateral smooth
+     * @param sigmaSpatial Spatial sigma in pixels for bilateral smooth
+     * @param kernelSize Kernel size in pixels for bilateral smooth
+     * @param iterCounts Count of iterations on each pyramid level
+     * @param depthFactor pre-scale per 1 meter for input values
+     * @param truncateThreshold Threshold for depth truncation in meters
+     *        All depth values beyond this threshold will be set to zero
+     */
+    FastICPOdometryImpl(InputArray _cameraMatrix = noArray(),
+                        float _maxDistDiff = 0.1f,
+                        float _angleThreshold = (float)(30. * CV_PI / 180.),
+                        float _sigmaDepth = 0.04f,
+                        float _sigmaSpatial = 4.5f,
+                        int _kernelSize = 7,
+                        InputArray _iterCounts = noArray(),
+                        float _depthFactor = 1.f,
+                        float _truncateThreshold = 0.f) :
+        OdometryImpl(_cameraMatrix, _iterCounts, noArray(), Odometry::RIGID_BODY_MOTION)
+    {
+        setMaxDistDiff(_maxDistDiff);
+        setAngleThreshold(_angleThreshold);
+        setSigmaDepth(_sigmaDepth);
+        setSigmaSpatial(_sigmaSpatial);
+        setKernelSize(_kernelSize);
+        setDepthFactor(_depthFactor);
+        setTruncateThreshold(_truncateThreshold);
+    }
+
+    virtual Size prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const CV_OVERRIDE;
+
+    virtual Ptr<OdometryFrame> makeOdometryFrame(InputArray image, InputArray depth, InputArray mask) const CV_OVERRIDE;
+
+    virtual double getMaxDistDiff() const CV_OVERRIDE
+    {
+        return maxDistDiff;
+    }
+    virtual void setMaxDistDiff(float val) CV_OVERRIDE
+    {
+        CV_Assert(val > 0);
+        maxDistDiff = val;
+    }
+    virtual float getAngleThreshold() const CV_OVERRIDE
+    {
+        return angleThreshold;
+    }
+    virtual void setAngleThreshold(float f) CV_OVERRIDE
+    {
+        CV_Assert(f > 0);
+        angleThreshold = f;
+    }
+    virtual float getSigmaDepth() const CV_OVERRIDE
+    {
+        return sigmaDepth;
+    }
+    virtual void setSigmaDepth(float f) CV_OVERRIDE
+    {
+        CV_Assert(f > 0);
+        sigmaDepth = f;
+    }
+    virtual float getSigmaSpatial() const CV_OVERRIDE
+    {
+        return sigmaSpatial;
+    }
+    virtual void setSigmaSpatial(float f) CV_OVERRIDE
+    {
+        CV_Assert(f > 0);
+        sigmaSpatial = f;
+    }
+    virtual int getKernelSize() const CV_OVERRIDE
+    {
+        return kernelSize;
+    }
+    virtual void setKernelSize(int f) CV_OVERRIDE
+    {
+        CV_Assert(f > 0);
+        kernelSize = f;
+    }
+    virtual float getDepthFactor() const CV_OVERRIDE
+    {
+        return depthFactor;
+    }
+    virtual void setDepthFactor(float _depthFactor) CV_OVERRIDE
+    {
+        depthFactor = _depthFactor;
+    }
+    virtual float getTruncateThreshold() const CV_OVERRIDE
+    {
+        return truncateThreshold;
+    }
+    virtual void setTruncateThreshold(float _truncateThreshold) CV_OVERRIDE
+    {
+        truncateThreshold = _truncateThreshold;
+    }
+
+    virtual void getIterationCounts(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getIterationCounts(val);
+    }
+    virtual void setIterationCounts(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setIterationCounts(val);
+    }
+    virtual void getMinGradientMagnitudes(OutputArray /*val*/) const CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, "This Odometry class does not use minGradientMagnitudes");
+    }
+    virtual void setMinGradientMagnitudes(InputArray /*val*/) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, "This Odometry class does not use minGradientMagnitudes");
+    }
+    virtual void getCameraMatrix(OutputArray val) const CV_OVERRIDE
+    {
+        OdometryImpl::getCameraMatrix(val);
+    }
+    virtual void setCameraMatrix(InputArray val) CV_OVERRIDE
+    {
+        OdometryImpl::setCameraMatrix(val);
+    }
+    virtual double getMaxTranslation() const CV_OVERRIDE
+    {
+        return OdometryImpl::getMaxTranslation();
+    }
+    virtual void setMaxTranslation(double val) CV_OVERRIDE
+    {
+        OdometryImpl::setMaxTranslation(val);
+    }
+    virtual double getMaxRotation() const CV_OVERRIDE
+    {
+        return OdometryImpl::getMaxRotation();
+    }
+    virtual void setMaxRotation(double val) CV_OVERRIDE
+    {
+        OdometryImpl::setMaxRotation(val);
+    }
+    virtual Odometry::OdometryTransformType getTransformType() const CV_OVERRIDE
+    {
+        return Odometry::RIGID_BODY_MOTION;
+    }
+    virtual void setTransformType(Odometry::OdometryTransformType val) CV_OVERRIDE
+    {
+        if (val != Odometry::RIGID_BODY_MOTION)
+            CV_Error(CV_StsBadArg, "Rigid Body Motion is the only accepted transformation type for this odometry method");
+    }
+
+    virtual bool compute(InputArray srcImage, InputArray srcDepth, InputArray srcMask, InputArray dstImage, InputArray dstDepth,
+                         InputArray dstMask, OutputArray Rt, InputArray initRt = noArray()) const CV_OVERRIDE
+    {
+        return OdometryImpl::compute(srcImage, srcDepth, srcMask, dstImage, dstDepth, dstMask, Rt, initRt);
+    }
+
+    virtual bool compute(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame,
+                         OutputArray Rt, InputArray initRt) const CV_OVERRIDE
+    {
+        return OdometryImpl::compute(srcFrame, dstFrame, Rt, initRt);
+    }
+
+protected:
+
+    virtual bool computeImpl(const Ptr<OdometryFrame>& srcFrame, const Ptr<OdometryFrame>& dstFrame, OutputArray Rt,
+                             InputArray initRt) const CV_OVERRIDE;
+
+    template<typename TMat>
+    Size prepareFrameCacheT(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const;
+
+    // Some params have commented desired type. It's due to AlgorithmInfo::addParams does not support it now.
+    float maxDistDiff;
+
+    float angleThreshold;
+
+    float sigmaDepth;
+
+    float sigmaSpatial;
+
+    int kernelSize;
+
+    float depthFactor;
+
+    float truncateThreshold;
+};
+
+using namespace cv::kinfu;
+
+Ptr<FastICPOdometry> FastICPOdometry::create(InputArray _cameraMatrix,
+                                             float _maxDistDiff,
+                                             float _angleThreshold,
+                                             float _sigmaDepth,
+                                             float _sigmaSpatial,
+                                             int _kernelSize,
+                                             InputArray _iterCounts,
+                                             float _depthFactor,
+                                             float _truncateThreshold)
+{
+    return makePtr<FastICPOdometryImpl>(_cameraMatrix, _maxDistDiff, _angleThreshold,
+                                        _sigmaDepth, _sigmaSpatial, _kernelSize, _iterCounts,
+                                        _depthFactor, _truncateThreshold);
+}
+
+Ptr<OdometryFrame> FastICPOdometryImpl::makeOdometryFrame(InputArray _image, InputArray _depth, InputArray _mask) const
+{
+    return OdometryFrame::create(_image, _depth, _mask);
+}
+
+template<typename TMat>
+Size FastICPOdometryImpl::prepareFrameCacheT(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const
+{
+    OdometryImpl::prepareFrameCache(frame, cacheType);
+
+    TMat depth;
+    frame->getDepth(depth);
+    if (depth.empty())
+    {
+        if (frame->getPyramidLevels(OdometryFrame::PYR_CLOUD))
+        {
+            if (frame->getPyramidLevels(OdometryFrame::PYR_NORM))
+            {
+                TMat points, normals;
+                frame->getPyramidAt(points, OdometryFrame::PYR_CLOUD, 0);
+                frame->getPyramidAt(normals, OdometryFrame::PYR_NORM, 0);
+                std::vector<TMat> pyrPoints, pyrNormals;
+                // in, in, out, out
+                size_t nLevels = iterCounts.size();
+                buildPyramidPointsNormals(points, normals, pyrPoints, pyrNormals, (int)nLevels);
+                for (size_t i = 1; i < nLevels; i++)
+                {
+                    frame->setPyramidAt(pyrPoints [i], OdometryFrame::PYR_CLOUD, i);
+                    frame->setPyramidAt(pyrNormals[i], OdometryFrame::PYR_NORM,  i);
+                }
+
+                return points.size();
+            }
+            else
+            {
+                TMat cloud;
+                frame->getPyramidAt(cloud, OdometryFrame::PYR_CLOUD, 0);
+                std::vector<TMat> xyz;
+                split(cloud, xyz);
+                frame->setDepth(xyz[2]);
+            }
+        }
+        else if (frame->getPyramidLevels(OdometryFrame::PYR_DEPTH))
+        {
+            TMat d0;
+            frame->getPyramidAt(d0, OdometryFrame::PYR_DEPTH, 0);
+            frame->setDepth(d0);
+        }
+        else
+            CV_Error(Error::StsBadSize, "Depth or pyramidDepth or pyramidCloud have to be set.");
+    }
+    frame->getDepth(depth);
+    checkDepth(depth, depth.size());
+
+    // mask isn't used by FastICP
+    auto tframe = frame.dynamicCast<OdometryFrameImpl<TMat>>();
+    makeFrameFromDepth(depth, tframe->pyramids[OdometryFrame::PYR_CLOUD], tframe->pyramids[OdometryFrame::PYR_NORM], cameraMatrix, (int)iterCounts.size(),
+                       depthFactor, sigmaDepth, sigmaSpatial, kernelSize, truncateThreshold);
+
+    return depth.size();
+}
+
+Size FastICPOdometryImpl::prepareFrameCache(Ptr<OdometryFrame> frame, OdometryFrame::OdometryFrameCacheType cacheType) const
+{
+    auto oclFrame = frame.dynamicCast<OdometryFrameImpl<UMat>>();
+    auto cpuFrame = frame.dynamicCast<OdometryFrameImpl<Mat>>();
+    if (oclFrame != nullptr)
+    {
+        return prepareFrameCacheT<UMat>(frame, cacheType);
+    }
+    else if (cpuFrame != nullptr)
+    {
+        return prepareFrameCacheT<Mat>(frame, cacheType);
+    }
+    else
+    {
+        CV_Error(Error::StsBadArg, "Incorrect OdometryFrame type");
+    }
+}
+
+
+bool FastICPOdometryImpl::computeImpl(const Ptr<OdometryFrame>& srcFrame,
+                                      const Ptr<OdometryFrame>& dstFrame,
+                                      OutputArray Rt, InputArray /*initRt*/) const
+{
+    Intr intr(cameraMatrix);
+    Ptr<kinfu::ICP> icp = kinfu::makeICP(intr,
+                                         iterCounts,
+                                         angleThreshold,
+                                         maxDistDiff);
+
+    // KinFu's ICP calculates transformation from new frame to old one (src to dst)
+    Affine3f transform;
+    bool result;
+    auto srcOclFrame = srcFrame.dynamicCast<OdometryFrameImpl<UMat>>();
+    auto srcCpuFrame = srcFrame.dynamicCast<OdometryFrameImpl<Mat>>();
+    auto dstOclFrame = dstFrame.dynamicCast<OdometryFrameImpl<UMat>>();
+    auto dstCpuFrame = dstFrame.dynamicCast<OdometryFrameImpl<Mat>>();
+    bool useOcl = (srcOclFrame != nullptr) && (dstOclFrame != nullptr);
+    bool useCpu = (srcCpuFrame != nullptr) && (dstCpuFrame != nullptr);
+    if (useOcl)
+    {
+        result = icp->estimateTransform(transform,
+                                        dstOclFrame->pyramids[OdometryFrame::PYR_CLOUD], dstOclFrame->pyramids[OdometryFrame::PYR_NORM],
+                                        srcOclFrame->pyramids[OdometryFrame::PYR_CLOUD], srcOclFrame->pyramids[OdometryFrame::PYR_NORM]);
+    }
+    else if (useCpu)
+    {
+        result = icp->estimateTransform(transform,
+                                        dstCpuFrame->pyramids[OdometryFrame::PYR_CLOUD], dstCpuFrame->pyramids[OdometryFrame::PYR_NORM],
+                                        srcCpuFrame->pyramids[OdometryFrame::PYR_CLOUD], srcCpuFrame->pyramids[OdometryFrame::PYR_NORM]);
+    }
+    else
+    {
+        CV_Error(Error::StsBadArg, "Both OdometryFrames should have the same storage type: Mat or UMat");
+    }
+
+    Rt.create(Size(4, 4), CV_64FC1);
+    Mat(Matx44d(transform.matrix)).copyTo(Rt.getMat());
+
+    result = result && testDeltaTransformation(Mat(Matx44d(transform.matrix)), maxTranslation, maxRotation);
+
+    return result;
+}
+
+//
+
+Ptr<Odometry> Odometry::createFromName(const std::string& odometryType)
+{
+    if (odometryType == "RgbdOdometry")
+        return RgbdOdometry::create();
+    else if (odometryType == "ICPOdometry")
+        return ICPOdometry::create();
+    else if (odometryType == "RgbdICPOdometry")
+        return RgbdICPOdometry::create();
+    else if (odometryType == "FastICPOdometry")
+        return FastICPOdometry::create();
+    return Ptr<Odometry>();
+}
+
+} // namespace cv
diff --git a/modules/3d/src/rgbd/plane.cpp b/modules/3d/src/rgbd/plane.cpp
new file mode 100644
index 0000000000..3a580397ee
--- /dev/null
+++ b/modules/3d/src/rgbd/plane.cpp
@@ -0,0 +1,583 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+/** This is an implementation of a fast plane detection loosely inspired by
+ * Fast Plane Detection and Polygonalization in noisy 3D Range Images
+ * Jann Poppinga, Narunas Vaskevicius, Andreas Birk, and Kaustubh Pathak
+ * and the follow-up
+ * Fast Plane Detection for SLAM from Noisy Range Images in
+ * Both Structured and Unstructured Environments
+ * Junhao Xiao, Jianhua Zhang and Jianwei Zhang
+ * Houxiang Zhang and Hans Petter Hildre
+ */
+
+#include "../precomp.hpp"
+
+namespace cv
+{
+
+/** Structure defining a plane. The notations are from the second paper */
+class PlaneBase
+{
+public:
+    PlaneBase(const Vec3f& m, const Vec3f& n_in, int index) :
+        index_(index),
+        n_(n_in),
+        m_sum_(Vec3f(0, 0, 0)),
+        m_(m),
+        Q_(Matx33f::zeros()),
+        mse_(0),
+        K_(0)
+    {
+        UpdateD();
+    }
+
+    virtual
+        ~PlaneBase()
+    {  }
+
+    /** Compute the distance to the plane. This will be implemented by the children to take into account different
+     * sensor models
+     * @param p_j
+     * @return
+     */
+    virtual float distance(const Vec3f& p_j) const = 0;
+
+    /** The d coefficient in the plane equation ax+by+cz+d = 0
+     * @return
+     */
+    inline float d() const
+    {
+        return d_;
+    }
+
+    /** The normal to the plane
+     * @return the normal to the plane
+     */
+    const Vec3f& n() const
+    {
+        return n_;
+    }
+
+    /** Update the different coefficients of the plane, based on the new statistics
+     */
+    void UpdateParameters()
+    {
+        if (empty())
+            return;
+        m_ = m_sum_ / K_;
+        // Compute C
+        Matx33f C = Q_ - m_sum_ * m_.t();
+
+        // Compute n
+        SVD svd(C);
+        n_ = Vec3f(svd.vt.at<float>(2, 0), svd.vt.at<float>(2, 1), svd.vt.at<float>(2, 2));
+        mse_ = svd.w.at<float>(2) / K_;
+
+        UpdateD();
+    }
+
+    /** Update the different sum of point and sum of point*point.t()
+     */
+    void  UpdateStatistics(const Vec3f& point, const Matx33f& Q_local)
+    {
+        m_sum_ += point;
+        Q_ += Q_local;
+        ++K_;
+    }
+
+    inline size_t empty() const
+    {
+        return K_ == 0;
+    }
+
+    inline int
+        K() const
+    {
+        return K_;
+    }
+    /** The index of the plane */
+    int index_;
+protected:
+    /** The 4th coefficient in the plane equation ax+by+cz+d = 0 */
+    float d_;
+    /** Normal of the plane */
+    Vec3f n_;
+private:
+    inline void UpdateD()
+    {
+        d_ = -m_.dot(n_);
+    }
+    /** The sum of the points */
+    Vec3f m_sum_;
+    /** The mean of the points */
+    Vec3f m_;
+    /** The sum of pi * pi^\top */
+    Matx33f Q_;
+    /** The different matrices we need to update */
+    Matx33f C_;
+    float mse_;
+    /** the number of points that form the plane */
+    int K_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Basic planar child, with no sensor error model
+ */
+class Plane : public PlaneBase
+{
+public:
+    Plane(const Vec3f& m, const Vec3f& n_in, int index) :
+        PlaneBase(m, n_in, index)
+    { }
+
+    /** The computed distance is perfect in that case
+     * @param p_j the point to compute its distance to
+     * @return
+     */
+    float distance(const Vec3f& p_j) const CV_OVERRIDE
+    {
+        return std::abs(float(p_j.dot(n_) + d_));
+    }
+};
+
+/** Planar child with a quadratic error model
+ */
+class PlaneABC : public PlaneBase
+{
+public:
+    PlaneABC(const Vec3f& m, const Vec3f& n_in, int index, float sensor_error_a, float sensor_error_b, float sensor_error_c) :
+        PlaneBase(m, n_in, index),
+        sensor_error_a_(sensor_error_a),
+        sensor_error_b_(sensor_error_b),
+        sensor_error_c_(sensor_error_c)
+    {
+    }
+
+    /** The distance is now computed by taking the sensor error into account */
+    inline float distance(const Vec3f& p_j) const CV_OVERRIDE
+    {
+        float cst = p_j.dot(n_) + d_;
+        float err = sensor_error_a_ * p_j[2] * p_j[2] + sensor_error_b_ * p_j[2] + sensor_error_c_;
+        if (((cst - n_[2] * err <= 0) && (cst + n_[2] * err >= 0)) || ((cst + n_[2] * err <= 0) && (cst - n_[2] * err >= 0)))
+            return 0;
+        return std::min(std::abs(cst - err), std::abs(cst + err));
+    }
+private:
+    float sensor_error_a_;
+    float sensor_error_b_;
+    float sensor_error_c_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** The PlaneGrid contains statistic about the individual tiles
+ */
+class PlaneGrid
+{
+public:
+    PlaneGrid(const Mat_<Vec3f>& points3d, int block_size) :
+        block_size_(block_size)
+    {
+        // Figure out some dimensions
+        int mini_rows = points3d.rows / block_size;
+        if (points3d.rows % block_size != 0)
+            ++mini_rows;
+
+        int mini_cols = points3d.cols / block_size;
+        if (points3d.cols % block_size != 0)
+            ++mini_cols;
+
+        // Compute all the interesting quantities
+        m_.create(mini_rows, mini_cols);
+        n_.create(mini_rows, mini_cols);
+        Q_.create(points3d.rows, points3d.cols);
+        mse_.create(mini_rows, mini_cols);
+        for (int y = 0; y < mini_rows; ++y)
+            for (int x = 0; x < mini_cols; ++x)
+            {
+                // Update the tiles
+                Matx33f Q = Matx33f::zeros();
+                Vec3f m = Vec3f(0, 0, 0);
+                int K = 0;
+                for (int j = y * block_size; j < std::min((y + 1) * block_size, points3d.rows); ++j)
+                {
+                    const Vec3f* vec = points3d.ptr < Vec3f >(j, x * block_size), * vec_end;
+                    float* pointpointt = reinterpret_cast<float*>(Q_.ptr < Vec<float, 9> >(j, x * block_size));
+                    if (x == mini_cols - 1)
+                        vec_end = points3d.ptr < Vec3f >(j, points3d.cols - 1) + 1;
+                    else
+                        vec_end = vec + block_size;
+                    for (; vec != vec_end; ++vec, pointpointt += 9)
+                    {
+                        if (cvIsNaN(vec->val[0]))
+                            continue;
+                        // Fill point*point.t()
+                        *pointpointt = vec->val[0] * vec->val[0];
+                        *(pointpointt + 1) = vec->val[0] * vec->val[1];
+                        *(pointpointt + 2) = vec->val[0] * vec->val[2];
+                        *(pointpointt + 3) = *(pointpointt + 1);
+                        *(pointpointt + 4) = vec->val[1] * vec->val[1];
+                        *(pointpointt + 5) = vec->val[1] * vec->val[2];
+                        *(pointpointt + 6) = *(pointpointt + 2);
+                        *(pointpointt + 7) = *(pointpointt + 5);
+                        *(pointpointt + 8) = vec->val[2] * vec->val[2];
+
+                        Q += *reinterpret_cast<Matx33f*>(pointpointt);
+                        m += (*vec);
+                        ++K;
+                    }
+                }
+                if (K == 0)
+                {
+                    mse_(y, x) = std::numeric_limits<float>::max();
+                    continue;
+                }
+
+                m /= K;
+                m_(y, x) = m;
+
+                // Compute C
+                Matx33f C = Q - K * m * m.t();
+
+                // Compute n
+                SVD svd(C);
+                n_(y, x) = Vec3f(svd.vt.at<float>(2, 0), svd.vt.at<float>(2, 1), svd.vt.at<float>(2, 2));
+                mse_(y, x) = svd.w.at<float>(2) / K;
+            }
+    }
+
+    /** The size of the block */
+    int block_size_;
+    Mat_<Vec3f> m_;
+    Mat_<Vec3f> n_;
+    Mat_<Vec<float, 9> > Q_;
+    Mat_<float> mse_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class TileQueue
+{
+public:
+    struct PlaneTile
+    {
+        PlaneTile(int x, int y, float mse) :
+            x_(x),
+            y_(y),
+            mse_(mse)
+        { }
+
+        bool operator<(const PlaneTile& tile2) const
+        {
+            return mse_ < tile2.mse_;
+        }
+
+        int x_;
+        int y_;
+        float mse_;
+    };
+
+    TileQueue(const PlaneGrid& plane_grid)
+    {
+        done_tiles_ = Mat_<unsigned char>::zeros(plane_grid.mse_.rows, plane_grid.mse_.cols);
+        tiles_.clear();
+        for (int y = 0; y < plane_grid.mse_.rows; ++y)
+            for (int x = 0; x < plane_grid.mse_.cols; ++x)
+                if (plane_grid.mse_(y, x) != std::numeric_limits<float>::max())
+                    // Update the tiles
+                    tiles_.push_back(PlaneTile(x, y, plane_grid.mse_(y, x)));
+        // Sort tiles by MSE
+        tiles_.sort();
+    }
+
+    bool empty()
+    {
+        while (!tiles_.empty())
+        {
+            const PlaneTile& tile = tiles_.front();
+            if (done_tiles_(tile.y_, tile.x_))
+                tiles_.pop_front();
+            else
+                break;
+        }
+        return tiles_.empty();
+    }
+
+    const PlaneTile& front() const
+    {
+        return tiles_.front();
+    }
+
+    void remove(int y, int x)
+    {
+        done_tiles_(y, x) = 1;
+    }
+private:
+    /** The list of tiles ordered from most planar to least */
+    std::list<PlaneTile> tiles_;
+    /** contains 1 when the tiles has been studied, 0 otherwise */
+    Mat_<unsigned char> done_tiles_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class InlierFinder
+{
+public:
+    InlierFinder(float err, const Mat_<Vec3f>& points3d, const Mat_<Vec3f>& normals,
+                 unsigned char plane_index, int block_size) :
+        err_(err),
+        points3d_(points3d),
+        normals_(normals),
+        plane_index_(plane_index),
+        block_size_(block_size)
+    {
+    }
+
+    void Find(const PlaneGrid& plane_grid, Ptr<PlaneBase>& plane, TileQueue& tile_queue,
+              std::set<TileQueue::PlaneTile>& neighboring_tiles, Mat_<unsigned char>& overall_mask,
+              Mat_<unsigned char>& plane_mask)
+    {
+        // Do not use reference as we pop the from later on
+        TileQueue::PlaneTile tile = *(neighboring_tiles.begin());
+
+        // Figure the part of the image to look at
+        Range range_x, range_y;
+        int x = tile.x_ * block_size_, y = tile.y_ * block_size_;
+
+        if (tile.x_ == plane_mask.cols - 1)
+            range_x = Range(x, overall_mask.cols);
+        else
+            range_x = Range(x, x + block_size_);
+
+        if (tile.y_ == plane_mask.rows - 1)
+            range_y = Range(y, overall_mask.rows);
+        else
+            range_y = Range(y, y + block_size_);
+
+        int n_valid_points = 0;
+        for (int yy = range_y.start; yy != range_y.end; ++yy)
+        {
+            uchar* data = overall_mask.ptr(yy, range_x.start), * data_end = data + range_x.size();
+            const Vec3f* point = points3d_.ptr < Vec3f >(yy, range_x.start);
+            const Matx33f* Q_local = reinterpret_cast<const Matx33f*>(plane_grid.Q_.ptr < Vec<float, 9>
+            >(yy, range_x.start));
+
+            // Depending on whether you have a normal, check it
+            if (!normals_.empty())
+            {
+                const Vec3f* normal = normals_.ptr < Vec3f >(yy, range_x.start);
+                for (; data != data_end; ++data, ++point, ++normal, ++Q_local)
+                {
+                    // Don't do anything if the point already belongs to another plane
+                    if (cvIsNaN(point->val[0]) || ((*data) != 255))
+                        continue;
+
+                    // If the point is close enough to the plane
+                    if (plane->distance(*point) < err_)
+                    {
+                        // make sure the normals are similar to the plane
+                        if (std::abs(plane->n().dot(*normal)) > 0.3)
+                        {
+                            // The point now belongs to the plane
+                            plane->UpdateStatistics(*point, *Q_local);
+                            *data = plane_index_;
+                            ++n_valid_points;
+                        }
+                    }
+                }
+            }
+            else
+            {
+                for (; data != data_end; ++data, ++point, ++Q_local)
+                {
+                    // Don't do anything if the point already belongs to another plane
+                    if (cvIsNaN(point->val[0]) || ((*data) != 255))
+                        continue;
+
+                    // If the point is close enough to the plane
+                    if (plane->distance(*point) < err_)
+                    {
+                        // The point now belongs to the plane
+                        plane->UpdateStatistics(*point, *Q_local);
+                        *data = plane_index_;
+                        ++n_valid_points;
+                    }
+                }
+            }
+        }
+
+        plane->UpdateParameters();
+
+        // Mark the front as being done and pop it
+        if (n_valid_points > (range_x.size() * range_y.size()) / 2)
+            tile_queue.remove(tile.y_, tile.x_);
+        plane_mask(tile.y_, tile.x_) = 1;
+        neighboring_tiles.erase(neighboring_tiles.begin());
+
+        // Add potential neighbors of the tile
+        std::vector<std::pair<int, int> > pairs;
+        if (tile.x_ > 0)
+            for (unsigned char* val = overall_mask.ptr<unsigned char>(range_y.start, range_x.start), *val_end = val
+                + range_y.size() * overall_mask.step; val != val_end; val += overall_mask.step)
+                if (*val == plane_index_)
+                {
+                    pairs.push_back(std::pair<int, int>(tile.x_ - 1, tile.y_));
+                    break;
+                }
+        if (tile.x_ < plane_mask.cols - 1)
+            for (unsigned char* val = overall_mask.ptr<unsigned char>(range_y.start, range_x.end - 1), *val_end = val
+                + range_y.size() * overall_mask.step; val != val_end; val += overall_mask.step)
+                if (*val == plane_index_)
+                {
+                    pairs.push_back(std::pair<int, int>(tile.x_ + 1, tile.y_));
+                    break;
+                }
+        if (tile.y_ > 0)
+            for (unsigned char* val = overall_mask.ptr<unsigned char>(range_y.start, range_x.start), *val_end = val
+                + range_x.size(); val != val_end; ++val)
+                if (*val == plane_index_)
+                {
+                    pairs.push_back(std::pair<int, int>(tile.x_, tile.y_ - 1));
+                    break;
+                }
+        if (tile.y_ < plane_mask.rows - 1)
+            for (unsigned char* val = overall_mask.ptr<unsigned char>(range_y.end - 1, range_x.start), *val_end = val
+                + range_x.size(); val != val_end; ++val)
+                if (*val == plane_index_)
+                {
+                    pairs.push_back(std::pair<int, int>(tile.x_, tile.y_ + 1));
+                    break;
+                }
+
+        for (unsigned char i = 0; i < pairs.size(); ++i)
+            if (!plane_mask(pairs[i].second, pairs[i].first))
+                neighboring_tiles.insert(
+                    TileQueue::PlaneTile(pairs[i].first, pairs[i].second, plane_grid.mse_(pairs[i].second, pairs[i].first)));
+    }
+
+private:
+    float err_;
+    const Mat_<Vec3f>& points3d_;
+    const Mat_<Vec3f>& normals_;
+    unsigned char plane_index_;
+    /** THe block size as defined in the main algorithm */
+    int block_size_;
+
+    const InlierFinder& operator = (const InlierFinder&);
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void findPlanes(InputArray points3d_in, InputArray normals_in, OutputArray mask_out, OutputArray plane_coefficients_out,
+                int block_size, int min_size, double threshold, double sensor_error_a, double sensor_error_b, double sensor_error_c,
+                RgbdPlaneMethod method)
+{
+    CV_Assert(method == RGBD_PLANE_METHOD_DEFAULT);
+
+    Mat_<Vec3f> points3d, normals;
+    if (points3d_in.depth() == CV_32F)
+        points3d = points3d_in.getMat();
+    else
+        points3d_in.getMat().convertTo(points3d, CV_32F);
+    if (!normals_in.empty())
+    {
+        if (normals_in.depth() == CV_32F)
+            normals = normals_in.getMat();
+        else
+            normals_in.getMat().convertTo(normals, CV_32F);
+    }
+
+    // Pre-computations
+    mask_out.create(points3d.size(), CV_8U);
+    Mat mask_out_mat = mask_out.getMat();
+    Mat_<unsigned char> mask_out_uc = (Mat_<unsigned char>&) mask_out_mat;
+    mask_out_uc.setTo(255);
+    PlaneGrid plane_grid(points3d, block_size);
+    TileQueue plane_queue(plane_grid);
+    size_t index_plane = 0;
+
+    std::vector<Vec4f> plane_coefficients;
+    float mse_min = (float)(threshold * threshold);
+
+    while (!plane_queue.empty())
+    {
+        // Get the first tile if it's good enough
+        const TileQueue::PlaneTile front_tile = plane_queue.front();
+        if (front_tile.mse_ > mse_min)
+            break;
+
+        InlierFinder inlier_finder((float)threshold, points3d, normals, (unsigned char)index_plane, block_size);
+
+        // Construct the plane for the first tile
+        int x = front_tile.x_, y = front_tile.y_;
+        const Vec3f& n = plane_grid.n_(y, x);
+        Ptr<PlaneBase> plane;
+        if ((sensor_error_a == 0) && (sensor_error_b == 0) && (sensor_error_c == 0))
+            plane = Ptr<PlaneBase>(new Plane(plane_grid.m_(y, x), n, (int)index_plane));
+        else
+            plane = Ptr<PlaneBase>(new PlaneABC(plane_grid.m_(y, x), n, (int)index_plane,
+                                   (float)sensor_error_a, (float)sensor_error_b, (float)sensor_error_c));
+
+        Mat_<unsigned char> plane_mask = Mat_<unsigned char>::zeros(divUp(points3d.rows, block_size),
+                                                                    divUp(points3d.cols, block_size));
+        std::set<TileQueue::PlaneTile> neighboring_tiles;
+        neighboring_tiles.insert(front_tile);
+        plane_queue.remove(front_tile.y_, front_tile.x_);
+
+        // Process all the neighboring tiles
+        while (!neighboring_tiles.empty())
+            inlier_finder.Find(plane_grid, plane, plane_queue, neighboring_tiles, mask_out_uc, plane_mask);
+
+        // Don't record the plane if it's empty
+        if (plane->empty())
+            continue;
+        // Don't record the plane if it's smaller than asked
+        if (plane->K() < min_size)
+        {
+            // Reset the plane index in the mask
+            for (y = 0; y < plane_mask.rows; ++y)
+                for (x = 0; x < plane_mask.cols; ++x)
+                {
+                    if (!plane_mask(y, x))
+                        continue;
+                    // Go over the tile
+                    for (int yy = y * block_size;
+                        yy < std::min((y + 1) * block_size, mask_out_uc.rows); ++yy)
+                    {
+                        uchar* data = mask_out_uc.ptr(yy, x * block_size);
+                        uchar* data_end = data + std::min(block_size, mask_out_uc.cols - x * block_size);
+                        for (; data != data_end; ++data)
+                        {
+                            if (*data == index_plane)
+                                *data = 255;
+                        }
+                    }
+                }
+            continue;
+        }
+
+        ++index_plane;
+        if (index_plane >= 255)
+            break;
+        Vec4f coeffs(plane->n()[0], plane->n()[1], plane->n()[2], plane->d());
+        if (coeffs(2) > 0)
+            coeffs = -coeffs;
+        plane_coefficients.push_back(coeffs);
+    };
+
+    // Fill the plane coefficients
+    if (plane_coefficients.empty())
+        return;
+    plane_coefficients_out.create((int)plane_coefficients.size(), 1, CV_32FC4);
+    Mat plane_coefficients_mat = plane_coefficients_out.getMat();
+    float* data = plane_coefficients_mat.ptr<float>(0);
+    for (size_t i = 0; i < plane_coefficients.size(); ++i)
+        for (uchar j = 0; j < 4; ++j, ++data)
+            *data = plane_coefficients[i][j];
+}
+
+} // namespace cv
diff --git a/modules/3d/src/rgbd/pose_graph.cpp b/modules/3d/src/rgbd/pose_graph.cpp
new file mode 100644
index 0000000000..0b597c5b40
--- /dev/null
+++ b/modules/3d/src/rgbd/pose_graph.cpp
@@ -0,0 +1,908 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../precomp.hpp"
+#include "sparse_block_matrix.hpp"
+#include "opencv2/3d/detail/pose_graph.hpp"
+
+// matrix form of conjugation
+static const cv::Matx44d M_Conj{ 1,  0,  0,  0,
+                                 0, -1,  0,  0,
+                                 0,  0, -1,  0,
+                                 0,  0,  0, -1 };
+
+// matrix form of quaternion multiplication from left side
+static inline cv::Matx44d m_left(cv::Quatd q)
+{
+    // M_left(a)* V(b) =
+    //    = (I_4 * a0 + [ 0 | -av    [    0 | 0_1x3
+    //                   av | 0_3] +  0_3x1 | skew(av)]) * V(b)
+
+    double w = q.w, x = q.x, y = q.y, z = q.z;
+    return { w, -x, -y, -z,
+             x,  w, -z,  y,
+             y,  z,  w, -x,
+             z, -y,  x,  w };
+}
+
+// matrix form of quaternion multiplication from right side
+static inline cv::Matx44d m_right(cv::Quatd q)
+{
+    // M_right(b)* V(a) =
+    //    = (I_4 * b0 + [ 0 | -bv    [    0 | 0_1x3
+    //                   bv | 0_3] +  0_3x1 | skew(-bv)]) * V(a)
+
+    double w = q.w, x = q.x, y = q.y, z = q.z;
+    return { w, -x, -y, -z,
+             x,  w,  z, -y,
+             y, -z,  w,  x,
+             z,  y, -x,  w };
+}
+
+// precaution against "unused function" warning when there's no Eigen
+#if defined(HAVE_EIGEN)
+// jacobian of quaternionic (exp(x)*q) : R_3 -> H near x == 0
+static inline cv::Matx43d expQuatJacobian(cv::Quatd q)
+{
+    double w = q.w, x = q.x, y = q.y, z = q.z;
+    return cv::Matx43d(-x, -y, -z,
+                        w,  z, -y,
+                       -z,  w,  x,
+                        y, -x,  w);
+}
+#endif
+
+// concatenate matrices vertically
+template<typename _Tp, int m, int n, int k> static inline
+cv::Matx<_Tp, m + k, n> concatVert(const cv::Matx<_Tp, m, n>& a, const cv::Matx<_Tp, k, n>& b)
+{
+    cv::Matx<_Tp, m + k, n> res;
+    for (int i = 0; i < m; i++)
+    {
+        for (int j = 0; j < n; j++)
+        {
+            res(i, j) = a(i, j);
+        }
+    }
+    for (int i = 0; i < k; i++)
+    {
+        for (int j = 0; j < n; j++)
+        {
+            res(m + i, j) = b(i, j);
+        }
+    }
+    return res;
+}
+
+// concatenate matrices horizontally
+template<typename _Tp, int m, int n, int k> static inline
+cv::Matx<_Tp, m, n + k> concatHor(const cv::Matx<_Tp, m, n>& a, const cv::Matx<_Tp, m, k>& b)
+{
+    cv::Matx<_Tp, m, n + k> res;
+
+    for (int i = 0; i < m; i++)
+    {
+        for (int j = 0; j < n; j++)
+        {
+            res(i, j) = a(i, j);
+        }
+    }
+    for (int i = 0; i < m; i++)
+    {
+        for (int j = 0; j < k; j++)
+        {
+            res(i, n + j) = b(i, j);
+        }
+    }
+    return res;
+}
+
+namespace cv
+{
+namespace detail
+{
+
+class PoseGraphImpl : public detail::PoseGraph
+{
+    struct Pose3d
+    {
+        Vec3d t;
+        Quatd q;
+
+        Pose3d() : t(), q(1, 0, 0, 0) { }
+
+        Pose3d(const Matx33d& rotation, const Vec3d& translation)
+            : t(translation), q(Quatd::createFromRotMat(rotation).normalize())
+        { }
+
+        explicit Pose3d(const Matx44d& pose) :
+            Pose3d(pose.get_minor<3, 3>(0, 0), Vec3d(pose(0, 3), pose(1, 3), pose(2, 3)))
+        { }
+
+        inline Pose3d operator*(const Pose3d& otherPose) const
+        {
+            Pose3d out(*this);
+            out.t += q.toRotMat3x3(QUAT_ASSUME_UNIT) * otherPose.t;
+            out.q = out.q * otherPose.q;
+            return out;
+        }
+
+        Affine3d getAffine() const
+        {
+            return Affine3d(q.toRotMat3x3(QUAT_ASSUME_UNIT), t);
+        }
+
+        inline Pose3d inverse() const
+        {
+            Pose3d out;
+            out.q = q.conjugate();
+            out.t = -(out.q.toRotMat3x3(QUAT_ASSUME_UNIT) * t);
+            return out;
+        }
+
+        inline void normalizeRotation()
+        {
+            q = q.normalize();
+        }
+    };
+
+    /*! \class GraphNode
+     *  \brief Defines a node/variable that is optimizable in a posegraph
+     *
+     *  Detailed description
+     */
+    struct Node
+    {
+    public:
+        explicit Node(size_t _nodeId, const Affine3d& _pose)
+            : id(_nodeId), isFixed(false), pose(_pose.rotation(), _pose.translation())
+        { }
+
+        Affine3d getPose() const
+        {
+            return pose.getAffine();
+        }
+        void setPose(const Affine3d& _pose)
+        {
+            pose = Pose3d(_pose.rotation(), _pose.translation());
+        }
+
+    public:
+        size_t id;
+        bool isFixed;
+        Pose3d pose;
+    };
+
+    /*! \class PoseGraphEdge
+     *  \brief Defines the constraints between two PoseGraphNodes
+     *
+     *  Detailed description
+     */
+    struct Edge
+    {
+    public:
+        explicit Edge(size_t _sourceNodeId, size_t _targetNodeId, const Affine3f& _transformation,
+                      const Matx66f& _information = Matx66f::eye());
+
+        bool operator==(const Edge& edge)
+        {
+            if ((edge.sourceNodeId == sourceNodeId && edge.targetNodeId == targetNodeId) ||
+                (edge.sourceNodeId == targetNodeId && edge.targetNodeId == sourceNodeId))
+                return true;
+            return false;
+        }
+
+    public:
+        size_t sourceNodeId;
+        size_t targetNodeId;
+        Pose3d pose;
+        Matx66f sqrtInfo;
+    };
+
+
+public:
+    PoseGraphImpl() : nodes(), edges()
+    { }
+    virtual ~PoseGraphImpl() CV_OVERRIDE
+    { }
+
+    // Node may have any id >= 0
+    virtual void addNode(size_t _nodeId, const Affine3d& _pose, bool fixed) CV_OVERRIDE;
+    virtual bool isNodeExist(size_t nodeId) const CV_OVERRIDE
+    {
+        return (nodes.find(nodeId) != nodes.end());
+    }
+
+    virtual bool setNodeFixed(size_t nodeId, bool fixed) CV_OVERRIDE
+    {
+        auto it = nodes.find(nodeId);
+        if (it != nodes.end())
+        {
+            it->second.isFixed = fixed;
+            return true;
+        }
+        else
+            return false;
+    }
+
+    virtual bool isNodeFixed(size_t nodeId) const CV_OVERRIDE
+    {
+        auto it = nodes.find(nodeId);
+        if (it != nodes.end())
+            return it->second.isFixed;
+        else
+            return false;
+    }
+
+    virtual Affine3d getNodePose(size_t nodeId) const CV_OVERRIDE
+    {
+        auto it = nodes.find(nodeId);
+        if (it != nodes.end())
+            return it->second.getPose();
+        else
+            return Affine3d();
+    }
+
+    virtual std::vector<size_t> getNodesIds() const CV_OVERRIDE
+    {
+        std::vector<size_t> ids;
+        for (const auto& it : nodes)
+        {
+            ids.push_back(it.first);
+        }
+        return ids;
+    }
+
+    virtual size_t getNumNodes() const CV_OVERRIDE
+    {
+        return nodes.size();
+    }
+
+    // Edges have consequent indices starting from 0
+    virtual void addEdge(size_t _sourceNodeId, size_t _targetNodeId, const Affine3f& _transformation,
+                         const Matx66f& _information = Matx66f::eye()) CV_OVERRIDE
+    {
+        Edge e(_sourceNodeId, _targetNodeId, _transformation, _information);
+        edges.push_back(e);
+    }
+
+    virtual size_t getEdgeStart(size_t i) const CV_OVERRIDE
+    {
+        return edges[i].sourceNodeId;
+    }
+
+    virtual size_t getEdgeEnd(size_t i) const CV_OVERRIDE
+    {
+        return edges[i].targetNodeId;
+    }
+
+    virtual size_t getNumEdges() const CV_OVERRIDE
+    {
+        return edges.size();
+    }
+
+    // checks if graph is connected and each edge connects exactly 2 nodes
+    virtual bool isValid() const CV_OVERRIDE;
+
+    // calculate cost function based on current nodes parameters
+    virtual double calcEnergy() const CV_OVERRIDE;
+
+    // calculate cost function based on provided nodes parameters
+    double calcEnergyNodes(const std::map<size_t, Node>& newNodes) const;
+
+    // Termination criteria are max number of iterations and min relative energy change to current energy
+    // Returns number of iterations elapsed or -1 if max number of iterations was reached or failed to optimize
+    virtual int optimize(const cv::TermCriteria& tc = cv::TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 100, 1e-6)) CV_OVERRIDE;
+
+    std::map<size_t, Node> nodes;
+    std::vector<Edge>   edges;
+};
+
+
+void PoseGraphImpl::addNode(size_t _nodeId, const Affine3d& _pose, bool fixed)
+{
+    Node node(_nodeId, _pose);
+    node.isFixed = fixed;
+
+    size_t id = node.id;
+    const auto& it = nodes.find(id);
+    if (it != nodes.end())
+    {
+        std::cout << "duplicated node, id=" << id << std::endl;
+        nodes.insert(it, { id, node });
+    }
+    else
+    {
+        nodes.insert({ id, node });
+    }
+}
+
+
+// Cholesky decomposition of symmetrical 6x6 matrix
+static inline cv::Matx66d llt6(Matx66d m)
+{
+    Matx66d L;
+    for (int i = 0; i < 6; i++)
+    {
+        for (int j = 0; j < (i + 1); j++)
+        {
+            double sum = 0;
+            for (int k = 0; k < j; k++)
+                sum += L(i, k) * L(j, k);
+
+            if (i == j)
+                L(i, i) = sqrt(m(i, i) - sum);
+            else
+                L(i, j) = (1.0 / L(j, j) * (m(i, j) - sum));
+        }
+    }
+    return L;
+}
+
+PoseGraphImpl::Edge::Edge(size_t _sourceNodeId, size_t _targetNodeId, const Affine3f& _transformation,
+                          const Matx66f& _information) :
+                          sourceNodeId(_sourceNodeId),
+                          targetNodeId(_targetNodeId),
+                          pose(_transformation.rotation(), _transformation.translation()),
+                          sqrtInfo(llt6(_information))
+{ }
+
+
+bool PoseGraphImpl::isValid() const
+{
+    size_t numNodes = getNumNodes();
+    size_t numEdges = getNumEdges();
+
+    if (!numNodes || !numEdges)
+        return false;
+
+    std::unordered_set<size_t> nodesVisited;
+    std::vector<size_t> nodesToVisit;
+
+    nodesToVisit.push_back(nodes.begin()->first);
+
+    bool isGraphConnected = false;
+    while (!nodesToVisit.empty())
+    {
+        size_t currNodeId = nodesToVisit.back();
+        nodesToVisit.pop_back();
+        nodesVisited.insert(currNodeId);
+        // Since each node does not maintain its neighbor list
+        for (size_t i = 0; i < numEdges; i++)
+        {
+            const Edge& potentialEdge = edges.at(i);
+            size_t nextNodeId = (size_t)(-1);
+
+            if (potentialEdge.sourceNodeId == currNodeId)
+            {
+                nextNodeId = potentialEdge.targetNodeId;
+            }
+            else if (potentialEdge.targetNodeId == currNodeId)
+            {
+                nextNodeId = potentialEdge.sourceNodeId;
+            }
+            if (nextNodeId != (size_t)(-1))
+            {
+                if (nodesVisited.count(nextNodeId) == 0)
+                {
+                    nodesToVisit.push_back(nextNodeId);
+                }
+            }
+        }
+    }
+
+    isGraphConnected = (nodesVisited.size() == numNodes);
+
+    CV_LOG_INFO(NULL, "nodesVisited: " << nodesVisited.size() << " IsGraphConnected: " << isGraphConnected);
+
+    bool invalidEdgeNode = false;
+    for (size_t i = 0; i < numEdges; i++)
+    {
+        const Edge& edge = edges.at(i);
+        // edges have spurious source/target nodes
+        if ((nodesVisited.count(edge.sourceNodeId) != 1) ||
+            (nodesVisited.count(edge.targetNodeId) != 1))
+        {
+            invalidEdgeNode = true;
+            break;
+        }
+    }
+    return isGraphConnected && !invalidEdgeNode;
+}
+
+
+//////////////////////////
+// Optimization itself //
+////////////////////////
+
+static inline double poseError(Quatd sourceQuat, Vec3d sourceTrans, Quatd targetQuat, Vec3d targetTrans,
+                               Quatd rotMeasured, Vec3d transMeasured, Matx66d sqrtInfoMatrix, bool needJacobians,
+                               Matx<double, 6, 4>& sqj, Matx<double, 6, 3>& stj,
+                               Matx<double, 6, 4>& tqj, Matx<double, 6, 3>& ttj,
+                               Vec6d& res)
+{
+    // err_r = 2*Im(conj(rel_r) * measure_r) = 2*Im(conj(target_r) * source_r * measure_r)
+    // err_t = conj(source_r) * (target_t - source_t) * source_r - measure_t
+
+    Quatd sourceQuatInv = sourceQuat.conjugate();
+    Vec3d deltaTrans = targetTrans - sourceTrans;
+
+    Quatd relativeQuat = sourceQuatInv * targetQuat;
+    Vec3d relativeTrans = sourceQuatInv.toRotMat3x3(cv::QUAT_ASSUME_UNIT) * deltaTrans;
+
+    //! Definition should actually be relativeQuat * rotMeasured.conjugate()
+    Quatd deltaRot = relativeQuat.conjugate() * rotMeasured;
+
+    Vec3d terr = relativeTrans - transMeasured;
+    Vec3d rerr = 2.0 * Vec3d(deltaRot.x, deltaRot.y, deltaRot.z);
+    Vec6d rterr(terr[0], terr[1], terr[2], rerr[0], rerr[1], rerr[2]);
+
+    res = sqrtInfoMatrix * rterr;
+
+    if (needJacobians)
+    {
+        // d(err_r) = 2*Im(d(conj(target_r) * source_r * measure_r)) = < measure_r is constant > =
+        // 2*Im((conj(d(target_r)) * source_r + conj(target_r) * d(source_r)) * measure_r)
+        // d(target_r) == 0:
+        //  # d(err_r) = 2*Im(conj(target_r) * d(source_r) * measure_r)
+        //  # V(d(err_r)) = 2 * M_Im * M_right(measure_r) * M_left(conj(target_r)) * V(d(source_r))
+        //  # d(err_r) / d(source_r) = 2 * M_Im * M_right(measure_r) * M_left(conj(target_r))
+        Matx34d drdsq = 2.0 * (m_right(rotMeasured) * m_left(targetQuat.conjugate())).get_minor<3, 4>(1, 0);
+
+        // d(source_r) == 0:
+        //  # d(err_r) = 2*Im(conj(d(target_r)) * source_r * measure_r)
+        //  # V(d(err_r)) = 2 * M_Im * M_right(source_r * measure_r) * M_Conj * V(d(target_r))
+        //  # d(err_r) / d(target_r) = 2 * M_Im * M_right(source_r * measure_r) * M_Conj
+        Matx34d drdtq = 2.0 * (m_right(sourceQuat * rotMeasured) * M_Conj).get_minor<3, 4>(1, 0);
+
+        // d(err_t) = d(conj(source_r) * (target_t - source_t) * source_r) =
+        // conj(source_r) * (d(target_t) - d(source_t)) * source_r +
+        // conj(d(source_r)) * (target_t - source_t) * source_r +
+        // conj(source_r) * (target_t - source_t) * d(source_r) =
+        // <conj(a*b) == conj(b)*conj(a), conj(target_t - source_t) = - (target_t - source_t), 2 * Im(x) = (x - conj(x))>
+        // conj(source_r) * (d(target_t) - d(source_t)) * source_r +
+        // 2 * Im(conj(source_r) * (target_t - source_t) * d(source_r))
+        // d(*_t) == 0:
+        //  # d(err_t) = 2 * Im(conj(source_r) * (target_t - source_t) * d(source_r))
+        //  # V(d(err_t)) = 2 * M_Im * M_left(conj(source_r) * (target_t - source_t)) * V(d(source_r))
+        //  # d(err_t) / d(source_r) = 2 * M_Im * M_left(conj(source_r) * (target_t - source_t))
+        Matx34d dtdsq = 2 * m_left(sourceQuatInv * Quatd(0, deltaTrans[0], deltaTrans[1], deltaTrans[2])).get_minor<3, 4>(1, 0);
+        // deltaTrans is rotated by sourceQuatInv, so the jacobian is rot matrix of sourceQuatInv by +1 or -1
+        Matx33d dtdtt = sourceQuatInv.toRotMat3x3(QUAT_ASSUME_UNIT);
+        Matx33d dtdst = -dtdtt;
+
+        Matx33d z;
+        sqj = concatVert(dtdsq, drdsq);
+        tqj = concatVert(Matx34d(), drdtq);
+        stj = concatVert(dtdst, z);
+        ttj = concatVert(dtdtt, z);
+
+        stj = sqrtInfoMatrix * stj;
+        ttj = sqrtInfoMatrix * ttj;
+        sqj = sqrtInfoMatrix * sqj;
+        tqj = sqrtInfoMatrix * tqj;
+    }
+
+    return res.ddot(res);
+}
+
+
+double PoseGraphImpl::calcEnergy() const
+{
+    return calcEnergyNodes(nodes);
+}
+
+
+// estimate current energy
+double PoseGraphImpl::calcEnergyNodes(const std::map<size_t, Node>& newNodes) const
+{
+    double totalErr = 0;
+    for (const auto& e : edges)
+    {
+        Pose3d srcP = newNodes.at(e.sourceNodeId).pose;
+        Pose3d tgtP = newNodes.at(e.targetNodeId).pose;
+
+        Vec6d res;
+        Matx<double, 6, 3> stj, ttj;
+        Matx<double, 6, 4> sqj, tqj;
+        double err = poseError(srcP.q, srcP.t, tgtP.q, tgtP.t, e.pose.q, e.pose.t, e.sqrtInfo,
+                               /* needJacobians = */ false, sqj, stj, tqj, ttj, res);
+
+        totalErr += err;
+    }
+    return totalErr * 0.5;
+}
+
+
+#if defined(HAVE_EIGEN)
+
+// from Ceres, equation energy change:
+// eq. energy = 1/2 * (residuals + J * step)^2 =
+// 1/2 * ( residuals^2 + 2 * residuals^T * J * step + (J*step)^T * J * step)
+// eq. energy change = 1/2 * residuals^2 - eq. energy =
+// residuals^T * J * step + 1/2 * (J*step)^T * J * step =
+// (residuals^T * J + 1/2 * step^T * J^T * J) * step =
+// step^T * ((residuals^T * J)^T + 1/2 * (step^T * J^T * J)^T) =
+// 1/2 * step^T * (2 * J^T * residuals + J^T * J * step) =
+// 1/2 * step^T * (2 * J^T * residuals + (J^T * J + LMDiag - LMDiag) * step) =
+// 1/2 * step^T * (2 * J^T * residuals + (J^T * J + LMDiag) * step - LMDiag * step) =
+// 1/2 * step^T * (J^T * residuals - LMDiag * step) =
+// 1/2 * x^T * (jtb - lmDiag^T * x)
+static inline double calcJacCostChange(const std::vector<double>& jtb,
+                                       const std::vector<double>& x,
+                                       const std::vector<double>& lmDiag)
+{
+    double jdiag = 0.0;
+    for (size_t i = 0; i < x.size(); i++)
+    {
+        jdiag += x[i] * (jtb[i] - lmDiag[i] * x[i]);
+    }
+    double costChange = jdiag * 0.5;
+    return costChange;
+}
+
+
+// J := J * d_inv, d_inv = make_diag(di)
+// J^T*J := (J * d_inv)^T * J * d_inv = diag(di)* (J^T * J)* diag(di) = eltwise_mul(J^T*J, di*di^T)
+// J^T*b := (J * d_inv)^T * b = d_inv^T * J^T*b = eltwise_mul(J^T*b, di)
+static inline void doJacobiScaling(BlockSparseMat<double, 6, 6>& jtj, std::vector<double>& jtb, const std::vector<double>& di)
+{
+    // scaling J^T*J
+    for (auto& ijv : jtj.ijValue)
+    {
+        Point2i bpt = ijv.first;
+        Matx66d& m = ijv.second;
+        for (int i = 0; i < 6; i++)
+        {
+            for (int j = 0; j < 6; j++)
+            {
+                Point2i pt(bpt.x * 6 + i, bpt.y * 6 + j);
+                m(i, j) *= di[pt.x] * di[pt.y];
+            }
+        }
+    }
+
+    // scaling J^T*b
+    for (size_t i = 0; i < di.size(); i++)
+    {
+        jtb[i] *= di[i];
+    }
+}
+
+
+int PoseGraphImpl::optimize(const cv::TermCriteria& tc)
+{
+    if (!isValid())
+    {
+        CV_LOG_INFO(NULL, "Invalid PoseGraph that is either not connected or has invalid nodes");
+        return -1;
+    }
+
+    size_t numNodes = getNumNodes();
+    size_t numEdges = getNumEdges();
+
+    // Allocate indices for nodes
+    std::vector<size_t> placesIds;
+    std::map<size_t, size_t> idToPlace;
+    for (const auto& ni : nodes)
+    {
+        if (!ni.second.isFixed)
+        {
+            idToPlace[ni.first] = placesIds.size();
+            placesIds.push_back(ni.first);
+        }
+    }
+
+    size_t nVarNodes = placesIds.size();
+    if (!nVarNodes)
+    {
+        CV_LOG_INFO(NULL, "PoseGraph contains no non-constant nodes, skipping optimization");
+        return -1;
+    }
+
+    if (numEdges == 0)
+    {
+        CV_LOG_INFO(NULL, "PoseGraph has no edges, no optimization to be done");
+        return -1;
+    }
+
+    CV_LOG_INFO(NULL, "Optimizing PoseGraph with " << numNodes << " nodes and " << numEdges << " edges");
+
+    size_t nVars = nVarNodes * 6;
+    BlockSparseMat<double, 6, 6> jtj(nVarNodes);
+    std::vector<double> jtb(nVars);
+
+    double energy = calcEnergyNodes(nodes);
+    double oldEnergy = energy;
+
+    CV_LOG_INFO(NULL, "#s" << " energy: " << energy);
+
+    // options
+    // stop conditions
+    bool checkIterations = (tc.type & TermCriteria::COUNT);
+    bool checkEps = (tc.type & TermCriteria::EPS);
+    const unsigned int maxIterations = tc.maxCount;
+    const double minGradientTolerance = 1e-6;
+    const double stepNorm2Tolerance = 1e-6;
+    const double relEnergyDeltaTolerance = tc.epsilon;
+    // normalize jacobian columns for better conditioning
+    // slows down sparse solver, but maybe this'd be useful for some other solver
+    const bool jacobiScaling = false;
+    const double minDiag = 1e-6;
+    const double maxDiag = 1e32;
+
+    const double initialLambdaLevMarq = 0.0001;
+    const double initialLmUpFactor = 2.0;
+    const double initialLmDownFactor = 3.0;
+
+    // finish reasons
+    bool tooLong          = false; // => not found
+    bool smallGradient    = false; // => found
+    bool smallStep        = false; // => found
+    bool smallEnergyDelta = false; // => found
+
+    // column scale inverted, for jacobian scaling
+    std::vector<double> di(nVars);
+
+    double lmUpFactor = initialLmUpFactor;
+    double lambdaLevMarq = initialLambdaLevMarq;
+
+    unsigned int iter = 0;
+    bool done = false;
+    while (!done)
+    {
+        jtj.clear();
+        std::fill(jtb.begin(), jtb.end(), 0.0);
+
+        // caching nodes jacobians
+        std::vector<cv::Matx<double, 7, 6>> cachedJac;
+        for (auto id : placesIds)
+        {
+            Pose3d p = nodes.at(id).pose;
+            Matx43d qj = expQuatJacobian(p.q);
+            // x node layout is (rot_x, rot_y, rot_z, trans_x, trans_y, trans_z)
+            // pose layout is (q_w, q_x, q_y, q_z, trans_x, trans_y, trans_z)
+            Matx<double, 7, 6> j = concatVert(concatHor(qj, Matx43d()),
+                                              concatHor(Matx33d(), Matx33d::eye()));
+            cachedJac.push_back(j);
+        }
+
+        // fill jtj and jtb
+        for (const auto& e : edges)
+        {
+            size_t srcId = e.sourceNodeId, dstId = e.targetNodeId;
+            const Node& srcNode = nodes.at(srcId);
+            const Node& dstNode = nodes.at(dstId);
+
+            Pose3d srcP = srcNode.pose;
+            Pose3d tgtP = dstNode.pose;
+            bool srcFixed = srcNode.isFixed;
+            bool dstFixed = dstNode.isFixed;
+
+            Vec6d res;
+            Matx<double, 6, 3> stj, ttj;
+            Matx<double, 6, 4> sqj, tqj;
+            poseError(srcP.q, srcP.t, tgtP.q, tgtP.t, e.pose.q, e.pose.t, e.sqrtInfo,
+                      /* needJacobians = */ true, sqj, stj, tqj, ttj, res);
+
+            size_t srcPlace = (size_t)(-1), dstPlace = (size_t)(-1);
+            Matx66d sj, tj;
+            if (!srcFixed)
+            {
+                srcPlace = idToPlace.at(srcId);
+                sj = concatHor(sqj, stj) * cachedJac[srcPlace];
+
+                jtj.refBlock(srcPlace, srcPlace) += sj.t() * sj;
+
+                Vec6f jtbSrc = sj.t() * res;
+                for (int i = 0; i < 6; i++)
+                {
+                    jtb[6 * srcPlace + i] += -jtbSrc[i];
+                }
+            }
+
+            if (!dstFixed)
+            {
+                dstPlace = idToPlace.at(dstId);
+                tj = concatHor(tqj, ttj) * cachedJac[dstPlace];
+
+                jtj.refBlock(dstPlace, dstPlace) += tj.t() * tj;
+
+                Vec6f jtbDst = tj.t() * res;
+                for (int i = 0; i < 6; i++)
+                {
+                    jtb[6 * dstPlace + i] += -jtbDst[i];
+                }
+            }
+
+            if (!(srcFixed || dstFixed))
+            {
+                Matx66d sjttj = sj.t() * tj;
+                jtj.refBlock(srcPlace, dstPlace) += sjttj;
+                jtj.refBlock(dstPlace, srcPlace) += sjttj.t();
+            }
+        }
+
+        CV_LOG_INFO(NULL, "#LM#s" << " energy: " << energy);
+
+        // do the jacobian conditioning improvement used in Ceres
+        if (jacobiScaling)
+        {
+            // L2-normalize each jacobian column
+            // vec d = {d_j = sum(J_ij^2) for each column j of J} = get_diag{ J^T * J }
+            // di = { 1/(1+sqrt(d_j)) }, extra +1 to avoid div by zero
+            if (iter == 0)
+            {
+                for (size_t i = 0; i < nVars; i++)
+                {
+                    double ds = sqrt(jtj.valElem(i, i)) + 1.0;
+                    di[i] = 1.0 / ds;
+                }
+            }
+
+            doJacobiScaling(jtj, jtb, di);
+        }
+
+        double gradientMax = 0.0;
+        // gradient max
+        for (size_t i = 0; i < nVars; i++)
+        {
+            gradientMax = std::max(gradientMax, abs(jtb[i]));
+        }
+
+        // Save original diagonal of jtj matrix for LevMarq
+        std::vector<double> diag(nVars);
+        for (size_t i = 0; i < nVars; i++)
+        {
+            diag[i] = jtj.valElem(i, i);
+        }
+
+        // Solve using LevMarq and get delta transform
+        bool enoughLm = false;
+
+        decltype(nodes) tempNodes = nodes;
+
+        while (!enoughLm && !done)
+        {
+            // form LevMarq matrix
+            std::vector<double> lmDiag(nVars);
+            for (size_t i = 0; i < nVars; i++)
+            {
+                double v = diag[i];
+                double ld = std::min(max(v * lambdaLevMarq, minDiag), maxDiag);
+                lmDiag[i] = ld;
+                jtj.refElem(i, i) = v + ld;
+            }
+
+            CV_LOG_INFO(NULL, "sparse solve...");
+
+            // use double or convert everything to float
+            std::vector<double> x;
+            bool solved = jtj.sparseSolve(jtb, x, false);
+
+            CV_LOG_INFO(NULL, (solved ? "OK" : "FAIL"));
+
+            double costChange = 0.0;
+            double jacCostChange = 0.0;
+            double stepQuality = 0.0;
+            double xNorm2 = 0.0;
+            if (solved)
+            {
+                jacCostChange = calcJacCostChange(jtb, x, lmDiag);
+
+                // x squared norm
+                for (size_t i = 0; i < nVars; i++)
+                {
+                    xNorm2 += x[i] * x[i];
+                }
+
+                // undo jacobi scaling
+                if (jacobiScaling)
+                {
+                    for (size_t i = 0; i < nVars; i++)
+                    {
+                        x[i] *= di[i];
+                    }
+                }
+
+                tempNodes = nodes;
+
+                // Update temp nodes using x
+                for (size_t i = 0; i < nVarNodes; i++)
+                {
+                    Vec6d dx(&x[i * 6]);
+                    Vec3d deltaRot(dx[0], dx[1], dx[2]), deltaTrans(dx[3], dx[4], dx[5]);
+                    Pose3d& p = tempNodes.at(placesIds[i]).pose;
+
+                    p.q = Quatd(0, deltaRot[0], deltaRot[1], deltaRot[2]).exp() * p.q;
+                    p.t += deltaTrans;
+                }
+
+                // calc energy with temp nodes
+                energy = calcEnergyNodes(tempNodes);
+
+                costChange = oldEnergy - energy;
+
+                stepQuality = costChange / jacCostChange;
+
+                CV_LOG_INFO(NULL, "#LM#" << iter
+                               << " energy: " << energy
+                               << " deltaEnergy: " << costChange
+                               << " deltaEqEnergy: " << jacCostChange
+                               << " max(J^T*b): " << gradientMax
+                               << " norm2(x): " << xNorm2
+                               << " deltaEnergy/energy: " << costChange / energy);
+            }
+
+            if (!solved || costChange < 0)
+            {
+                // failed to optimize, increase lambda and repeat
+
+                lambdaLevMarq *= lmUpFactor;
+                lmUpFactor *= 2.0;
+
+                CV_LOG_INFO(NULL, "LM goes up, lambda: " << lambdaLevMarq << ", old energy: " << oldEnergy);
+            }
+            else
+            {
+                // optimized successfully, decrease lambda and set variables for next iteration
+                enoughLm = true;
+
+                lambdaLevMarq *= std::max(1.0 / initialLmDownFactor, 1.0 - pow(2.0 * stepQuality - 1.0, 3));
+                lmUpFactor = initialLmUpFactor;
+
+                smallGradient = (gradientMax < minGradientTolerance);
+                smallStep = (xNorm2 < stepNorm2Tolerance);
+                smallEnergyDelta = (costChange / energy < relEnergyDeltaTolerance);
+
+                nodes = tempNodes;
+
+                CV_LOG_INFO(NULL, "#" << iter << " energy: " << energy);
+
+                oldEnergy = energy;
+
+                CV_LOG_INFO(NULL, "LM goes down, lambda: " << lambdaLevMarq << " step quality: " << stepQuality);
+            }
+
+            iter++;
+
+            tooLong = (iter >= maxIterations);
+
+            done = ( (checkIterations && tooLong) || smallGradient || smallStep || (checkEps && smallEnergyDelta) );
+        }
+    }
+
+    // if maxIterations is given as a stop criteria, let's consider tooLong as a successful finish
+    bool found = (smallGradient || smallStep || smallEnergyDelta || (checkIterations && tooLong));
+
+    CV_LOG_INFO(NULL, "Finished: " << (found ? "" : "not") << "found");
+    if (smallGradient)
+        CV_LOG_INFO(NULL, "Finish reason: gradient max val dropped below threshold");
+    if (smallStep)
+        CV_LOG_INFO(NULL, "Finish reason: step size dropped below threshold");
+    if (smallEnergyDelta)
+        CV_LOG_INFO(NULL, "Finish reason: relative energy change between iterations dropped below threshold");
+    if (tooLong)
+        CV_LOG_INFO(NULL, "Finish reason: max number of iterations reached");
+
+    return (found ? iter : -1);
+}
+
+#else
+int PoseGraphImpl::optimize(const cv::TermCriteria& /*tc*/)
+{
+    CV_Error(Error::StsNotImplemented, "Eigen library required for sparse matrix solve during pose graph optimization, dense solver is not implemented");
+}
+#endif
+
+
+Ptr<detail::PoseGraph> detail::PoseGraph::create()
+{
+    return makePtr<PoseGraphImpl>();
+}
+
+PoseGraph::~PoseGraph() { }
+
+}  // namespace detail
+}  // namespace cv
diff --git a/modules/3d/src/rgbd/sparse_block_matrix.hpp b/modules/3d/src/rgbd/sparse_block_matrix.hpp
new file mode 100644
index 0000000000..07d442f1cf
--- /dev/null
+++ b/modules/3d/src/rgbd/sparse_block_matrix.hpp
@@ -0,0 +1,197 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_3D_SPARSE_BLOCK_MATRIX_HPP
+#define OPENCV_3D_SPARSE_BLOCK_MATRIX_HPP
+
+#include "../precomp.hpp"
+
+#if defined(HAVE_EIGEN)
+#include <Eigen/Core>
+#include <Eigen/Sparse>
+#include <Eigen/SparseCholesky>
+
+#include "opencv2/core/eigen.hpp"
+#endif
+
+namespace cv
+{
+/*!
+ * \class BlockSparseMat
+ * Naive implementation of Sparse Block Matrix
+ */
+template<typename _Tp, size_t blockM, size_t blockN>
+struct BlockSparseMat
+{
+    struct Point2iHash
+    {
+        size_t operator()(const cv::Point2i& point) const noexcept
+        {
+            size_t seed                     = 0;
+            constexpr uint32_t GOLDEN_RATIO = 0x9e3779b9;
+            seed ^= std::hash<int>()(point.x) + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+            seed ^= std::hash<int>()(point.y) + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+            return seed;
+        }
+    };
+    typedef Matx<_Tp, blockM, blockN> MatType;
+    typedef std::unordered_map<Point2i, MatType, Point2iHash> IDtoBlockValueMap;
+
+    BlockSparseMat(size_t _nBlocks) : nBlocks(_nBlocks), ijValue() {}
+
+    void clear()
+    {
+        ijValue.clear();
+    }
+
+    inline MatType& refBlock(size_t i, size_t j)
+    {
+        Point2i p((int)i, (int)j);
+        auto it = ijValue.find(p);
+        if (it == ijValue.end())
+        {
+            it = ijValue.insert({ p, MatType::zeros() }).first;
+        }
+        return it->second;
+    }
+
+    inline _Tp& refElem(size_t i, size_t j)
+    {
+        Point2i ib((int)(i / blockM), (int)(j / blockN));
+        Point2i iv((int)(i % blockM), (int)(j % blockN));
+        return refBlock(ib.x, ib.y)(iv.x, iv.y);
+    }
+
+    inline MatType valBlock(size_t i, size_t j) const
+    {
+        Point2i p((int)i, (int)j);
+        auto it = ijValue.find(p);
+        if (it == ijValue.end())
+            return MatType::zeros();
+        else
+            return it->second;
+    }
+
+    inline _Tp valElem(size_t i, size_t j) const
+    {
+        Point2i ib((int)(i / blockM), (int)(j / blockN));
+        Point2i iv((int)(i % blockM), (int)(j % blockN));
+        return valBlock(ib.x, ib.y)(iv.x, iv.y);
+    }
+
+    Mat diagonal() const
+    {
+        // Diagonal max length is the number of columns in the sparse matrix
+        int diagLength = blockN * nBlocks;
+        cv::Mat diag   = cv::Mat::zeros(diagLength, 1, cv::DataType<_Tp>::type);
+
+        for (int i = 0; i < diagLength; i++)
+        {
+            diag.at<_Tp>(i, 0) = valElem(i, i);
+        }
+        return diag;
+    }
+
+#if defined(HAVE_EIGEN)
+    Eigen::SparseMatrix<_Tp> toEigen() const
+    {
+        std::vector<Eigen::Triplet<_Tp>> tripletList;
+        tripletList.reserve(ijValue.size() * blockM * blockN);
+        for (const auto& ijv : ijValue)
+        {
+            int xb = ijv.first.x, yb = ijv.first.y;
+            MatType vblock = ijv.second;
+            for (size_t i = 0; i < blockM; i++)
+            {
+                for (size_t j = 0; j < blockN; j++)
+                {
+                    _Tp val = vblock((int)i, (int)j);
+                    if (abs(val) >= NON_ZERO_VAL_THRESHOLD)
+                    {
+                        tripletList.push_back(Eigen::Triplet<_Tp>((int)(blockM * xb + i), (int)(blockN * yb + j), val));
+                    }
+                }
+            }
+        }
+        Eigen::SparseMatrix<_Tp> EigenMat(blockM * nBlocks, blockN * nBlocks);
+        EigenMat.setFromTriplets(tripletList.begin(), tripletList.end());
+        EigenMat.makeCompressed();
+
+        return EigenMat;
+    }
+#endif
+    inline size_t nonZeroBlocks() const { return ijValue.size(); }
+
+    BlockSparseMat<_Tp, blockM, blockN>& operator+=(const BlockSparseMat<_Tp, blockM, blockN>& other)
+    {
+        for (const auto& oijv : other.ijValue)
+        {
+            Point2i p = oijv.first;
+            MatType vblock = oijv.second;
+            this->refBlock(p.x, p.y) += vblock;
+        }
+
+        return *this;
+    }
+
+#if defined(HAVE_EIGEN)
+    //! Function to solve a sparse linear system of equations HX = B
+    //! Requires Eigen
+    bool sparseSolve(InputArray B, OutputArray X, bool checkSymmetry = true, OutputArray predB = cv::noArray()) const
+    {
+        Eigen::SparseMatrix<_Tp> bigA = toEigen();
+        Mat mb = B.getMat().t();
+        Eigen::Matrix<_Tp, -1, 1> bigB;
+        cv2eigen(mb, bigB);
+
+        Eigen::SparseMatrix<_Tp> bigAtranspose = bigA.transpose();
+        if(checkSymmetry && !bigA.isApprox(bigAtranspose))
+        {
+            CV_Error(Error::StsBadArg, "H matrix is not symmetrical");
+            return false;
+        }
+
+        Eigen::SimplicialLDLT<Eigen::SparseMatrix<_Tp>> solver;
+
+        solver.compute(bigA);
+        if (solver.info() != Eigen::Success)
+        {
+            CV_LOG_INFO(NULL, "Failed to eigen-decompose");
+            return false;
+        }
+        else
+        {
+            Eigen::Matrix<_Tp, -1, 1> solutionX = solver.solve(bigB);
+            if (solver.info() != Eigen::Success)
+            {
+                CV_LOG_INFO(NULL, "Failed to eigen-solve");
+                return false;
+            }
+            else
+            {
+                eigen2cv(solutionX, X);
+                if (predB.needed())
+                {
+                    Eigen::Matrix<_Tp, -1, 1> predBEigen = bigA * solutionX;
+                    eigen2cv(predBEigen, predB);
+                }
+                return true;
+            }
+        }
+    }
+#else
+    bool sparseSolve(InputArray /*B*/, OutputArray /*X*/, bool /*checkSymmetry*/ = true, OutputArray /*predB*/ = cv::noArray()) const
+    {
+        CV_Error(Error::StsNotImplemented, "Eigen library required for matrix solve, dense solver is not implemented");
+    }
+#endif
+
+    static constexpr _Tp NON_ZERO_VAL_THRESHOLD = _Tp(0.0001);
+    size_t nBlocks;
+    IDtoBlockValueMap ijValue;
+};
+
+}  // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/src/rgbd/tsdf.cpp b/modules/3d/src/rgbd/tsdf.cpp
new file mode 100644
index 0000000000..7047e2670e
--- /dev/null
+++ b/modules/3d/src/rgbd/tsdf.cpp
@@ -0,0 +1,1228 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#include "../precomp.hpp"
+#include "tsdf.hpp"
+#include "opencl_kernels_3d.hpp"
+
+namespace cv {
+
+class TSDFVolumeCPU : public TSDFVolume
+{
+   public:
+    // dimension in voxels, size in meters
+    TSDFVolumeCPU(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                  int _maxWeight, Vec3i _resolution, bool zFirstMemOrder = true);
+
+    virtual void integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose,
+                           const Matx33f& intrinsics, const int frameId = 0) override;
+    virtual void raycast(const Matx44f& cameraPose, const Matx33f& intrinsics, const Size& frameSize,
+                         OutputArray points, OutputArray normals) const override;
+    virtual void integrate(InputArray, InputArray, float, const Matx44f&, const Matx33f&, const Matx33f&, const int) override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); };
+    virtual void raycast(const Matx44f&, const Matx33f&, const Size&, OutputArray, OutputArray, OutputArray) const override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); };
+
+    virtual void fetchNormals(InputArray points, OutputArray _normals) const override;
+    virtual void fetchPointsNormals(OutputArray points, OutputArray normals) const override;
+
+    virtual void reset() override;
+    virtual TsdfVoxel at(const Vec3i& volumeIdx) const;
+
+    float interpolateVoxel(const cv::Point3f& p) const;
+    Point3f getNormalVoxel(const cv::Point3f& p) const;
+
+#if USE_INTRINSICS
+    float interpolateVoxel(const v_float32x4& p) const;
+    v_float32x4 getNormalVoxel(const v_float32x4& p) const;
+#endif
+    Vec4i volStrides;
+    Vec6f frameParams;
+    Mat pixNorms;
+    // See zFirstMemOrder arg of parent class constructor
+    // for the array layout info
+    // Consist of Voxel elements
+    Mat volume;
+};
+
+
+TSDFVolume::TSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                       int _maxWeight, Point3i _resolution, bool zFirstMemOrder)
+    : Volume(_voxelSize, _pose, _raycastStepFactor),
+      volResolution(_resolution),
+      maxWeight( WeightType(_maxWeight) )
+{
+    CV_Assert(_maxWeight < 255);
+    // Unlike original code, this should work with any volume size
+    // Not only when (x,y,z % 32) == 0
+    volSize   = Point3f(volResolution) * voxelSize;
+    truncDist = std::max(_truncDist, 2.1f * voxelSize);
+
+    // (xRes*yRes*zRes) array
+    // Depending on zFirstMemOrder arg:
+    // &elem(x, y, z) = data + x*zRes*yRes + y*zRes + z;
+    // &elem(x, y, z) = data + x + y*xRes + z*xRes*yRes;
+    int xdim, ydim, zdim;
+    if(zFirstMemOrder)
+    {
+        xdim = volResolution.z * volResolution.y;
+        ydim = volResolution.z;
+        zdim = 1;
+    }
+    else
+    {
+        xdim = 1;
+        ydim = volResolution.x;
+        zdim = volResolution.x * volResolution.y;
+    }
+
+    volDims = Vec4i(xdim, ydim, zdim);
+    neighbourCoords = Vec8i(
+        volDims.dot(Vec4i(0, 0, 0)),
+        volDims.dot(Vec4i(0, 0, 1)),
+        volDims.dot(Vec4i(0, 1, 0)),
+        volDims.dot(Vec4i(0, 1, 1)),
+        volDims.dot(Vec4i(1, 0, 0)),
+        volDims.dot(Vec4i(1, 0, 1)),
+        volDims.dot(Vec4i(1, 1, 0)),
+        volDims.dot(Vec4i(1, 1, 1))
+    );
+}
+
+// dimension in voxels, size in meters
+TSDFVolumeCPU::TSDFVolumeCPU(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor,
+                             float _truncDist, int _maxWeight, Vec3i _resolution,
+                             bool zFirstMemOrder)
+    : TSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _resolution,
+                 zFirstMemOrder)
+{
+    int xdim, ydim, zdim;
+    if (zFirstMemOrder)
+    {
+        xdim = volResolution.z * volResolution.y;
+        ydim = volResolution.z;
+        zdim = 1;
+    }
+    else
+    {
+        xdim = 1;
+        ydim = volResolution.x;
+        zdim = volResolution.x * volResolution.y;
+    }
+    volStrides = Vec4i(xdim, ydim, zdim);
+
+    volume = Mat(1, volResolution.x * volResolution.y * volResolution.z, rawType<TsdfVoxel>());
+
+    reset();
+}
+
+// zero volume, leave rest params the same
+void TSDFVolumeCPU::reset()
+{
+    CV_TRACE_FUNCTION();
+
+    volume.forEach<VecTsdfVoxel>([](VecTsdfVoxel& vv, const int* /* position */)
+    {
+        TsdfVoxel& v = reinterpret_cast<TsdfVoxel&>(vv);
+        v.tsdf = floatToTsdf(0.0f); v.weight = 0;
+    });
+}
+
+TsdfVoxel TSDFVolumeCPU::at(const Vec3i& volumeIdx) const
+{
+    //! Out of bounds
+    if ((volumeIdx[0] >= volResolution.x || volumeIdx[0] < 0) ||
+        (volumeIdx[1] >= volResolution.y || volumeIdx[1] < 0) ||
+        (volumeIdx[2] >= volResolution.z || volumeIdx[2] < 0))
+    {
+        return TsdfVoxel(floatToTsdf(1.f), 0);
+    }
+
+    const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
+    int coordBase =
+        volumeIdx[0] * volDims[0] + volumeIdx[1] * volDims[1] + volumeIdx[2] * volDims[2];
+    return volData[coordBase];
+}
+
+// use depth instead of distance (optimization)
+void TSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose,
+                              const Matx33f& _intrinsics, const int frameId)
+{
+    CV_TRACE_FUNCTION();
+    CV_UNUSED(frameId);
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+    CV_Assert(!_depth.empty());
+    Depth depth = _depth.getMat();
+
+    Intr intrinsics(_intrinsics);
+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+                    intrinsics.fx, intrinsics.fy,
+                    intrinsics.cx, intrinsics.cy);
+    if (!(frameParams == newParams))
+    {
+        frameParams = newParams;
+        pixNorms = preCalculationPixNorm(depth.size(), intrinsics);
+    }
+
+    integrateVolumeUnit(truncDist, voxelSize, maxWeight, (this->pose).matrix, volResolution, volStrides, depth,
+        depthFactor, cameraPose, intrinsics, pixNorms, volume);
+}
+
+#if USE_INTRINSICS
+// all coordinate checks should be done in inclosing cycle
+inline float TSDFVolumeCPU::interpolateVoxel(const Point3f& _p) const
+{
+    v_float32x4 p(_p.x, _p.y, _p.z, 0);
+    return interpolateVoxel(p);
+}
+
+inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
+{
+    // tx, ty, tz = floor(p)
+    v_int32x4 ip  = v_floor(p);
+    v_float32x4 t = p - v_cvt_f32(ip);
+    float tx      = t.get0();
+    t             = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+    float ty      = t.get0();
+    t             = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+    float tz      = t.get0();
+
+    int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+    const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
+
+    int ix = ip.get0();
+    ip     = v_rotate_right<1>(ip);
+    int iy = ip.get0();
+    ip     = v_rotate_right<1>(ip);
+    int iz = ip.get0();
+
+    int coordBase = ix*xdim + iy*ydim + iz*zdim;
+
+    TsdfType vx[8];
+    for(int i = 0; i < 8; i++)
+        vx[i] = volData[neighbourCoords[i] + coordBase].tsdf;
+
+    v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
+    v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
+    v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246);
+
+    v_float32x4 v00_10 = vxx;
+    v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
+
+    v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10);
+    float v0         = v0_1.get0();
+    v0_1             = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
+    float v1         = v0_1.get0();
+
+    return v0 + tx*(v1 - v0);
+}
+#else
+inline float TSDFVolumeCPU::interpolateVoxel(const Point3f& p) const
+{
+    int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+
+    int ix = cvFloor(p.x);
+    int iy = cvFloor(p.y);
+    int iz = cvFloor(p.z);
+
+    float tx = p.x - ix;
+    float ty = p.y - iy;
+    float tz = p.z - iz;
+
+    int coordBase = ix*xdim + iy*ydim + iz*zdim;
+    const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
+
+    float vx[8];
+    for (int i = 0; i < 8; i++)
+        vx[i] = tsdfToFloat(volData[neighbourCoords[i] + coordBase].tsdf);
+
+    float v00 = vx[0] + tz*(vx[1] - vx[0]);
+    float v01 = vx[2] + tz*(vx[3] - vx[2]);
+    float v10 = vx[4] + tz*(vx[5] - vx[4]);
+    float v11 = vx[6] + tz*(vx[7] - vx[6]);
+
+    float v0 = v00 + ty*(v01 - v00);
+    float v1 = v10 + ty*(v11 - v10);
+
+    return v0 + tx*(v1 - v0);
+
+}
+#endif
+
+#if USE_INTRINSICS
+//gradientDeltaFactor is fixed at 1.0 of voxel size
+inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const
+{
+    v_float32x4 p(_p.x, _p.y, _p.z, 0.f);
+    v_float32x4 result = getNormalVoxel(p);
+    float CV_DECL_ALIGNED(16) ares[4];
+    v_store_aligned(ares, result);
+    return Point3f(ares[0], ares[1], ares[2]);
+}
+
+inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
+{
+    if(v_check_any (p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
+       v_check_any (p >= v_float32x4((float)(volResolution.x-2),
+                                     (float)(volResolution.y-2),
+                                     (float)(volResolution.z-2), 1.f))
+                   )
+        return nanv;
+
+    v_int32x4 ip  = v_floor(p);
+    v_float32x4 t = p - v_cvt_f32(ip);
+    float tx      = t.get0();
+    t             = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+    float ty      = t.get0();
+    t             = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+    float tz      = t.get0();
+
+    const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+    const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
+
+    int ix = ip.get0(); ip = v_rotate_right<1>(ip);
+    int iy = ip.get0(); ip = v_rotate_right<1>(ip);
+    int iz = ip.get0();
+
+    int coordBase = ix*xdim + iy*ydim + iz*zdim;
+
+    float CV_DECL_ALIGNED(16) an[4];
+    an[0] = an[1] = an[2] = an[3] = 0.f;
+    for(int c = 0; c < 3; c++)
+    {
+        const int dim = volDims[c];
+        float& nv     = an[c];
+
+        float vx[8];
+        for(int i = 0; i < 8; i++)
+            vx[i] = tsdfToFloat(volData[neighbourCoords[i] + coordBase + 1*dim].tsdf) -
+                    tsdfToFloat(volData[neighbourCoords[i] + coordBase - 1*dim].tsdf);
+
+        v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]);
+        v_float32x4 v1357 (vx[1], vx[3], vx[5], vx[7]);
+        v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246);
+
+        v_float32x4 v00_10 = vxx;
+        v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
+
+        v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10);
+        float v0         = v0_1.get0();
+        v0_1             = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
+        float v1         = v0_1.get0();
+
+        nv = v0 + tx*(v1 - v0);
+    }
+
+    v_float32x4 n       = v_load_aligned(an);
+    v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n*n)));
+
+    return Norm.get0() < 0.0001f ? nanv : n/Norm;
+}
+#else
+inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const
+{
+    const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
+    const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
+
+    if(p.x < 1 || p.x >= volResolution.x - 2 ||
+       p.y < 1 || p.y >= volResolution.y - 2 ||
+       p.z < 1 || p.z >= volResolution.z - 2)
+        return nan3;
+
+    int ix = cvFloor(p.x);
+    int iy = cvFloor(p.y);
+    int iz = cvFloor(p.z);
+
+    float tx = p.x - ix;
+    float ty = p.y - iy;
+    float tz = p.z - iz;
+
+    int coordBase = ix*xdim + iy*ydim + iz*zdim;
+
+    Vec3f an;
+    for(int c = 0; c < 3; c++)
+    {
+        const int dim = volDims[c];
+        float& nv = an[c];
+
+        float vx[8];
+        for (int i = 0; i < 8; i++)
+            vx[i] = tsdfToFloat(volData[neighbourCoords[i] + coordBase + 1 * dim].tsdf) -
+                    tsdfToFloat(volData[neighbourCoords[i] + coordBase - 1 * dim].tsdf);
+
+        float v00 = vx[0] + tz*(vx[1] - vx[0]);
+        float v01 = vx[2] + tz*(vx[3] - vx[2]);
+        float v10 = vx[4] + tz*(vx[5] - vx[4]);
+        float v11 = vx[6] + tz*(vx[7] - vx[6]);
+
+        float v0 = v00 + ty*(v01 - v00);
+        float v1 = v10 + ty*(v11 - v10);
+
+        nv = v0 + tx*(v1 - v0);
+    }
+
+    float nv = sqrt(an[0] * an[0] +
+                    an[1] * an[1] +
+                    an[2] * an[2]);
+    return nv < 0.0001f ? nan3 : an / nv;
+}
+#endif
+
+struct RaycastInvoker : ParallelLoopBody
+{
+    RaycastInvoker(Points& _points, Normals& _normals, const Matx44f& cameraPose,
+                  const Intr& intrinsics, const TSDFVolumeCPU& _volume) :
+        ParallelLoopBody(),
+        points(_points),
+        normals(_normals),
+        volume(_volume),
+        tstep(volume.truncDist * volume.raycastStepFactor),
+        // We do subtract voxel size to minimize checks after
+        // Note: origin of volume coordinate is placed
+        // in the center of voxel (0,0,0), not in the corner of the voxel!
+        boxMax(volume.volSize - Point3f(volume.voxelSize,
+                                        volume.voxelSize,
+                                        volume.voxelSize)),
+        boxMin(),
+        cam2vol(volume.pose.inv() * Affine3f(cameraPose)),
+        vol2cam(Affine3f(cameraPose.inv()) * volume.pose),
+        reproj(intrinsics.makeReprojector())
+    {  }
+
+#if USE_INTRINSICS
+    virtual void operator() (const Range& range) const override
+    {
+        const v_float32x4 vfxy(reproj.fxinv, reproj.fyinv, 0, 0);
+        const v_float32x4 vcxy(reproj.cx, reproj.cy, 0, 0);
+
+        const float (&cm)[16] = cam2vol.matrix.val;
+        const v_float32x4 camRot0(cm[0], cm[4], cm[8], 0);
+        const v_float32x4 camRot1(cm[1], cm[5], cm[9], 0);
+        const v_float32x4 camRot2(cm[2], cm[6], cm[10], 0);
+        const v_float32x4 camTrans(cm[3], cm[7], cm[11], 0);
+
+        const v_float32x4 boxDown(boxMin.x, boxMin.y, boxMin.z, 0.f);
+        const v_float32x4 boxUp(boxMax.x, boxMax.y, boxMax.z, 0.f);
+
+        const v_float32x4 invVoxelSize = v_float32x4(volume.voxelSizeInv,
+                                                     volume.voxelSizeInv,
+                                                     volume.voxelSizeInv, 1.f);
+
+        const float (&vm)[16] = vol2cam.matrix.val;
+        const v_float32x4 volRot0(vm[0], vm[4], vm[8], 0);
+        const v_float32x4 volRot1(vm[1], vm[5], vm[9], 0);
+        const v_float32x4 volRot2(vm[2], vm[6], vm[10], 0);
+        const v_float32x4 volTrans(vm[3], vm[7], vm[11], 0);
+
+        for(int y = range.start; y < range.end; y++)
+        {
+            ptype* ptsRow = points[y];
+            ptype* nrmRow = normals[y];
+
+            for(int x = 0; x < points.cols; x++)
+            {
+                v_float32x4 point = nanv, normal = nanv;
+
+                v_float32x4 orig = camTrans;
+
+                // get direction through pixel in volume space:
+
+                // 1. reproject (x, y) on projecting plane where z = 1.f
+                v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy;
+                planed             = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
+
+                // 2. rotate to volume space
+                planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
+
+                // 3. normalize
+                v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed)));
+                v_float32x4 dir     = planed*invNorm;
+
+                // compute intersection of ray with all six bbox planes
+                v_float32x4 rayinv = v_setall_f32(1.f)/dir;
+                // div by zero should be eliminated by these products
+                v_float32x4 tbottom = rayinv*(boxDown - orig);
+                v_float32x4 ttop    = rayinv*(boxUp   - orig);
+
+                // re-order intersections to find smallest and largest on each axis
+                v_float32x4 minAx = v_min(ttop, tbottom);
+                v_float32x4 maxAx = v_max(ttop, tbottom);
+
+                // near clipping plane
+                const float clip = 0.f;
+                float _minAx[4], _maxAx[4];
+                v_store(_minAx, minAx);
+                v_store(_maxAx, maxAx);
+                float tmin       = max( {_minAx[0], _minAx[1], _minAx[2], clip} );
+                float tmax       = min( {_maxAx[0], _maxAx[1], _maxAx[2]} );
+
+                // precautions against getting coordinates out of bounds
+                tmin = tmin + tstep;
+                tmax = tmax - tstep;
+
+                if(tmin < tmax)
+                {
+                    // interpolation optimized a little
+                    orig *= invVoxelSize;
+                    dir  *= invVoxelSize;
+
+                    int xdim            = volume.volDims[0];
+                    int ydim            = volume.volDims[1];
+                    int zdim            = volume.volDims[2];
+                    v_float32x4 rayStep = dir * v_setall_f32(tstep);
+                    v_float32x4 next    = (orig + dir * v_setall_f32(tmin));
+                    float f = volume.interpolateVoxel(next), fnext = f;
+
+                    //raymarch
+                    int steps = 0;
+                    int nSteps = cvFloor((tmax - tmin)/tstep);
+                    for(; steps < nSteps; steps++)
+                    {
+                        next += rayStep;
+                        v_int32x4 ip = v_round(next);
+                        int ix = ip.get0(); ip = v_rotate_right<1>(ip);
+                        int iy = ip.get0(); ip = v_rotate_right<1>(ip);
+                        int iz = ip.get0();
+                        int coord = ix*xdim + iy*ydim + iz*zdim;
+
+                        fnext = tsdfToFloat(volume.volume.at<TsdfVoxel>(coord).tsdf);
+                        if(fnext != f)
+                        {
+                            fnext = volume.interpolateVoxel(next);
+
+                            // when ray crosses a surface
+                            if(std::signbit(f) != std::signbit(fnext))
+                                break;
+
+                            f = fnext;
+                        }
+                    }
+
+                    // if ray penetrates a surface from outside
+                    // linearly interpolate t between two f values
+                    if(f > 0.f && fnext < 0.f)
+                    {
+                        v_float32x4 tp = next - rayStep;
+                        float ft    = volume.interpolateVoxel(tp);
+                        float ftdt  = volume.interpolateVoxel(next);
+                        float ts = tmin + tstep*(steps - ft/(ftdt - ft));
+
+                        // avoid division by zero
+                        if(!cvIsNaN(ts) && !cvIsInf(ts))
+                        {
+                            v_float32x4 pv = (orig + dir*v_setall_f32(ts));
+                            v_float32x4 nv = volume.getNormalVoxel(pv);
+
+                            if(!isNaN(nv))
+                            {
+                                //convert pv and nv to camera space
+                                normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32());
+                                // interpolation optimized a little
+                                point = v_matmuladd(pv*v_float32x4(volume.voxelSize,
+                                                                   volume.voxelSize,
+                                                                   volume.voxelSize, 1.f),
+                                                    volRot0, volRot1, volRot2, volTrans);
+                            }
+                        }
+                    }
+                }
+
+                v_store((float*)(&ptsRow[x]), point);
+                v_store((float*)(&nrmRow[x]), normal);
+            }
+        }
+    }
+#else
+    virtual void operator() (const Range& range) const override
+    {
+        const Point3f camTrans = cam2vol.translation();
+        const Matx33f  camRot  = cam2vol.rotation();
+        const Matx33f  volRot  = vol2cam.rotation();
+
+        for(int y = range.start; y < range.end; y++)
+        {
+            ptype* ptsRow = points[y];
+            ptype* nrmRow = normals[y];
+
+            for(int x = 0; x < points.cols; x++)
+            {
+                Point3f point = nan3, normal = nan3;
+
+                Point3f orig = camTrans;
+                // direction through pixel in volume space
+                Point3f dir = normalize(Vec3f(camRot * reproj(Point3f(float(x), float(y), 1.f))));
+
+                // compute intersection of ray with all six bbox planes
+                Vec3f rayinv(1.f/dir.x, 1.f/dir.y, 1.f/dir.z);
+                Point3f tbottom = rayinv.mul(boxMin - orig);
+                Point3f ttop    = rayinv.mul(boxMax - orig);
+
+                // re-order intersections to find smallest and largest on each axis
+                Point3f minAx(min(ttop.x, tbottom.x), min(ttop.y, tbottom.y), min(ttop.z, tbottom.z));
+                Point3f maxAx(max(ttop.x, tbottom.x), max(ttop.y, tbottom.y), max(ttop.z, tbottom.z));
+
+                // near clipping plane
+                const float clip = 0.f;
+                //float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip);
+                //float tmax =     min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z));
+                float tmin = max({ minAx.x, minAx.y, minAx.z, clip });
+                float tmax = min({ maxAx.x, maxAx.y, maxAx.z });
+
+                // precautions against getting coordinates out of bounds
+                tmin = tmin + tstep;
+                tmax = tmax - tstep;
+
+                if(tmin < tmax)
+                {
+                    // interpolation optimized a little
+                    orig = orig*volume.voxelSizeInv;
+                    dir  =  dir*volume.voxelSizeInv;
+
+                    Point3f rayStep = dir * tstep;
+                    Point3f next = (orig + dir * tmin);
+                    float f = volume.interpolateVoxel(next), fnext = f;
+
+                    //raymarch
+                    int steps = 0;
+                    int nSteps = int(floor((tmax - tmin)/tstep));
+                    for(; steps < nSteps; steps++)
+                    {
+                        next += rayStep;
+                        int xdim = volume.volDims[0];
+                        int ydim = volume.volDims[1];
+                        int zdim = volume.volDims[2];
+                        int ix = cvRound(next.x);
+                        int iy = cvRound(next.y);
+                        int iz = cvRound(next.z);
+                        fnext = tsdfToFloat(volume.volume.at<TsdfVoxel>(ix*xdim + iy*ydim + iz*zdim).tsdf);
+                        if(fnext != f)
+                        {
+                            fnext = volume.interpolateVoxel(next);
+                            // when ray crosses a surface
+                            if(std::signbit(f) != std::signbit(fnext))
+                                break;
+
+                            f = fnext;
+                        }
+                    }
+                    // if ray penetrates a surface from outside
+                    // linearly interpolate t between two f values
+                    if(f > 0.f && fnext < 0.f)
+                    {
+                        Point3f tp    = next - rayStep;
+                        float ft   = volume.interpolateVoxel(tp);
+                        float ftdt = volume.interpolateVoxel(next);
+                        // float t = tmin + steps*tstep;
+                        // float ts = t - tstep*ft/(ftdt - ft);
+                        float ts = tmin + tstep*(steps - ft/(ftdt - ft));
+
+                        // avoid division by zero
+                        if(!cvIsNaN(ts) && !cvIsInf(ts))
+                        {
+                            Point3f pv = (orig + dir*ts);
+                            Point3f nv = volume.getNormalVoxel(pv);
+
+                            if(!isNaN(nv))
+                            {
+                                //convert pv and nv to camera space
+                                normal = volRot * nv;
+                                // interpolation optimized a little
+                                point = vol2cam * (pv*volume.voxelSize);
+                            }
+                        }
+                    }
+                }
+                ptsRow[x] = toPtype(point);
+                nrmRow[x] = toPtype(normal);
+            }
+        }
+    }
+#endif
+
+    Points& points;
+    Normals& normals;
+    const TSDFVolumeCPU& volume;
+
+    const float tstep;
+
+    const Point3f boxMax;
+    const Point3f boxMin;
+
+    const Affine3f cam2vol;
+    const Affine3f vol2cam;
+    const Intr::Reprojector reproj;
+};
+
+
+void TSDFVolumeCPU::raycast(const Matx44f& cameraPose, const Matx33f& intrinsics, const Size& frameSize,
+                            OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(frameSize.area() > 0);
+
+    _points.create (frameSize, POINT_TYPE);
+    _normals.create(frameSize, POINT_TYPE);
+
+    Points points   =  _points.getMat();
+    Normals normals = _normals.getMat();
+
+    RaycastInvoker ri(points, normals, cameraPose, Intr(intrinsics), *this);
+
+    const int nstripes = -1;
+    parallel_for_(Range(0, points.rows), ri, nstripes);
+}
+
+
+struct FetchPointsNormalsInvoker : ParallelLoopBody
+{
+    FetchPointsNormalsInvoker(const TSDFVolumeCPU& _volume,
+                              std::vector<std::vector<ptype>>& _pVecs,
+                              std::vector<std::vector<ptype>>& _nVecs,
+                              bool _needNormals) :
+        ParallelLoopBody(),
+        vol(_volume),
+        pVecs(_pVecs),
+        nVecs(_nVecs),
+        needNormals(_needNormals)
+    {
+        volDataStart = vol.volume.ptr<TsdfVoxel>();
+    }
+
+    inline void coord(std::vector<ptype>& points, std::vector<ptype>& normals,
+                      int x, int y, int z, Point3f V, float v0, int axis) const
+    {
+        // 0 for x, 1 for y, 2 for z
+        bool limits = false;
+        Point3i shift;
+        float Vc = 0.f;
+        if(axis == 0)
+        {
+            shift  = Point3i(1, 0, 0);
+            limits = (x + 1 < vol.volResolution.x);
+            Vc     = V.x;
+        }
+        if(axis == 1)
+        {
+            shift  = Point3i(0, 1, 0);
+            limits = (y + 1 < vol.volResolution.y);
+            Vc     = V.y;
+        }
+        if(axis == 2)
+        {
+            shift  = Point3i(0, 0, 1);
+            limits = (z + 1 < vol.volResolution.z);
+            Vc     = V.z;
+        }
+
+        if(limits)
+        {
+            const TsdfVoxel& voxeld = volDataStart[(x+shift.x)*vol.volDims[0] +
+                                                   (y+shift.y)*vol.volDims[1] +
+                                                   (z+shift.z)*vol.volDims[2]];
+            float vd = tsdfToFloat(voxeld.tsdf);
+            if(voxeld.weight != 0 && vd != 1.f)
+            {
+                if((v0 > 0 && vd < 0) || (v0 < 0 && vd > 0))
+                {
+                    //linearly interpolate coordinate
+                    float Vn    = Vc + vol.voxelSize;
+                    float dinv  = 1.f/(abs(v0)+abs(vd));
+                    float inter = (Vc*abs(vd) + Vn*abs(v0))*dinv;
+
+                    Point3f p(shift.x ? inter : V.x,
+                              shift.y ? inter : V.y,
+                              shift.z ? inter : V.z);
+                    {
+                        points.push_back(toPtype(vol.pose * p));
+                        if(needNormals)
+                            normals.push_back(toPtype(vol.pose.rotation() *
+                                                      vol.getNormalVoxel(p*vol.voxelSizeInv)));
+                    }
+                }
+            }
+        }
+    }
+
+    virtual void operator() (const Range& range) const override
+    {
+        std::vector<ptype> points, normals;
+        for(int x = range.start; x < range.end; x++)
+        {
+            const TsdfVoxel* volDataX = volDataStart + x*vol.volDims[0];
+            for(int y = 0; y < vol.volResolution.y; y++)
+            {
+                const TsdfVoxel* volDataY = volDataX + y*vol.volDims[1];
+                for(int z = 0; z < vol.volResolution.z; z++)
+                {
+                    const TsdfVoxel& voxel0 = volDataY[z*vol.volDims[2]];
+                    float v0             = tsdfToFloat(voxel0.tsdf);
+                    if(voxel0.weight != 0 && v0 != 1.f)
+                    {
+                        Point3f V(Point3f((float)x + 0.5f, (float)y + 0.5f, (float)z + 0.5f)*vol.voxelSize);
+
+                        coord(points, normals, x, y, z, V, v0, 0);
+                        coord(points, normals, x, y, z, V, v0, 1);
+                        coord(points, normals, x, y, z, V, v0, 2);
+
+                    } // if voxel is not empty
+                }
+            }
+        }
+
+        AutoLock al(mutex);
+        pVecs.push_back(points);
+        nVecs.push_back(normals);
+    }
+
+    const TSDFVolumeCPU& vol;
+    std::vector<std::vector<ptype>>& pVecs;
+    std::vector<std::vector<ptype>>& nVecs;
+    const TsdfVoxel* volDataStart;
+    bool needNormals;
+    mutable Mutex mutex;
+};
+
+void TSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if(_points.needed())
+    {
+        std::vector<std::vector<ptype>> pVecs, nVecs;
+        FetchPointsNormalsInvoker fi(*this, pVecs, nVecs, _normals.needed());
+        Range range(0, volResolution.x);
+        const int nstripes = -1;
+        parallel_for_(range, fi, nstripes);
+
+        std::vector<ptype> points, normals;
+        for(size_t i = 0; i < pVecs.size(); i++)
+        {
+            points.insert(points.end(), pVecs[i].begin(), pVecs[i].end());
+            normals.insert(normals.end(), nVecs[i].begin(), nVecs[i].end());
+        }
+
+        _points.create((int)points.size(), 1, POINT_TYPE);
+        if(!points.empty())
+            Mat((int)points.size(), 1, POINT_TYPE, &points[0]).copyTo(_points.getMat());
+
+        if(_normals.needed())
+        {
+            _normals.create((int)normals.size(), 1, POINT_TYPE);
+            if(!normals.empty())
+                Mat((int)normals.size(), 1, POINT_TYPE, &normals[0]).copyTo(_normals.getMat());
+        }
+    }
+}
+
+void TSDFVolumeCPU::fetchNormals(InputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(!_points.empty());
+    if(_normals.needed())
+    {
+        Points points = _points.getMat();
+        CV_Assert(points.type() == POINT_TYPE);
+
+        _normals.createSameSize(_points, _points.type());
+        Normals normals = _normals.getMat();
+
+        const TSDFVolumeCPU& _vol = *this;
+        auto PushNormals = [&](const ptype& pp, const int* position)
+        {
+            const TSDFVolumeCPU& vol(_vol);
+            Affine3f invPose(vol.pose.inv());
+            Point3f p = fromPtype(pp);
+            Point3f n = nan3;
+            if (!isNaN(p))
+            {
+                Point3f voxPt = (invPose * p);
+                voxPt = voxPt * vol.voxelSizeInv;
+                n = vol.pose.rotation() * vol.getNormalVoxel(voxPt);
+            }
+            normals(position[0], position[1]) = toPtype(n);
+        };
+        points.forEach(PushNormals);
+    }
+}
+
+///////// GPU implementation /////////
+
+#ifdef HAVE_OPENCL
+
+class TSDFVolumeGPU : public TSDFVolume
+{
+   public:
+    // dimension in voxels, size in meters
+    TSDFVolumeGPU(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                  int _maxWeight, Point3i _resolution);
+
+    virtual void integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose,
+                           const Matx33f& intrinsics, const int frameId = 0) override;
+    virtual void raycast(const Matx44f& cameraPose, const Matx33f& intrinsics, const Size& frameSize,
+                         OutputArray _points, OutputArray _normals) const override;
+    virtual void integrate(InputArray, InputArray, float, const Matx44f&, const Matx33f&, const Matx33f&, const int) override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); };
+    virtual void raycast(const Matx44f&, const Matx33f&, const Size&, OutputArray, OutputArray, OutputArray) const override
+    { CV_Error(Error::StsNotImplemented, "Not implemented"); };
+
+    virtual void fetchPointsNormals(OutputArray points, OutputArray normals) const override;
+    virtual void fetchNormals(InputArray points, OutputArray normals) const override;
+
+    virtual void reset() override;
+
+    Vec6f frameParams;
+    UMat pixNorms;
+    // See zFirstMemOrder arg of parent class constructor
+    // for the array layout info
+    // Array elem is CV_8UC2, read as (int8, uint8)
+    UMat volume;
+};
+
+TSDFVolumeGPU::TSDFVolumeGPU(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
+                             Point3i _resolution) :
+    TSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _resolution, false)
+{
+    volume = UMat(1, volResolution.x * volResolution.y * volResolution.z, CV_8UC2);
+
+    reset();
+}
+
+
+// zero volume, leave rest params the same
+void TSDFVolumeGPU::reset()
+{
+    CV_TRACE_FUNCTION();
+
+    volume.setTo(Scalar(0, 0));
+}
+
+// use depth instead of distance (optimization)
+void TSDFVolumeGPU::integrate(InputArray _depth, float depthFactor,
+                              const Matx44f& cameraPose, const Matx33f& _intrinsics, const int frameId)
+{
+    CV_TRACE_FUNCTION();
+    CV_UNUSED(frameId);
+    CV_Assert(!_depth.empty());
+
+    UMat depth = _depth.getUMat();
+
+    String errorStr;
+    String name           = "integrate";
+    ocl::ProgramSource source = ocl::_3d::tsdf_oclsrc;
+    String options        = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if(k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    UMat vol2camGpu;
+    Affine3f vol2cam(Affine3f(cameraPose.inv()) * pose);
+    Mat(vol2cam.matrix).copyTo(vol2camGpu);
+
+    float dfac = 1.f/depthFactor;
+    Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z);
+    Intr intrinsics(_intrinsics);
+    Vec2f fxy(intrinsics.fx, intrinsics.fy), cxy(intrinsics.cx, intrinsics.cy);
+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+                    intrinsics.fx, intrinsics.fy,
+                    intrinsics.cx, intrinsics.cy);
+    if (!(frameParams == newParams))
+    {
+        frameParams = newParams;
+        pixNorms = preCalculationPixNormGPU(depth.size(), intrinsics);
+    }
+
+    // TODO: optimization possible
+    // Use sampler for depth (mask needed)
+    k.args(ocl::KernelArg::ReadOnly(depth),
+           ocl::KernelArg::PtrReadWrite(volume),
+           ocl::KernelArg::PtrReadOnly(vol2camGpu),
+           voxelSize,
+           volResGpu.val,
+           volDims.val,
+           fxy.val,
+           cxy.val,
+           dfac,
+           truncDist,
+           int(maxWeight),
+           ocl::KernelArg::PtrReadOnly(pixNorms));
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)volResolution.x;
+    globalSize[1] = (size_t)volResolution.y;
+
+    if(!k.run(2, globalSize, NULL, true))
+        throw std::runtime_error("Failed to run kernel");
+}
+
+
+void TSDFVolumeGPU::raycast(const Matx44f& cameraPose, const Matx33f& _intrinsics, const Size& frameSize,
+                            OutputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(frameSize.area() > 0);
+
+    String errorStr;
+    String name           = "raycast";
+    ocl::ProgramSource source = ocl::_3d::tsdf_oclsrc;
+    String options        = "-cl-mad-enable";
+    ocl::Kernel k;
+    k.create(name.c_str(), source, options, &errorStr);
+
+    if(k.empty())
+        throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+    _points.create (frameSize, CV_32FC4);
+    _normals.create(frameSize, CV_32FC4);
+
+    UMat points  =  _points.getUMat();
+    UMat normals = _normals.getUMat();
+
+    UMat vol2camGpu, cam2volGpu;
+    Affine3f vol2cam = Affine3f(cameraPose.inv()) * pose;
+    Affine3f cam2vol = pose.inv() * Affine3f(cameraPose);
+    Mat(cam2vol.matrix).copyTo(cam2volGpu);
+    Mat(vol2cam.matrix).copyTo(vol2camGpu);
+    Intr intrinsics(_intrinsics);
+    Intr::Reprojector r = intrinsics.makeReprojector();
+    // We do subtract voxel size to minimize checks after
+    // Note: origin of volume coordinate is placed
+    // in the center of voxel (0,0,0), not in the corner of the voxel!
+    Vec4f boxMin, boxMax(volSize.x - voxelSize,
+                         volSize.y - voxelSize,
+                         volSize.z - voxelSize);
+    Vec2f finv(r.fxinv, r.fyinv), cxy(r.cx, r.cy);
+    float tstep = truncDist * raycastStepFactor;
+
+    Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z);
+
+    k.args(ocl::KernelArg::WriteOnlyNoSize(points),
+           ocl::KernelArg::WriteOnlyNoSize(normals),
+           frameSize,
+           ocl::KernelArg::PtrReadOnly(volume),
+           ocl::KernelArg::PtrReadOnly(vol2camGpu),
+           ocl::KernelArg::PtrReadOnly(cam2volGpu),
+           finv.val, cxy.val,
+           boxMin.val, boxMax.val,
+           tstep,
+           voxelSize,
+           volResGpu.val,
+           volDims.val,
+           neighbourCoords.val);
+
+    size_t globalSize[2];
+    globalSize[0] = (size_t)frameSize.width;
+    globalSize[1] = (size_t)frameSize.height;
+
+    if(!k.run(2, globalSize, NULL, true))
+        throw std::runtime_error("Failed to run kernel");
+}
+
+
+void TSDFVolumeGPU::fetchNormals(InputArray _points, OutputArray _normals) const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(!_points.empty());
+
+    if(_normals.needed())
+    {
+        UMat points = _points.getUMat();
+        CV_Assert(points.type() == POINT_TYPE);
+
+        _normals.createSameSize(_points, POINT_TYPE);
+        UMat normals = _normals.getUMat();
+
+        String errorStr;
+        String name           = "getNormals";
+        ocl::ProgramSource source = ocl::_3d::tsdf_oclsrc;
+        String options        = "-cl-mad-enable";
+        ocl::Kernel k;
+        k.create(name.c_str(), source, options, &errorStr);
+
+        if(k.empty())
+            throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+        UMat volPoseGpu, invPoseGpu;
+        Mat(pose      .matrix).copyTo(volPoseGpu);
+        Mat(pose.inv().matrix).copyTo(invPoseGpu);
+        Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z);
+        Size frameSize = points.size();
+
+        k.args(ocl::KernelArg::ReadOnlyNoSize(points),
+               ocl::KernelArg::WriteOnlyNoSize(normals),
+               frameSize,
+               ocl::KernelArg::PtrReadOnly(volume),
+               ocl::KernelArg::PtrReadOnly(volPoseGpu),
+               ocl::KernelArg::PtrReadOnly(invPoseGpu),
+               voxelSizeInv,
+               volResGpu.val,
+               volDims.val,
+               neighbourCoords.val);
+
+        size_t globalSize[2];
+        globalSize[0] = (size_t)points.cols;
+        globalSize[1] = (size_t)points.rows;
+
+        if(!k.run(2, globalSize, NULL, true))
+            throw std::runtime_error("Failed to run kernel");
+    }
+}
+
+void TSDFVolumeGPU::fetchPointsNormals(OutputArray points, OutputArray normals) const
+{
+    CV_TRACE_FUNCTION();
+
+    if(points.needed())
+    {
+        bool needNormals = normals.needed();
+
+        // 1. scan to count points in each group and allocate output arrays
+
+        ocl::Kernel kscan;
+
+        String errorStr;
+        ocl::ProgramSource source = ocl::_3d::tsdf_oclsrc;
+        String options        = "-cl-mad-enable";
+
+        kscan.create("scanSize", source, options, &errorStr);
+
+        if(kscan.empty())
+            throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+        size_t globalSize[3];
+        globalSize[0] = (size_t)volResolution.x;
+        globalSize[1] = (size_t)volResolution.y;
+        globalSize[2] = (size_t)volResolution.z;
+
+        const ocl::Device& device = ocl::Device::getDefault();
+        size_t wgsLimit           = device.maxWorkGroupSize();
+        size_t memSize            = device.localMemSize();
+        // local mem should keep a point (and a normal) for each thread in a group
+        // use 4 float per each point and normal
+        size_t elemSize     = (sizeof(float)*4)*(needNormals ? 2 : 1);
+        const size_t lcols  = 8;
+        const size_t lrows  = 8;
+        size_t lplanes      = min(memSize/elemSize, wgsLimit)/lcols/lrows;
+        lplanes             = roundDownPow2(lplanes);
+        size_t localSize[3] = {lcols, lrows, lplanes};
+        Vec3i ngroups((int)divUp(globalSize[0], (unsigned int)localSize[0]),
+                      (int)divUp(globalSize[1], (unsigned int)localSize[1]),
+                      (int)divUp(globalSize[2], (unsigned int)localSize[2]));
+
+        const size_t counterSize = sizeof(int);
+        size_t lszscan = localSize[0]*localSize[1]*localSize[2]*counterSize;
+
+        const int gsz[3] = {ngroups[2], ngroups[1], ngroups[0]};
+        UMat groupedSum(3, gsz, CV_32S, Scalar(0));
+
+        UMat volPoseGpu;
+        Mat(pose.matrix).copyTo(volPoseGpu);
+        Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z);
+
+        kscan.args(ocl::KernelArg::PtrReadOnly(volume),
+                   volResGpu.val,
+                   volDims.val,
+                   neighbourCoords.val,
+                   ocl::KernelArg::PtrReadOnly(volPoseGpu),
+                   voxelSize,
+                   voxelSizeInv,
+                   ocl::KernelArg::Local(lszscan),
+                   ocl::KernelArg::WriteOnlyNoSize(groupedSum));
+
+        if(!kscan.run(3, globalSize, localSize, true))
+            throw std::runtime_error("Failed to run kernel");
+
+        Mat groupedSumCpu = groupedSum.getMat(ACCESS_READ);
+        int gpuSum        = (int)sum(groupedSumCpu)[0];
+        // should be no CPU copies when new kernel is executing
+        groupedSumCpu.release();
+
+        // 2. fill output arrays according to per-group points count
+
+        points.create(gpuSum, 1, POINT_TYPE);
+        UMat pts = points.getUMat();
+        UMat nrm;
+        if(needNormals)
+        {
+            normals.create(gpuSum, 1, POINT_TYPE);
+            nrm = normals.getUMat();
+        }
+        else
+        {
+            // it won't be accessed but empty args are forbidden
+            nrm = UMat(1, 1, POINT_TYPE);
+        }
+
+        if (gpuSum)
+        {
+            ocl::Kernel kfill;
+            kfill.create("fillPtsNrm", source, options, &errorStr);
+
+            if(kfill.empty())
+                throw std::runtime_error("Failed to create kernel: " + errorStr);
+
+            UMat atomicCtr(1, 1, CV_32S, Scalar(0));
+
+            // mem size to keep pts (and normals optionally) for all work-items in a group
+            size_t lszfill = localSize[0]*localSize[1]*localSize[2]*elemSize;
+
+            kfill.args(ocl::KernelArg::PtrReadOnly(volume),
+                       volResGpu.val,
+                       volDims.val,
+                       neighbourCoords.val,
+                       ocl::KernelArg::PtrReadOnly(volPoseGpu),
+                       voxelSize,
+                       voxelSizeInv,
+                       ((int)needNormals),
+                       ocl::KernelArg::Local(lszfill),
+                       ocl::KernelArg::PtrReadWrite(atomicCtr),
+                       ocl::KernelArg::ReadOnlyNoSize(groupedSum),
+                       ocl::KernelArg::WriteOnlyNoSize(pts),
+                       ocl::KernelArg::WriteOnlyNoSize(nrm)
+                       );
+
+            if(!kfill.run(3, globalSize, localSize, true))
+                throw std::runtime_error("Failed to run kernel");
+        }
+    }
+}
+
+#endif
+
+Ptr<TSDFVolume> makeTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor,
+                               float _truncDist, int _maxWeight, Point3i _resolution)
+{
+#ifdef HAVE_OPENCL
+    if (ocl::useOpenCL())
+        return makePtr<TSDFVolumeGPU>(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight,
+                                      _resolution);
+#endif
+    return makePtr<TSDFVolumeCPU>(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight,
+                                  _resolution);
+}
+
+Ptr<TSDFVolume> makeTSDFVolume(const VolumeParams& _params)
+{
+    Mat pm = _params.pose;
+    CV_Assert(pm.size() == Size(4, 4));
+    CV_Assert(pm.type() == CV_32F);
+    Matx44f pose = pm;
+    cv::Point3i resolution(_params.resolutionX, _params.resolutionY, _params.resolutionZ);
+#ifdef HAVE_OPENCL
+    if (ocl::useOpenCL())
+        return makePtr<TSDFVolumeGPU>(_params.voxelSize, pose, _params.raycastStepFactor,
+                                      _params.tsdfTruncDist, _params.maxWeight, resolution);
+#endif
+    return makePtr<TSDFVolumeCPU>(_params.voxelSize, pose, _params.raycastStepFactor,
+                                  _params.tsdfTruncDist, _params.maxWeight, resolution);
+
+}
+
+} // namespace cv
diff --git a/modules/3d/src/rgbd/tsdf.hpp b/modules/3d/src/rgbd/tsdf.hpp
new file mode 100644
index 0000000000..572957e23c
--- /dev/null
+++ b/modules/3d/src/rgbd/tsdf.hpp
@@ -0,0 +1,41 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#ifndef OPENCV_3D_TSDF_HPP
+#define OPENCV_3D_TSDF_HPP
+
+#include "../precomp.hpp"
+#include "tsdf_functions.hpp"
+
+namespace cv
+{
+
+class TSDFVolume : public Volume
+{
+   public:
+    // dimension in voxels, size in meters
+    TSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
+               int _maxWeight, Point3i _resolution, bool zFirstMemOrder = true);
+    virtual ~TSDFVolume() = default;
+
+   public:
+
+    Point3i volResolution;
+    WeightType maxWeight;
+
+    Point3f volSize;
+    float truncDist;
+    Vec4i volDims;
+    Vec8i neighbourCoords;
+};
+
+Ptr<TSDFVolume> makeTSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor,
+                               float _truncDist, int _maxWeight, Point3i _resolution);
+Ptr<TSDFVolume> makeTSDFVolume(const VolumeParams& _params);
+}  // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/src/rgbd/tsdf_functions.cpp b/modules/3d/src/rgbd/tsdf_functions.cpp
new file mode 100644
index 0000000000..0ced2f7a9b
--- /dev/null
+++ b/modules/3d/src/rgbd/tsdf_functions.cpp
@@ -0,0 +1,615 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#include "../precomp.hpp"
+#include "tsdf_functions.hpp"
+#include "opencl_kernels_3d.hpp"
+
+namespace cv {
+
+cv::Mat preCalculationPixNorm(Size size, const Intr& intrinsics)
+{
+    Point2f fl(intrinsics.fx, intrinsics.fy);
+    Point2f pp(intrinsics.cx, intrinsics.cy);
+    Mat pixNorm(size.height, size.width, CV_32F);
+    std::vector<float> x(size.width);
+    std::vector<float> y(size.height);
+    for (int i = 0; i < size.width; i++)
+        x[i] = (i - pp.x) / fl.x;
+    for (int i = 0; i < size.height; i++)
+        y[i] = (i - pp.y) / fl.y;
+
+    for (int i = 0; i < size.height; i++)
+    {
+        for (int j = 0; j < size.width; j++)
+        {
+            pixNorm.at<float>(i, j) = sqrtf(x[j] * x[j] + y[i] * y[i] + 1.0f);
+        }
+    }
+    return pixNorm;
+}
+
+#ifdef HAVE_OPENCL
+cv::UMat preCalculationPixNormGPU(Size size, const Intr& intrinsics)
+{
+    // calculating this on CPU then uploading to GPU is faster than calculating this on GPU
+    Mat cpuPixNorm = preCalculationPixNorm(size, intrinsics);
+
+    UMat pixNorm(size, CV_32F);
+    cpuPixNorm.copyTo(pixNorm);
+
+    return pixNorm;
+}
+#endif
+
+
+void integrateVolumeUnit(
+    float truncDist, float voxelSize, int maxWeight,
+    cv::Matx44f _pose, Point3i volResolution, Vec4i volStrides,
+    InputArray _depth, float depthFactor, const cv::Matx44f& cameraPose,
+    const cv::Intr& intrinsics, InputArray _pixNorms, InputArray _volume)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+    CV_Assert(!_depth.empty());
+    cv::Affine3f vpose(_pose);
+    Depth depth = _depth.getMat();
+
+    Range integrateRange(0, volResolution.x);
+
+    Mat volume = _volume.getMat();
+    Mat pixNorms = _pixNorms.getMat();
+    const Intr::Projector proj(intrinsics.makeProjector());
+    const cv::Affine3f vol2cam(Affine3f(cameraPose.inv()) * vpose);
+    const float truncDistInv(1.f / truncDist);
+    const float dfac(1.f / depthFactor);
+    TsdfVoxel* volDataStart = volume.ptr<TsdfVoxel>();;
+
+#if USE_INTRINSICS
+    auto IntegrateInvoker = [&](const Range& range)
+    {
+        // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt;
+        Point3f zStepPt = Point3f(vol2cam.matrix(0, 2),
+            vol2cam.matrix(1, 2),
+            vol2cam.matrix(2, 2)) * voxelSize;
+
+        v_float32x4 zStep(zStepPt.x, zStepPt.y, zStepPt.z, 0);
+        v_float32x4 vfxy(proj.fx, proj.fy, 0.f, 0.f), vcxy(proj.cx, proj.cy, 0.f, 0.f);
+        const v_float32x4 upLimits = v_cvt_f32(v_int32x4(depth.cols - 1, depth.rows - 1, 0, 0));
+
+        for (int x = range.start; x < range.end; x++)
+        {
+            TsdfVoxel* volDataX = volDataStart + x * volStrides[0];
+            for (int y = 0; y < volResolution.y; y++)
+            {
+                TsdfVoxel* volDataY = volDataX + y * volStrides[1];
+                // optimization of camSpace transformation (vector addition instead of matmul at each z)
+                Point3f basePt = vol2cam * (Point3f((float)x, (float)y, 0) * voxelSize);
+                v_float32x4 camSpacePt(basePt.x, basePt.y, basePt.z, 0);
+
+                int startZ, endZ;
+                if (abs(zStepPt.z) > 1e-5)
+                {
+                    int baseZ = (int)(-basePt.z / zStepPt.z);
+                    if (zStepPt.z > 0)
+                    {
+                        startZ = baseZ;
+                        endZ = volResolution.z;
+                    }
+                    else
+                    {
+                        startZ = 0;
+                        endZ = baseZ;
+                    }
+                }
+                else
+                {
+                    if (basePt.z > 0)
+                    {
+                        startZ = 0;
+                        endZ = volResolution.z;
+                    }
+                    else
+                    {
+                        // z loop shouldn't be performed
+                        startZ = endZ = 0;
+                    }
+                }
+                startZ = max(0, startZ);
+                endZ = min(int(volResolution.z), endZ);
+                for (int z = startZ; z < endZ; z++)
+                {
+                    // optimization of the following:
+                    //Point3f volPt = Point3f(x, y, z)*voxelSize;
+                    //Point3f camSpacePt = vol2cam * volPt;
+                    camSpacePt += zStep;
+
+                    float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0();
+                    if (zCamSpace <= 0.f)
+                        continue;
+
+                    v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace);
+                    v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
+                    // leave only first 2 lanes
+                    projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
+                        v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
+
+                    depthType v;
+                    // bilinearly interpolate depth at projected
+                    {
+                        const v_float32x4& pt = projected;
+                        // check coords >= 0 and < imgSize
+                        v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) |
+                            v_reinterpret_as_u32(pt >= upLimits);
+                        limits = limits | v_rotate_right<1>(limits);
+                        if (limits.get0())
+                            continue;
+
+                        // xi, yi = floor(pt)
+                        v_int32x4 ip = v_floor(pt);
+                        v_int32x4 ipshift = ip;
+                        int xi = ipshift.get0();
+                        ipshift = v_rotate_right<1>(ipshift);
+                        int yi = ipshift.get0();
+
+                        const depthType* row0 = depth[yi + 0];
+                        const depthType* row1 = depth[yi + 1];
+
+                        // v001 = [v(xi + 0, yi + 0), v(xi + 1, yi + 0)]
+                        v_float32x4 v001 = v_load_low(row0 + xi);
+                        // v101 = [v(xi + 0, yi + 1), v(xi + 1, yi + 1)]
+                        v_float32x4 v101 = v_load_low(row1 + xi);
+
+                        v_float32x4 vall = v_combine_low(v001, v101);
+
+                        // assume correct depth is positive
+                        // don't fix missing data
+                        if (v_check_all(vall > v_setzero_f32()))
+                        {
+                            v_float32x4 t = pt - v_cvt_f32(ip);
+                            float tx = t.get0();
+                            t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+                            v_float32x4 ty = v_setall_f32(t.get0());
+                            // vx is y-interpolated between rows 0 and 1
+                            v_float32x4 vx = v001 + ty * (v101 - v001);
+                            float v0 = vx.get0();
+                            vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
+                            float v1 = vx.get0();
+                            v = v0 + tx * (v1 - v0);
+                        }
+                        else
+                            continue;
+                    }
+
+                    // norm(camPixVec) produces double which is too slow
+                    int _u = (int)projected.get0();
+                    int _v = (int)v_rotate_right<1>(projected).get0();
+                    if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows))
+                        continue;
+                    float pixNorm = pixNorms.at<float>(_v, _u);
+                    // float pixNorm = sqrt(v_reduce_sum(camPixVec*camPixVec));
+                    // difference between distances of point and of surface to camera
+                    float sdf = pixNorm * (v * dfac - zCamSpace);
+                    // possible alternative is:
+                    // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1);
+                    if (sdf >= -truncDist)
+                    {
+                        TsdfType tsdf = floatToTsdf(fmin(1.f, sdf * truncDistInv));
+
+                        TsdfVoxel& voxel = volDataY[z * volStrides[2]];
+                        WeightType& weight = voxel.weight;
+                        TsdfType& value = voxel.tsdf;
+
+                        // update TSDF
+                        value = floatToTsdf((tsdfToFloat(value) * weight + tsdfToFloat(tsdf)) / (weight + 1));
+                        weight = (weight + 1) < maxWeight ? (weight + 1) : (WeightType) maxWeight;
+                    }
+                }
+            }
+        }
+    };
+#else
+    auto IntegrateInvoker = [&](const Range& range)
+    {
+        for (int x = range.start; x < range.end; x++)
+        {
+            TsdfVoxel* volDataX = volDataStart + x * volStrides[0];
+            for (int y = 0; y < volResolution.y; y++)
+            {
+                TsdfVoxel* volDataY = volDataX + y * volStrides[1];
+                // optimization of camSpace transformation (vector addition instead of matmul at each z)
+                Point3f basePt = vol2cam * (Point3f(float(x), float(y), 0.0f) * voxelSize);
+                Point3f camSpacePt = basePt;
+                // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt;
+                // zStep == vol2cam*[Point3f(x, y, 1) - Point3f(x, y, 0)]*voxelSize
+                Point3f zStep = Point3f(vol2cam.matrix(0, 2),
+                    vol2cam.matrix(1, 2),
+                    vol2cam.matrix(2, 2)) * voxelSize;
+                int startZ, endZ;
+                if (abs(zStep.z) > 1e-5)
+                {
+                    int baseZ = int(-basePt.z / zStep.z);
+                    if (zStep.z > 0)
+                    {
+                        startZ = baseZ;
+                        endZ = volResolution.z;
+                    }
+                    else
+                    {
+                        startZ = 0;
+                        endZ = baseZ;
+                    }
+                }
+                else
+                {
+                    if (basePt.z > 0)
+                    {
+                        startZ = 0;
+                        endZ = volResolution.z;
+                    }
+                    else
+                    {
+                        // z loop shouldn't be performed
+                        startZ = endZ = 0;
+                    }
+                }
+                startZ = max(0, startZ);
+                endZ = min(int(volResolution.z), endZ);
+
+                for (int z = startZ; z < endZ; z++)
+                {
+                    // optimization of the following:
+                    //Point3f volPt = Point3f(x, y, z)*volume.voxelSize;
+                    //Point3f camSpacePt = vol2cam * volPt;
+
+                    camSpacePt += zStep;
+                    if (camSpacePt.z <= 0)
+                        continue;
+
+                    Point3f camPixVec;
+                    Point2f projected = proj(camSpacePt, camPixVec);
+
+                    depthType v = bilinearDepth(depth, projected);
+                    if (v == 0) {
+                        continue;
+                    }
+
+                    int _u = projected.x;
+                    int _v = projected.y;
+                    if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows))
+                        continue;
+                    float pixNorm = pixNorms.at<float>(_v, _u);
+
+                    // difference between distances of point and of surface to camera
+                    float sdf = pixNorm * (v * dfac - camSpacePt.z);
+                    // possible alternative is:
+                    // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1);
+                    if (sdf >= -truncDist)
+                    {
+                        TsdfType tsdf = floatToTsdf(fmin(1.f, sdf * truncDistInv));
+
+                        TsdfVoxel& voxel = volDataY[z * volStrides[2]];
+                        WeightType& weight = voxel.weight;
+                        TsdfType& value = voxel.tsdf;
+
+                        // update TSDF
+                        value = floatToTsdf((tsdfToFloat(value) * weight + tsdfToFloat(tsdf)) / (weight + 1));
+                        weight = min(int(weight + 1), int(maxWeight));
+                    }
+                }
+            }
+        }
+    };
+#endif
+
+    parallel_for_(integrateRange, IntegrateInvoker);
+}
+
+
+void integrateRGBVolumeUnit(
+    float truncDist, float voxelSize, int maxWeight,
+    cv::Matx44f _pose, Point3i volResolution, Vec4i volStrides,
+    InputArray _depth, InputArray _rgb, float depthFactor, const cv::Matx44f& cameraPose,
+    const cv::Intr& depth_intrinsics, const cv::Intr& rgb_intrinsics, InputArray _pixNorms, InputArray _volume)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+    CV_Assert(!_depth.empty());
+    cv::Affine3f vpose(_pose);
+    Depth depth = _depth.getMat();
+    Colors color = _rgb.getMat();
+    Range integrateRange(0, volResolution.x);
+
+    Mat volume = _volume.getMat();
+    Mat pixNorms = _pixNorms.getMat();
+    const Intr::Projector projDepth(depth_intrinsics.makeProjector());
+    const Intr::Projector projRGB(rgb_intrinsics);
+    const cv::Affine3f vol2cam(Affine3f(cameraPose.inv()) * vpose);
+    const float truncDistInv(1.f / truncDist);
+    const float dfac(1.f / depthFactor);
+    RGBTsdfVoxel* volDataStart = volume.ptr<RGBTsdfVoxel>();
+
+#if USE_INTRINSICS
+    auto IntegrateInvoker = [&](const Range& range)
+    {
+        // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt;
+        Point3f zStepPt = Point3f(vol2cam.matrix(0, 2),
+            vol2cam.matrix(1, 2),
+            vol2cam.matrix(2, 2)) * voxelSize;
+
+        v_float32x4 zStep(zStepPt.x, zStepPt.y, zStepPt.z, 0);
+        v_float32x4 vfxy(projDepth.fx, projDepth.fy, 0.f, 0.f), vcxy(projDepth.cx, projDepth.cy, 0.f, 0.f);
+        v_float32x4 rgb_vfxy(projRGB.fx, projRGB.fy, 0.f, 0.f), rgb_vcxy(projRGB.cx, projRGB.cy, 0.f, 0.f);
+        const v_float32x4 upLimits = v_cvt_f32(v_int32x4(depth.cols - 1, depth.rows - 1, 0, 0));
+
+        for (int x = range.start; x < range.end; x++)
+        {
+            RGBTsdfVoxel* volDataX = volDataStart + x * volStrides[0];
+            for (int y = 0; y < volResolution.y; y++)
+            {
+                RGBTsdfVoxel* volDataY = volDataX + y * volStrides[1];
+                // optimization of camSpace transformation (vector addition instead of matmul at each z)
+                Point3f basePt = vol2cam * (Point3f((float)x, (float)y, 0) * voxelSize);
+                v_float32x4 camSpacePt(basePt.x, basePt.y, basePt.z, 0);
+
+                int startZ, endZ;
+                if (abs(zStepPt.z) > 1e-5)
+                {
+                    int baseZ = (int)(-basePt.z / zStepPt.z);
+                    if (zStepPt.z > 0)
+                    {
+                        startZ = baseZ;
+                        endZ = volResolution.z;
+                    }
+                    else
+                    {
+                        startZ = 0;
+                        endZ = baseZ;
+                    }
+                }
+                else
+                {
+                    if (basePt.z > 0)
+                    {
+                        startZ = 0;
+                        endZ = volResolution.z;
+                    }
+                    else
+                    {
+                        // z loop shouldn't be performed
+                        startZ = endZ = 0;
+                    }
+                }
+                startZ = max(0, startZ);
+                endZ = min(int(volResolution.z), endZ);
+                for (int z = startZ; z < endZ; z++)
+                {
+                    // optimization of the following:
+                    //Point3f volPt = Point3f(x, y, z)*voxelSize;
+                    //Point3f camSpacePt = vol2cam * volPt;
+                    camSpacePt += zStep;
+
+                    float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0();
+                    if (zCamSpace <= 0.f)
+                        continue;
+
+                    v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace);
+                    v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
+                    // leave only first 2 lanes
+                    projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
+                        v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
+
+                    depthType v;
+                    // bilinearly interpolate depth at projected
+                    {
+                        const v_float32x4& pt = projected;
+                        // check coords >= 0 and < imgSize
+                        v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) |
+                            v_reinterpret_as_u32(pt >= upLimits);
+                        limits = limits | v_rotate_right<1>(limits);
+                        if (limits.get0())
+                            continue;
+
+                        // xi, yi = floor(pt)
+                        v_int32x4 ip = v_floor(pt);
+                        v_int32x4 ipshift = ip;
+                        int xi = ipshift.get0();
+                        ipshift = v_rotate_right<1>(ipshift);
+                        int yi = ipshift.get0();
+
+                        const depthType* row0 = depth[yi + 0];
+                        const depthType* row1 = depth[yi + 1];
+
+                        // v001 = [v(xi + 0, yi + 0), v(xi + 1, yi + 0)]
+                        v_float32x4 v001 = v_load_low(row0 + xi);
+                        // v101 = [v(xi + 0, yi + 1), v(xi + 1, yi + 1)]
+                        v_float32x4 v101 = v_load_low(row1 + xi);
+
+                        v_float32x4 vall = v_combine_low(v001, v101);
+
+                        // assume correct depth is positive
+                        // don't fix missing data
+                        if (v_check_all(vall > v_setzero_f32()))
+                        {
+                            v_float32x4 t = pt - v_cvt_f32(ip);
+                            float tx = t.get0();
+                            t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+                            v_float32x4 ty = v_setall_f32(t.get0());
+                            // vx is y-interpolated between rows 0 and 1
+                            v_float32x4 vx = v001 + ty * (v101 - v001);
+                            float v0 = vx.get0();
+                            vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
+                            float v1 = vx.get0();
+                            v = v0 + tx * (v1 - v0);
+                        }
+                        else
+                            continue;
+                    }
+
+                    v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy);
+                    // leave only first 2 lanes
+                    projectedRGB = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
+                        v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
+
+                    // norm(camPixVec) produces double which is too slow
+                    int _u = (int)projected.get0();
+                    int _v = (int)v_rotate_right<1>(projected).get0();
+                    int rgb_u = (int)projectedRGB.get0();
+                    int rgb_v = (int)v_rotate_right<1>(projectedRGB).get0();
+
+                    if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows &&
+                        rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols))
+                        continue;
+                    float pixNorm = pixNorms.at<float>(_v, _u);
+                    Vec4f colorRGB = color.at<Vec4f>(rgb_v, rgb_u);
+                    //float pixNorm = sqrt(v_reduce_sum(camPixVec*camPixVec));
+                    // difference between distances of point and of surface to camera
+                    float sdf = pixNorm * (v * dfac - zCamSpace);
+                    // possible alternative is:
+                    // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1);
+                    if (sdf >= -truncDist)
+                    {
+                        TsdfType tsdf = floatToTsdf(fmin(1.f, sdf * truncDistInv));
+
+                        RGBTsdfVoxel& voxel = volDataY[z * volStrides[2]];
+                        WeightType& weight = voxel.weight;
+                        TsdfType& value = voxel.tsdf;
+                        ColorType& r = voxel.r;
+                        ColorType& g = voxel.g;
+                        ColorType& b = voxel.b;
+
+                        // update RGB
+                        r = (ColorType)((float)(r * weight) + (colorRGB[0])) / (weight + 1);
+                        g = (ColorType)((float)(g * weight) + (colorRGB[1])) / (weight + 1);
+                        b = (ColorType)((float)(b * weight) + (colorRGB[2])) / (weight + 1);
+                        colorFix(r, g, b);
+                        // update TSDF
+                        value = floatToTsdf((tsdfToFloat(value) * weight + tsdfToFloat(tsdf)) / (weight + 1));
+                        weight = WeightType(min(int(weight + 1), int(maxWeight)));
+                    }
+                }
+            }
+        }
+    };
+#else
+    auto IntegrateInvoker = [&](const Range& range)
+    {
+        for (int x = range.start; x < range.end; x++)
+        {
+            RGBTsdfVoxel* volDataX = volDataStart + x * volStrides[0];
+            for (int y = 0; y < volResolution.y; y++)
+            {
+                RGBTsdfVoxel* volDataY = volDataX + y * volStrides[1];
+                // optimization of camSpace transformation (vector addition instead of matmul at each z)
+                Point3f basePt = vol2cam * (Point3f(float(x), float(y), 0.0f) * voxelSize);
+                Point3f camSpacePt = basePt;
+                // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt;
+                // zStep == vol2cam*[Point3f(x, y, 1) - Point3f(x, y, 0)]*voxelSize
+                Point3f zStep = Point3f(vol2cam.matrix(0, 2),
+                    vol2cam.matrix(1, 2),
+                    vol2cam.matrix(2, 2)) * voxelSize;
+                int startZ, endZ;
+                if (abs(zStep.z) > 1e-5)
+                {
+                    int baseZ = int(-basePt.z / zStep.z);
+                    if (zStep.z > 0)
+                    {
+                        startZ = baseZ;
+                        endZ = volResolution.z;
+                    }
+                    else
+                    {
+                        startZ = 0;
+                        endZ = baseZ;
+                    }
+                }
+                else
+                {
+                    if (basePt.z > 0)
+                    {
+                        startZ = 0;
+                        endZ = volResolution.z;
+                    }
+                    else
+                    {
+                        // z loop shouldn't be performed
+                        startZ = endZ = 0;
+                    }
+                }
+                startZ = max(0, startZ);
+                endZ = min(int(volResolution.z), endZ);
+
+                for (int z = startZ; z < endZ; z++)
+                {
+                    // optimization of the following:
+                    //Point3f volPt = Point3f(x, y, z)*volume.voxelSize;
+                    //Point3f camSpacePt = vol2cam * volPt;
+
+                    camSpacePt += zStep;
+                    if (camSpacePt.z <= 0)
+                        continue;
+
+                    Point3f camPixVec;
+                    Point2f projected = projDepth(camSpacePt, camPixVec);
+                    Point2f projectedRGB = projRGB(camSpacePt, camPixVec);
+
+
+                    depthType v = bilinearDepth(depth, projected);
+                    if (v == 0) {
+                        continue;
+                    }
+
+                    int _u = (int) projected.x;
+                    int _v = (int) projected.y;
+
+                    int rgb_u = (int) projectedRGB.x;
+                    int rgb_v = (int) projectedRGB.y;
+
+                    if (!(_v >= 0 && _v < depth.rows && _u >= 0 && _u < depth.cols  &&
+                        rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols))
+                        continue;
+
+                    float pixNorm = pixNorms.at<float>(_v, _u);
+                    Vec4f colorRGB = color.at<Vec4f>(rgb_v, rgb_u);
+                    // difference between distances of point and of surface to camera
+                    float sdf = pixNorm * (v * dfac - camSpacePt.z);
+                    // possible alternative is:
+                    // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1);
+                    if (sdf >= -truncDist)
+                    {
+                        TsdfType tsdf = floatToTsdf(fmin(1.f, sdf * truncDistInv));
+
+                        RGBTsdfVoxel& voxel = volDataY[z * volStrides[2]];
+                        WeightType& weight = voxel.weight;
+                        TsdfType& value = voxel.tsdf;
+                        ColorType& r = voxel.r;
+                        ColorType& g = voxel.g;
+                        ColorType& b = voxel.b;
+
+                        // update RGB
+                        if (weight < 1)
+                        {
+                            r = (ColorType)((float)(r * weight) + (colorRGB[0])) / (weight + 1);
+                            g = (ColorType)((float)(g * weight) + (colorRGB[1])) / (weight + 1);
+                            b = (ColorType)((float)(b * weight) + (colorRGB[2])) / (weight + 1);
+                        }
+
+                        // update TSDF
+                        value = floatToTsdf((tsdfToFloat(value) * weight + tsdfToFloat(tsdf)) / (weight + 1));
+                        weight = WeightType( min(int(weight + 1), int(maxWeight)) );
+                    }
+                }
+            }
+        }
+    };
+#endif
+    parallel_for_(integrateRange, IntegrateInvoker);
+}
+
+} // namespace cv
diff --git a/modules/3d/src/rgbd/tsdf_functions.hpp b/modules/3d/src/rgbd/tsdf_functions.hpp
new file mode 100644
index 0000000000..e5c8abb305
--- /dev/null
+++ b/modules/3d/src/rgbd/tsdf_functions.hpp
@@ -0,0 +1,407 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#ifndef OPENCV_3D_TSDF_FUNCTIONS_HPP
+#define OPENCV_3D_TSDF_FUNCTIONS_HPP
+
+#include "../precomp.hpp"
+#include "utils.hpp"
+
+namespace cv
+{
+
+typedef int8_t TsdfType;
+typedef uchar WeightType;
+
+struct TsdfVoxel
+{
+    TsdfVoxel(TsdfType _tsdf, WeightType _weight) :
+        tsdf(_tsdf), weight(_weight)
+    { }
+    TsdfType tsdf;
+    WeightType weight;
+};
+
+typedef Vec<uchar, sizeof(TsdfVoxel)> VecTsdfVoxel;
+
+typedef short int ColorType;
+struct RGBTsdfVoxel
+{
+    RGBTsdfVoxel(TsdfType _tsdf, WeightType _weight, ColorType _r, ColorType _g, ColorType _b) :
+        tsdf(_tsdf), weight(_weight), r(_r), g(_g), b(_b)
+    { }
+    TsdfType tsdf;
+    WeightType weight;
+    ColorType r, g, b;
+};
+
+typedef Vec<uchar, sizeof(RGBTsdfVoxel)> VecRGBTsdfVoxel;
+
+inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num)
+{
+    v_float32x4 num128 = v_setall_f32(-1.f / 128.f);
+    return v_cvt_f32(num) * num128;
+}
+
+inline TsdfType floatToTsdf(float num)
+{
+    //CV_Assert(-1 < num <= 1);
+    int8_t res = int8_t(num * (-128.f));
+    res = res ? res : (num < 0 ? 1 : -1);
+    return res;
+}
+
+inline float tsdfToFloat(TsdfType num)
+{
+    return float(num) * (-1.f / 128.f);
+}
+
+inline void colorFix(ColorType& r, ColorType& g, ColorType&b)
+{
+    if (r > 255) r = 255;
+    if (g > 255) g = 255;
+    if (b > 255) b = 255;
+}
+
+inline void colorFix(Point3f& c)
+{
+    if (c.x > 255) c.x = 255;
+    if (c.y > 255) c.y = 255;
+    if (c.z > 255) c.z = 255;
+}
+
+cv::Mat preCalculationPixNorm(Size size, const Intr& intrinsics);
+cv::UMat preCalculationPixNormGPU(Size size, const Intr& intrinsics);
+
+inline depthType bilinearDepth(const Depth& m, cv::Point2f pt)
+{
+    const bool fixMissingData = false;
+    const depthType defaultValue = qnan;
+    if (pt.x < 0 || pt.x >= m.cols - 1 ||
+        pt.y < 0 || pt.y >= m.rows - 1)
+        return defaultValue;
+
+    int xi = cvFloor(pt.x), yi = cvFloor(pt.y);
+
+    const depthType* row0 = m[yi + 0];
+    const depthType* row1 = m[yi + 1];
+
+    depthType v00 = row0[xi + 0];
+    depthType v01 = row0[xi + 1];
+    depthType v10 = row1[xi + 0];
+    depthType v11 = row1[xi + 1];
+
+    // assume correct depth is positive
+    bool b00 = v00 > 0;
+    bool b01 = v01 > 0;
+    bool b10 = v10 > 0;
+    bool b11 = v11 > 0;
+
+    if (!fixMissingData)
+    {
+        if (!(b00 && b01 && b10 && b11))
+            return defaultValue;
+        else
+        {
+            float tx = pt.x - xi, ty = pt.y - yi;
+            depthType v0 = v00 + tx * (v01 - v00);
+            depthType v1 = v10 + tx * (v11 - v10);
+            return v0 + ty * (v1 - v0);
+        }
+    }
+    else
+    {
+        int nz = b00 + b01 + b10 + b11;
+        if (nz == 0)
+        {
+            return defaultValue;
+        }
+        if (nz == 1)
+        {
+            if (b00) return v00;
+            if (b01) return v01;
+            if (b10) return v10;
+            if (b11) return v11;
+        }
+        if (nz == 2)
+        {
+            if (b00 && b10) v01 = v00, v11 = v10;
+            if (b01 && b11) v00 = v01, v10 = v11;
+            if (b00 && b01) v10 = v00, v11 = v01;
+            if (b10 && b11) v00 = v10, v01 = v11;
+            if (b00 && b11) v01 = v10 = (v00 + v11) * 0.5f;
+            if (b01 && b10) v00 = v11 = (v01 + v10) * 0.5f;
+        }
+        if (nz == 3)
+        {
+            if (!b00) v00 = v10 + v01 - v11;
+            if (!b01) v01 = v00 + v11 - v10;
+            if (!b10) v10 = v00 + v11 - v01;
+            if (!b11) v11 = v01 + v10 - v00;
+        }
+
+        float tx = pt.x - xi, ty = pt.y - yi;
+        depthType v0 = v00 + tx * (v01 - v00);
+        depthType v1 = v10 + tx * (v11 - v10);
+        return v0 + ty * (v1 - v0);
+    }
+}
+
+void integrateVolumeUnit(
+    float truncDist, float voxelSize, int maxWeight,
+    cv::Matx44f _pose, Point3i volResolution, Vec4i volStrides,
+    InputArray _depth, float depthFactor, const cv::Matx44f& cameraPose,
+    const cv::Intr& intrinsics, InputArray _pixNorms, InputArray _volume);
+
+void integrateRGBVolumeUnit(
+    float truncDist, float voxelSize, int maxWeight,
+    cv::Matx44f _pose, Point3i volResolution, Vec4i volStrides,
+    InputArray _depth, InputArray _rgb, float depthFactor, const cv::Matx44f& cameraPose,
+    const cv::Intr& depth_intrinsics, const cv::Intr& rgb_intrinsics, InputArray _pixNorms, InputArray _volume);
+
+
+class CustomHashSet
+{
+public:
+    static const int hashDivisor = 32768;
+    static const int startCapacity = 2048;
+
+    std::vector<int> hashes;
+    // 0-3 for key, 4th for internal use
+    // don't keep keep value
+    std::vector<Vec4i> data;
+    int capacity;
+    int last;
+
+    CustomHashSet()
+    {
+        hashes.resize(hashDivisor);
+        for (int i = 0; i < hashDivisor; i++)
+            hashes[i] = -1;
+        capacity = startCapacity;
+
+        data.resize(capacity);
+        for (int i = 0; i < capacity; i++)
+            data[i] = { 0, 0, 0, -1 };
+
+        last = 0;
+    }
+
+    ~CustomHashSet() { }
+
+    inline size_t calc_hash(Vec3i x) const
+    {
+        uint32_t seed = 0;
+        constexpr uint32_t GOLDEN_RATIO = 0x9e3779b9;
+        for (int i = 0; i < 3; i++)
+        {
+            seed ^= x[i] + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+
+    // should work on existing elements too
+    // 0 - need resize
+    // 1 - idx is inserted
+    // 2 - idx already exists
+    int insert(Vec3i idx)
+    {
+        if (last < capacity)
+        {
+            int hash = int(calc_hash(idx) % hashDivisor);
+            int place = hashes[hash];
+            if (place >= 0)
+            {
+                int oldPlace = place;
+                while (place >= 0)
+                {
+                    if (data[place][0] == idx[0] &&
+                        data[place][1] == idx[1] &&
+                        data[place][2] == idx[2])
+                        return 2;
+                    else
+                    {
+                        oldPlace = place;
+                        place = data[place][3];
+                        //std::cout << "place=" << place << std::endl;
+                    }
+                }
+
+                // found, create here
+                data[oldPlace][3] = last;
+            }
+            else
+            {
+                // insert at last
+                hashes[hash] = last;
+            }
+
+            data[last][0] = idx[0];
+            data[last][1] = idx[1];
+            data[last][2] = idx[2];
+            data[last][3] = -1;
+            last++;
+
+            return 1;
+        }
+        else
+            return 0;
+    }
+
+    int find(Vec3i idx) const
+    {
+        int hash = int(calc_hash(idx) % hashDivisor);
+        int place = hashes[hash];
+        // search a place
+        while (place >= 0)
+        {
+            if (data[place][0] == idx[0] &&
+                data[place][1] == idx[1] &&
+                data[place][2] == idx[2])
+                break;
+            else
+            {
+                place = data[place][3];
+            }
+        }
+
+        return place;
+    }
+};
+
+// TODO: remove this structure as soon as HashTSDFGPU data is completely on GPU;
+// until then CustomHashTable can be replaced by this one if needed
+
+const int NAN_ELEMENT = -2147483647;
+
+struct Volume_NODE
+{
+    Vec4i idx = Vec4i(NAN_ELEMENT);
+    int32_t row = -1;
+    int32_t nextVolumeRow = -1;
+    int32_t dummy = 0;
+    int32_t dummy2 = 0;
+};
+
+const int _hash_divisor = 32768;
+const int _list_size = 4;
+
+class VolumesTable
+{
+public:
+    const int hash_divisor = _hash_divisor;
+    const int list_size = _list_size;
+    const int32_t free_row = -1;
+    const int32_t free_isActive = 0;
+
+    const cv::Vec4i nan4 = cv::Vec4i(NAN_ELEMENT);
+
+    int bufferNums;
+    cv::Mat volumes;
+
+    VolumesTable() : bufferNums(1)
+    {
+        this->volumes = cv::Mat(hash_divisor * list_size, 1, rawType<Volume_NODE>());
+        for (int i = 0; i < volumes.size().height; i++)
+        {
+            Volume_NODE* v = volumes.ptr<Volume_NODE>(i);
+            v->idx = nan4;
+            v->row = -1;
+            v->nextVolumeRow = -1;
+        }
+    }
+    const VolumesTable& operator=(const VolumesTable& vt)
+    {
+        this->volumes = vt.volumes;
+        this->bufferNums = vt.bufferNums;
+        return *this;
+    }
+    ~VolumesTable() {};
+
+    bool insert(Vec3i idx, int row)
+    {
+        CV_Assert(row >= 0);
+
+        int bufferNum = 0;
+        int hash = int(calc_hash(idx) % hash_divisor);
+        int start = getPos(idx, bufferNum);
+        int i = start;
+
+        while (i >= 0)
+        {
+            Volume_NODE* v = volumes.ptr<Volume_NODE>(i);
+
+            if (v->idx[0] == NAN_ELEMENT)
+            {
+                Vec4i idx4(idx[0], idx[1], idx[2], 0);
+
+                bool extend = false;
+                if (i != start && i % list_size == 0)
+                {
+                    if (bufferNum >= bufferNums - 1)
+                    {
+                        extend = true;
+                        volumes.resize(hash_divisor * bufferNums);
+                        bufferNums++;
+                    }
+                    bufferNum++;
+                    v->nextVolumeRow = (bufferNum * hash_divisor + hash) * list_size;
+                }
+                else
+                {
+                    v->nextVolumeRow = i + 1;
+                }
+
+                v->idx = idx4;
+                v->row = row;
+
+                return extend;
+            }
+
+            i = v->nextVolumeRow;
+        }
+        return false;
+    }
+    int findRow(Vec3i idx) const
+    {
+        int bufferNum = 0;
+        int i = getPos(idx, bufferNum);
+
+        while (i >= 0)
+        {
+            const Volume_NODE* v = volumes.ptr<Volume_NODE>(i);
+
+            if (v->idx == Vec4i(idx[0], idx[1], idx[2], 0))
+                return v->row;
+            else
+                i = v->nextVolumeRow;
+        }
+
+        return -1;
+    }
+
+    inline int getPos(Vec3i idx, int bufferNum) const
+    {
+        int hash = int(calc_hash(idx) % hash_divisor);
+        return (bufferNum * hash_divisor + hash) * list_size;
+    }
+
+    inline size_t calc_hash(Vec3i x) const
+    {
+        uint32_t seed = 0;
+        constexpr uint32_t GOLDEN_RATIO = 0x9e3779b9;
+        for (int i = 0; i < 3; i++)
+        {
+            seed ^= x[i] + GOLDEN_RATIO + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+};
+
+}  // namespace cv
+
+#endif // include guard
diff --git a/modules/3d/src/rgbd/utils.cpp b/modules/3d/src/rgbd/utils.cpp
new file mode 100644
index 0000000000..985c14115e
--- /dev/null
+++ b/modules/3d/src/rgbd/utils.cpp
@@ -0,0 +1,44 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../precomp.hpp"
+#include "utils.hpp"
+
+namespace cv
+{
+
+/** If the input image is of type CV_16UC1 (like the Kinect one), the image is converted to floats, divided
+ * by 1000 to get a depth in meters, and the values 0 are converted to std::numeric_limits<float>::quiet_NaN()
+ * Otherwise, the image is simply converted to floats
+ * @param in_in the depth image (if given as short int CV_U, it is assumed to be the depth in millimeters
+ *              (as done with the Microsoft Kinect), it is assumed in meters)
+ * @param depth the desired output depth (floats or double)
+ * @param out_out The rescaled float depth image
+ */
+void rescaleDepth(InputArray in_in, int type, OutputArray out_out, double depth_factor)
+{
+    cv::Mat in = in_in.getMat();
+    CV_Assert(in.type() == CV_64FC1 || in.type() == CV_32FC1 || in.type() == CV_16UC1 || in.type() == CV_16SC1);
+    CV_Assert(type == CV_64FC1 || type == CV_32FC1);
+
+    int in_depth = in.depth();
+
+    out_out.create(in.size(), type);
+    cv::Mat out = out_out.getMat();
+    if (in_depth == CV_16U)
+    {
+      in.convertTo(out, type, 1 / depth_factor); //convert to float so that it is in meters
+      cv::Mat valid_mask = in == std::numeric_limits<ushort>::min(); // Should we do std::numeric_limits<ushort>::max() too ?
+      out.setTo(std::numeric_limits<float>::quiet_NaN(), valid_mask); //set a$
+    }
+    if (in_depth == CV_16S)
+    {
+      in.convertTo(out, type, 1 / depth_factor); //convert to float so tha$
+      cv::Mat valid_mask = (in == std::numeric_limits<short>::min()) | (in == std::numeric_limits<short>::max()); // Should we do std::numeric_limits<ushort>::max() too ?
+      out.setTo(std::numeric_limits<float>::quiet_NaN(), valid_mask); //set a$
+    }
+    if ((in_depth == CV_32F) || (in_depth == CV_64F))
+      in.convertTo(out, type);
+}
+} // namespace cv
diff --git a/modules/3d/src/rgbd/utils.hpp b/modules/3d/src/rgbd/utils.hpp
new file mode 100644
index 0000000000..056b207899
--- /dev/null
+++ b/modules/3d/src/rgbd/utils.hpp
@@ -0,0 +1,262 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Partially rewritten from https://github.com/Nerei/kinfu_remake
+// Copyright(c) 2012, Anatoly Baksheev. All rights reserved.
+
+#ifndef OPENCV_3D_UTILS_HPP
+#define OPENCV_3D_UTILS_HPP
+
+#include "../precomp.hpp"
+
+namespace cv
+{
+
+
+ /** Checks if the value is a valid depth. For CV_16U or CV_16S, the convention is to be invalid if it is
+  * a limit. For a float/double, we just check if it is a NaN
+  * @param depth the depth to check for validity
+  */
+inline bool isValidDepth(const float& depth)
+{
+    return !cvIsNaN(depth);
+}
+
+inline bool isValidDepth(const double& depth)
+{
+    return !cvIsNaN(depth);
+}
+
+inline bool isValidDepth(const short int& depth)
+{
+    return (depth != std::numeric_limits<short int>::min()) &&
+           (depth != std::numeric_limits<short int>::max());
+}
+
+inline bool isValidDepth(const unsigned short int& depth)
+{
+    return (depth != std::numeric_limits<unsigned short int>::min()) &&
+           (depth != std::numeric_limits<unsigned short int>::max());
+}
+
+inline bool isValidDepth(const int& depth)
+{
+    return (depth != std::numeric_limits<int>::min()) &&
+           (depth != std::numeric_limits<int>::max());
+}
+
+
+inline bool isValidDepth(const unsigned int& depth)
+{
+    return (depth != std::numeric_limits<unsigned int>::min()) &&
+           (depth != std::numeric_limits<unsigned int>::max());
+}
+
+/** If the input image is of type CV_16UC1 (like the Kinect one), the image is converted to floats, divided
+ * by 1000 to get a depth in meters, and the values 0 are converted to std::numeric_limits<float>::quiet_NaN()
+ * Otherwise, the image is simply converted to floats
+ * @param in the depth image (if given as short int CV_U, it is assumed to be the depth in millimeters
+ *              (as done with the Microsoft Kinect), it is assumed in meters)
+ * @param the desired output depth (floats or double)
+ * @param out The rescaled float depth image
+ */
+/* void rescaleDepth(InputArray in_in, int depth, OutputArray out_out); */
+
+template<typename T>
+void
+rescaleDepthTemplated(const Mat& in, Mat& out);
+
+template<>
+inline void
+rescaleDepthTemplated<float>(const Mat& in, Mat& out)
+{
+  rescaleDepth(in, CV_32F, out);
+}
+
+template<>
+inline void
+rescaleDepthTemplated<double>(const Mat& in, Mat& out)
+{
+  rescaleDepth(in, CV_64F, out);
+}
+
+
+// One place to turn intrinsics on and off
+#define USE_INTRINSICS CV_SIMD128
+
+typedef float depthType;
+
+const float qnan = std::numeric_limits<float>::quiet_NaN();
+const cv::Vec3f nan3(qnan, qnan, qnan);
+#if USE_INTRINSICS
+const cv::v_float32x4 nanv(qnan, qnan, qnan, qnan);
+#endif
+
+inline bool isNaN(cv::Point3f p)
+{
+    return (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z));
+}
+
+#if USE_INTRINSICS
+static inline bool isNaN(const cv::v_float32x4& p)
+{
+    return cv::v_check_any(p != p);
+}
+#endif
+
+inline size_t roundDownPow2(size_t x)
+{
+    size_t shift = 0;
+    while(x != 0)
+    {
+        shift++; x >>= 1;
+    }
+    return (size_t)(1ULL << (shift-1));
+}
+
+template<> class DataType<cv::Point3f>
+{
+public:
+    typedef float       value_type;
+    typedef value_type  work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_32F,
+           channels     = 3,
+           fmt          = (int)'f',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<cv::Vec3f>
+{
+public:
+    typedef float       value_type;
+    typedef value_type  work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_32F,
+           channels     = 3,
+           fmt          = (int)'f',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<cv::Vec4f>
+{
+public:
+    typedef float       value_type;
+    typedef value_type  work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_32F,
+           channels     = 4,
+           fmt          = (int)'f',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+
+typedef cv::Vec4f ptype;
+inline cv::Vec3f fromPtype(const ptype& x)
+{
+    return cv::Vec3f(x[0], x[1], x[2]);
+}
+
+inline ptype toPtype(const cv::Vec3f& x)
+{
+    return ptype(x[0], x[1], x[2], 0);
+}
+
+enum
+{
+    DEPTH_TYPE = DataType<depthType>::type,
+    POINT_TYPE = DataType<ptype    >::type,
+    COLOR_TYPE = DataType<ptype    >::type
+};
+
+typedef cv::Mat_< ptype > Points;
+typedef Points Normals;
+typedef Points Colors;
+
+typedef cv::Mat_< depthType > Depth;
+
+void makeFrameFromDepth(InputArray depth, OutputArray pyrPoints, OutputArray pyrNormals,
+                        const Matx33f intr, int levels, float depthFactor,
+                        float sigmaDepth, float sigmaSpatial, int kernelSize,
+                        float truncateThreshold);
+void buildPyramidPointsNormals(InputArray _points, InputArray _normals,
+                               OutputArrayOfArrays pyrPoints, OutputArrayOfArrays pyrNormals,
+                               int levels);
+
+struct Intr
+{
+    /** @brief Camera intrinsics */
+    /** Reprojects screen point to camera space given z coord. */
+    struct Reprojector
+    {
+        Reprojector() {}
+        inline Reprojector(Intr intr)
+        {
+            fxinv = 1.f/intr.fx, fyinv = 1.f/intr.fy;
+            cx = intr.cx, cy = intr.cy;
+        }
+        template<typename T>
+        inline cv::Point3_<T> operator()(cv::Point3_<T> p) const
+        {
+            T x = p.z * (p.x - cx) * fxinv;
+            T y = p.z * (p.y - cy) * fyinv;
+            return cv::Point3_<T>(x, y, p.z);
+        }
+
+        float fxinv, fyinv, cx, cy;
+    };
+
+    /** Projects camera space vector onto screen */
+    struct Projector
+    {
+        inline Projector(Intr intr) : fx(intr.fx), fy(intr.fy), cx(intr.cx), cy(intr.cy) { }
+        template<typename T>
+        inline cv::Point_<T> operator()(cv::Point3_<T> p) const
+        {
+            T invz = T(1)/p.z;
+            T x = fx*(p.x*invz) + cx;
+            T y = fy*(p.y*invz) + cy;
+            return cv::Point_<T>(x, y);
+        }
+        template<typename T>
+        inline cv::Point_<T> operator()(cv::Point3_<T> p, cv::Point3_<T>& pixVec) const
+        {
+            T invz = T(1)/p.z;
+            pixVec = cv::Point3_<T>(p.x*invz, p.y*invz, 1);
+            T x = fx*pixVec.x + cx;
+            T y = fy*pixVec.y + cy;
+            return cv::Point_<T>(x, y);
+        }
+        float fx, fy, cx, cy;
+    };
+    Intr() : fx(), fy(), cx(), cy() { }
+    Intr(float _fx, float _fy, float _cx, float _cy) : fx(_fx), fy(_fy), cx(_cx), cy(_cy) { }
+    Intr(cv::Matx33f m) : fx(m(0, 0)), fy(m(1, 1)), cx(m(0, 2)), cy(m(1, 2)) { }
+    // scale intrinsics to pyramid level
+    inline Intr scale(int pyr) const
+    {
+        float factor = (1.f /(1 << pyr));
+        return Intr(fx*factor, fy*factor, cx*factor, cy*factor);
+    }
+    inline Reprojector makeReprojector() const { return Reprojector(*this); }
+    inline Projector   makeProjector()   const { return Projector(*this);   }
+
+    inline cv::Matx33f getMat() const { return Matx33f(fx, 0, cx, 0, fy, cy, 0, 0, 1); }
+
+    float fx, fy, cx, cy;
+};
+
+} // namespace cv
+
+
+#endif // include guard
diff --git a/modules/3d/src/rgbd/volume.cpp b/modules/3d/src/rgbd/volume.cpp
new file mode 100644
index 0000000000..6c07d27b25
--- /dev/null
+++ b/modules/3d/src/rgbd/volume.cpp
@@ -0,0 +1,120 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../precomp.hpp"
+#include "tsdf.hpp"
+#include "hash_tsdf.hpp"
+#include "colored_tsdf.hpp"
+
+namespace cv
+{
+
+Ptr<VolumeParams> VolumeParams::defaultParams(int _volumeKind)
+{
+    VolumeParams params;
+    params.kind              = _volumeKind;
+    params.maxWeight         = 64;
+    params.raycastStepFactor = 0.25f;
+    params.unitResolution    = 0;  // unitResolution not used for TSDF
+    float volumeSize         = 3.0f;
+    Matx44f pose = Affine3f().translate(Vec3f(-volumeSize / 2.f, -volumeSize / 2.f, 0.5f)).matrix;
+    params.pose = Mat(pose);
+
+    if(params.kind == VolumeKind::TSDF)
+    {
+        params.resolutionX = 512;
+        params.resolutionY = 512;
+        params.resolutionZ = 512;
+        params.voxelSize           = volumeSize / 512.f;
+        params.depthTruncThreshold = 0.f;  // depthTruncThreshold not required for TSDF
+        params.tsdfTruncDist = 7 * params.voxelSize;  //! About 0.04f in meters
+        return makePtr<VolumeParams>(params);
+    }
+    else if(params.kind == VolumeKind::HASHTSDF)
+    {
+        params.unitResolution      = 16;
+        params.voxelSize           = volumeSize / 512.f;
+        params.depthTruncThreshold = 4.f;
+        params.tsdfTruncDist = 7 * params.voxelSize;  //! About 0.04f in meters
+        return makePtr<VolumeParams>(params);
+    }
+    else if (params.kind == VolumeKind::COLOREDTSDF)
+    {
+        params.resolutionX = 512;
+        params.resolutionY = 512;
+        params.resolutionZ = 512;
+        params.voxelSize = volumeSize / 512.f;
+        params.depthTruncThreshold = 0.f;  // depthTruncThreshold not required for TSDF
+        params.tsdfTruncDist = 7 * params.voxelSize;  //! About 0.04f in meters
+        return makePtr<VolumeParams>(params);
+    }
+    CV_Error(Error::StsBadArg, "Invalid VolumeType does not have parameters");
+}
+
+Ptr<VolumeParams> VolumeParams::coarseParams(int _volumeKind)
+{
+    Ptr<VolumeParams> params = defaultParams(_volumeKind);
+
+    params->raycastStepFactor = 0.75f;
+    float volumeSize          = 3.0f;
+    if(params->kind == VolumeKind::TSDF)
+    {
+        params->resolutionX = 128;
+        params->resolutionY = 128;
+        params->resolutionZ = 128;
+        params->voxelSize  = volumeSize / 128.f;
+        params->tsdfTruncDist = 2 * params->voxelSize;  //! About 0.04f in meters
+        return params;
+    }
+    else if(params->kind == VolumeKind::HASHTSDF)
+    {
+        params->voxelSize = volumeSize / 128.f;
+        params->tsdfTruncDist = 2 * params->voxelSize;  //! About 0.04f in meters
+        return params;
+    }
+    else if (params->kind == VolumeKind::COLOREDTSDF)
+    {
+        params->resolutionX = 128;
+        params->resolutionY = 128;
+        params->resolutionZ = 128;
+        params->voxelSize = volumeSize / 128.f;
+        params->tsdfTruncDist = 2 * params->voxelSize;  //! About 0.04f in meters
+        return params;
+    }
+    CV_Error(Error::StsBadArg, "Invalid VolumeType does not have parameters");
+}
+
+Ptr<Volume> makeVolume(const Ptr<VolumeParams>& _volumeParams)
+{
+    int kind = _volumeParams->kind;
+    if(kind == VolumeParams::VolumeKind::TSDF)
+        return makeTSDFVolume(*_volumeParams);
+    else if(kind == VolumeParams::VolumeKind::HASHTSDF)
+        return makeHashTSDFVolume(*_volumeParams);
+    else if(kind == VolumeParams::VolumeKind::COLOREDTSDF)
+        return makeColoredTSDFVolume(*_volumeParams);
+    CV_Error(Error::StsBadArg, "Invalid VolumeType does not have parameters");
+}
+
+Ptr<Volume> makeVolume(int _volumeKind, float _voxelSize, Matx44f _pose,
+                       float _raycastStepFactor, float _truncDist, int _maxWeight, float _truncateThreshold,
+                       int _resolutionX, int _resolutionY, int _resolutionZ)
+{
+    Point3i _presolution(_resolutionX, _resolutionY, _resolutionZ);
+    if (_volumeKind == VolumeParams::VolumeKind::TSDF)
+    {
+        return makeTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _presolution);
+    }
+    else if (_volumeKind == VolumeParams::VolumeKind::HASHTSDF)
+    {
+        return makeHashTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold);
+    }
+    else if (_volumeKind == VolumeParams::VolumeKind::COLOREDTSDF)
+    {
+        return makeColoredTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _presolution);
+    }
+    CV_Error(Error::StsBadArg, "Invalid VolumeType does not have parameters");
+}
+
+}  // namespace cv
diff --git a/modules/3d/src/solvepnp.cpp b/modules/3d/src/solvepnp.cpp
index e358557859..2d67126309 100644
--- a/modules/3d/src/solvepnp.cpp
+++ b/modules/3d/src/solvepnp.cpp
@@ -689,8 +689,8 @@ static void exponentialMapToSE3Inv(const Mat& twist, Mat& R1, Mat& t1)
     Rodrigues(rvec, R);
 
     double theta = sqrt(wx*wx + wy*wy + wz*wz);
-    double sinc = std::fabs(theta) < 1e-8 ? 1 : sin(theta) / theta;
-    double mcosc = (std::fabs(theta) < 1e-8) ? 0.5 : (1-cos(theta)) / (theta*theta);
+    double sinc = std::fabs(theta) < 1e-8 ? 1 : std::sin(theta) / theta;
+    double mcosc = (std::fabs(theta) < 1e-8) ? 0.5 : (1- std::cos(theta)) / (theta*theta);
     double msinc = (std::abs(theta) < 1e-8) ? (1/6.0) : (1-sinc) / (theta*theta);
 
     Matx31d dt;
diff --git a/modules/3d/src/undistort.dispatch.cpp b/modules/3d/src/undistort.dispatch.cpp
index 289ec45efe..e69f5bfe23 100644
--- a/modules/3d/src/undistort.dispatch.cpp
+++ b/modules/3d/src/undistort.dispatch.cpp
@@ -553,7 +553,7 @@ static Point2f mapPointSpherical(const Point2f& p, float alpha, Vec4d* J, enum U
             double fy1 = iR/std::sqrt(1 - y1*y1);
             *J = Vec4d(fx1*(kx*x + k), fx1*ky*x, fy1*kx*y, fy1*(ky*y + k));
         }
-        return Point2f((float)asin(x1), (float)asin(y1));
+        return Point2f((float)std::asin(x1), (float)std::asin(y1));
     }
     CV_Error(Error::StsBadArg, "Unknown projection type");
 }
diff --git a/modules/3d/src/usac/utils.cpp b/modules/3d/src/usac/utils.cpp
index 9c4d2b91af..6bf460da2a 100644
--- a/modules/3d/src/usac/utils.cpp
+++ b/modules/3d/src/usac/utils.cpp
@@ -128,7 +128,7 @@ Matx33d Math::getSkewSymmetric(const Vec3d &v) {
 Matx33d Math::rotVec2RotMat (const Vec3d &v) {
     const double phi = sqrt(v[0]*v[0]+v[1]*v[1]+v[2]*v[2]);
     const double x = v[0] / phi, y = v[1] / phi, z = v[2] / phi;
-    const double a = sin(phi), b = cos(phi);
+    const double a = std::sin(phi), b = std::cos(phi);
     // R = I + sin(phi) * skew(v) + (1 - cos(phi) * skew(v)^2
     return Matx33d((b - 1)*y*y + (b - 1)*z*z + 1, -a*z - x*y*(b - 1), a*y - x*z*(b - 1),
      a*z - x*y*(b - 1), (b - 1)*x*x + (b - 1)*z*z + 1, -a*x - y*z*(b - 1),
@@ -144,10 +144,10 @@ Vec3d Math::rotMat2RotVec (const Matx33d &R) {
                                              R(0,2)-R(2,0),
                                              R(1,0)-R(0,1));
     } else if (3 - FLT_EPSILON > trace && trace > -1 + FLT_EPSILON) {
-        double theta = acos((trace - 1) / 2);
-        rot_vec = (theta / (2 * sin(theta))) * Vec3d(R(2,1)-R(1,2),
-                                                     R(0,2)-R(2,0),
-                                                     R(1,0)-R(0,1));
+        double theta = std::acos((trace - 1) / 2);
+        rot_vec = (theta / (2 * std::sin(theta))) * Vec3d(R(2,1)-R(1,2),
+                                                          R(0,2)-R(2,0),
+                                                          R(1,0)-R(0,1));
     } else {
         int a;
         if (R(0,0) > R(1,1))
diff --git a/modules/3d/test/ocl/test_tsdf.cpp b/modules/3d/test/ocl/test_tsdf.cpp
new file mode 100644
index 0000000000..d3214fe1b5
--- /dev/null
+++ b/modules/3d/test/ocl/test_tsdf.cpp
@@ -0,0 +1,524 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace opencv_test {
+namespace {
+
+using namespace cv;
+
+/** Reprojects screen point to camera space given z coord. */
+struct Reprojector
+{
+    Reprojector() {}
+    inline Reprojector(Matx33f intr)
+    {
+        fxinv = 1.f / intr(0, 0), fyinv = 1.f / intr(1, 1);
+        cx = intr(0, 2), cy = intr(1, 2);
+    }
+    template<typename T>
+    inline cv::Point3_<T> operator()(cv::Point3_<T> p) const
+    {
+        T x = p.z * (p.x - cx) * fxinv;
+        T y = p.z * (p.y - cy) * fyinv;
+        return cv::Point3_<T>(x, y, p.z);
+    }
+
+    float fxinv, fyinv, cx, cy;
+};
+
+template<class Scene>
+struct RenderInvoker : ParallelLoopBody
+{
+    RenderInvoker(Mat_<float>& _frame, Affine3f _pose,
+        Reprojector _reproj, float _depthFactor, bool _onlySemisphere)
+        : ParallelLoopBody(),
+        frame(_frame),
+        pose(_pose),
+        reproj(_reproj),
+        depthFactor(_depthFactor),
+        onlySemisphere(_onlySemisphere)
+    { }
+
+    virtual void operator ()(const cv::Range& r) const
+    {
+        for (int y = r.start; y < r.end; y++)
+        {
+            float* frameRow = frame[y];
+            for (int x = 0; x < frame.cols; x++)
+            {
+                float pix = 0;
+
+                Point3f orig = pose.translation();
+                // direction through pixel
+                Point3f screenVec = reproj(Point3f((float)x, (float)y, 1.f));
+                float xyt = 1.f / (screenVec.x * screenVec.x +
+                    screenVec.y * screenVec.y + 1.f);
+                Point3f dir = normalize(Vec3f(pose.rotation() * screenVec));
+                // screen space axis
+                dir.y = -dir.y;
+
+                const float maxDepth = 20.f;
+                const float maxSteps = 256;
+                float t = 0.f;
+                for (int step = 0; step < maxSteps && t < maxDepth; step++)
+                {
+                    Point3f p = orig + dir * t;
+                    float d = Scene::map(p, onlySemisphere);
+                    if (d < 0.000001f)
+                    {
+                        float depth = std::sqrt(t * t * xyt);
+                        pix = depth * depthFactor;
+                        break;
+                    }
+                    t += d;
+                }
+
+                frameRow[x] = pix;
+            }
+        }
+    }
+
+    Mat_<float>& frame;
+    Affine3f pose;
+    Reprojector reproj;
+    float depthFactor;
+    bool onlySemisphere;
+};
+
+struct Scene
+{
+    virtual ~Scene() {}
+    static Ptr<Scene> create(Size sz, Matx33f _intr, float _depthFactor, bool onlySemisphere);
+    virtual Mat depth(Affine3f pose) = 0;
+    virtual std::vector<Affine3f> getPoses() = 0;
+};
+
+struct SemisphereScene : Scene
+{
+    const int framesPerCycle = 72;
+    const float nCycles = 0.25f;
+    const Affine3f startPose = Affine3f(Vec3f(0.f, 0.f, 0.f), Vec3f(1.5f, 0.3f, -2.1f));
+
+    Size frameSize;
+    Matx33f intr;
+    float depthFactor;
+    bool onlySemisphere;
+
+    SemisphereScene(Size sz, Matx33f _intr, float _depthFactor, bool _onlySemisphere) :
+        frameSize(sz), intr(_intr), depthFactor(_depthFactor), onlySemisphere(_onlySemisphere)
+    { }
+
+    static float map(Point3f p, bool onlySemisphere)
+    {
+        float plane = p.y + 0.5f;
+        Point3f spherePose = p - Point3f(-0.0f, 0.3f, 1.1f);
+        float sphereRadius = 0.5f;
+        float sphere = (float)cv::norm(spherePose) - sphereRadius;
+        float sphereMinusBox = sphere;
+
+        float subSphereRadius = 0.05f;
+        Point3f subSpherePose = p - Point3f(0.3f, -0.1f, -0.3f);
+        float subSphere = (float)cv::norm(subSpherePose) - subSphereRadius;
+
+        float res;
+        if (!onlySemisphere)
+            res = min({ sphereMinusBox, subSphere, plane });
+        else
+            res = sphereMinusBox;
+
+        return res;
+    }
+
+    Mat depth(Affine3f pose) override
+    {
+        Mat_<float> frame(frameSize);
+        Reprojector reproj(intr);
+
+        Range range(0, frame.rows);
+        parallel_for_(range, RenderInvoker<SemisphereScene>(frame, pose, reproj, depthFactor, onlySemisphere));
+
+        return std::move(frame);
+    }
+
+    std::vector<Affine3f> getPoses() override
+    {
+        std::vector<Affine3f> poses;
+        for (int i = 0; i < framesPerCycle * nCycles; i++)
+        {
+            float angle = (float)(CV_2PI * i / framesPerCycle);
+            Affine3f pose;
+            pose = pose.rotate(startPose.rotation());
+            pose = pose.rotate(Vec3f(0.f, -0.5f, 0.f) * angle);
+            pose = pose.translate(Vec3f(startPose.translation()[0] * sin(angle),
+                startPose.translation()[1],
+                startPose.translation()[2] * cos(angle)));
+            poses.push_back(pose);
+        }
+
+        return poses;
+    }
+
+};
+
+Ptr<Scene> Scene::create(Size sz, Matx33f _intr, float _depthFactor, bool _onlySemisphere)
+{
+    return makePtr<SemisphereScene>(sz, _intr, _depthFactor, _onlySemisphere);
+}
+
+// this is a temporary solution
+// ----------------------------
+
+typedef cv::Vec4f ptype;
+typedef cv::Mat_< ptype > Points;
+typedef Points Normals;
+typedef Size2i Size;
+
+template<int p>
+inline float specPow(float x)
+{
+    if (p % 2 == 0)
+    {
+        float v = specPow<p / 2>(x);
+        return v * v;
+    }
+    else
+    {
+        float v = specPow<(p - 1) / 2>(x);
+        return v * v * x;
+    }
+}
+
+template<>
+inline float specPow<0>(float /*x*/)
+{
+    return 1.f;
+}
+
+template<>
+inline float specPow<1>(float x)
+{
+    return x;
+}
+
+inline cv::Vec3f fromPtype(const ptype& x)
+{
+    return cv::Vec3f(x[0], x[1], x[2]);
+}
+
+inline Point3f normalize(const Vec3f& v)
+{
+    double nv = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+    return v * (nv ? 1. / nv : 0.);
+}
+
+void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray image, Affine3f lightPose)
+{
+    Size sz = _points.size();
+    image.create(sz, CV_8UC4);
+
+    Points  points = _points.getMat();
+    Normals normals = _normals.getMat();
+
+    Mat_<Vec4b> img = image.getMat();
+
+    Range range(0, sz.height);
+    const int nstripes = -1;
+    parallel_for_(range, [&](const Range&)
+        {
+            for (int y = range.start; y < range.end; y++)
+            {
+                Vec4b* imgRow = img[y];
+                const ptype* ptsRow = points[y];
+                const ptype* nrmRow = normals[y];
+
+                for (int x = 0; x < sz.width; x++)
+                {
+                    Point3f p = fromPtype(ptsRow[x]);
+                    Point3f n = fromPtype(nrmRow[x]);
+
+                    Vec4b color;
+
+                    if (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z))
+                    {
+                        color = Vec4b(0, 32, 0, 0);
+                    }
+                    else
+                    {
+                        const float Ka = 0.3f;  //ambient coeff
+                        const float Kd = 0.5f;  //diffuse coeff
+                        const float Ks = 0.2f;  //specular coeff
+                        const int   sp = 20;  //specular power
+
+                        const float Ax = 1.f;   //ambient color,  can be RGB
+                        const float Dx = 1.f;   //diffuse color,  can be RGB
+                        const float Sx = 1.f;   //specular color, can be RGB
+                        const float Lx = 1.f;   //light color
+
+                        Point3f l = normalize(lightPose.translation() - Vec3f(p));
+                        Point3f v = normalize(-Vec3f(p));
+                        Point3f r = normalize(Vec3f(2.f * n * n.dot(l) - l));
+
+                        uchar ix = (uchar)((Ax * Ka * Dx + Lx * Kd * Dx * max(0.f, n.dot(l)) +
+                            Lx * Ks * Sx * specPow<sp>(max(0.f, r.dot(v)))) * 255.f);
+                        color = Vec4b(ix, ix, ix, 0);
+                    }
+
+                    imgRow[x] = color;
+                }
+            }
+        }, nstripes);
+}
+// ----------------------------
+
+static const bool display = false;
+static const bool parallelCheck = false;
+
+class Settings
+{
+public:
+    float depthFactor;
+    Matx33f intr;
+    Size frameSize;
+    Vec3f lightPose;
+
+    Ptr<Volume> volume;
+    Ptr<Scene> scene;
+    std::vector<Affine3f> poses;
+
+    Settings(bool useHashTSDF, bool onlySemisphere)
+    {
+        frameSize = Size(640, 480);
+
+        float fx, fy, cx, cy;
+        fx = fy = 525.f;
+        cx = frameSize.width / 2 - 0.5f;
+        cy = frameSize.height / 2 - 0.5f;
+        intr = Matx33f(fx,  0, cx,
+                        0, fy, cy,
+                        0,  0,  1);
+
+        // 5000 for the 16-bit PNG files
+        // 1 for the 32-bit float images in the ROS bag files
+        depthFactor = 5000;
+
+        Vec3i volumeDims = Vec3i::all(512); //number of voxels
+
+        float volSize = 3.f;
+        float voxelSize = volSize / 512.f; //meters
+
+        // default pose of volume cube
+        Affine3f volumePose = Affine3f().translate(Vec3f(-volSize / 2.f, -volSize / 2.f, 0.5f));
+        float tsdf_trunc_dist = 7 * voxelSize; // about 0.04f in meters
+        int tsdf_max_weight = 64;   //frames
+
+        float raycast_step_factor = 0.25f;  //in voxel sizes
+        // gradient delta factor is fixed at 1.0f and is not used
+        //p.gradient_delta_factor = 0.5f; //in voxel sizes
+
+        //p.lightPose = p.volume_pose.translation()/4; //meters
+        lightPose = Vec3f::all(0.f); //meters
+
+        // depth truncation is not used by default but can be useful in some scenes
+        float truncateThreshold = 0.f; //meters
+
+        VolumeParams::VolumeKind volumeKind = VolumeParams::VolumeKind::TSDF;
+
+        if (useHashTSDF)
+        {
+            volumeKind = VolumeParams::VolumeKind::HASHTSDF;
+            truncateThreshold = 4.f;
+        }
+        else
+        {
+            volSize = 3.f;
+            volumeDims = Vec3i::all(128); //number of voxels
+            voxelSize = volSize / 128.f;
+            tsdf_trunc_dist = 2 * voxelSize; // 0.04f in meters
+
+            raycast_step_factor = 0.75f;  //in voxel sizes
+        }
+
+        volume = makeVolume(volumeKind, voxelSize, volumePose.matrix,
+                            raycast_step_factor, tsdf_trunc_dist, tsdf_max_weight,
+                            truncateThreshold, volumeDims[0], volumeDims[1], volumeDims[2]);
+
+        scene = Scene::create(frameSize, intr, depthFactor, onlySemisphere);
+        poses = scene->getPoses();
+    }
+};
+
+void displayImage(Mat depth, Mat points, Mat normals, float depthFactor, Vec3f lightPose)
+{
+    Mat image;
+    patchNaNs(points);
+    imshow("depth", depth * (1.f / depthFactor / 4.f));
+    renderPointsNormals(points, normals, image, lightPose);
+    imshow("render", image);
+    waitKey(2000);
+}
+
+void normalsCheck(Mat normals)
+{
+    Vec4f vector;
+    for (auto pvector = normals.begin<Vec4f>(); pvector < normals.end<Vec4f>(); pvector++)
+    {
+        vector = *pvector;
+        if (!cvIsNaN(vector[0]))
+        {
+            float length = vector[0] * vector[0] +
+                vector[1] * vector[1] +
+                vector[2] * vector[2];
+            ASSERT_LT(abs(1 - length), 0.0001f) << "There is normal with length != 1";
+        }
+    }
+}
+
+int counterOfValid(Mat points)
+{
+    Vec4f* v;
+    int i, j;
+    int count = 0;
+    for (i = 0; i < points.rows; ++i)
+    {
+        v = (points.ptr<Vec4f>(i));
+        for (j = 0; j < points.cols; ++j)
+        {
+            if ((v[j])[0] != 0 ||
+                (v[j])[1] != 0 ||
+                (v[j])[2] != 0)
+            {
+                count++;
+            }
+        }
+    }
+    return count;
+}
+
+void normal_test(bool isHashTSDF, bool isRaycast, bool isFetchPointsNormals, bool isFetchNormals)
+{
+    Settings settings(isHashTSDF, false);
+
+    Mat depth = settings.scene->depth(settings.poses[0]);
+    UMat udepth;
+    depth.copyTo(udepth);
+    UMat upoints, unormals, utmpnormals;
+    UMat unewPoints, unewNormals;
+    Mat  points, normals;
+    AccessFlag af = ACCESS_READ;
+
+    settings.volume->integrate(udepth, settings.depthFactor, settings.poses[0].matrix, settings.intr);
+
+    if (isRaycast)
+    {
+        settings.volume->raycast(settings.poses[0].matrix, settings.intr, settings.frameSize, upoints, unormals);
+    }
+    if (isFetchPointsNormals)
+    {
+        settings.volume->fetchPointsNormals(upoints, unormals);
+    }
+    if (isFetchNormals)
+    {
+        settings.volume->fetchPointsNormals(upoints, utmpnormals);
+        settings.volume->fetchNormals(upoints, unormals);
+    }
+
+    normals = unormals.getMat(af);
+    points  = upoints.getMat(af);
+
+    auto normalCheck = [](Vec4f& vector, const int*)
+    {
+        if (!cvIsNaN(vector[0]))
+        {
+            float length = vector[0] * vector[0] +
+                vector[1] * vector[1] +
+                vector[2] * vector[2];
+            ASSERT_LT(abs(1 - length), 0.0001f) << "There is normal with length != 1";
+        }
+    };
+
+    if (parallelCheck)
+        normals.forEach<Vec4f>(normalCheck);
+    else
+        normalsCheck(normals);
+
+    if (isRaycast && display)
+        displayImage(depth, points, normals, settings.depthFactor, settings.lightPose);
+
+    if (isRaycast)
+    {
+        settings.volume->raycast(settings.poses[17].matrix, settings.intr, settings.frameSize, unewPoints, unewNormals);
+        normals = unewNormals.getMat(af);
+        points  = unewPoints.getMat(af);
+        normalsCheck(normals);
+
+        if (parallelCheck)
+            normals.forEach<Vec4f>(normalCheck);
+        else
+            normalsCheck(normals);
+
+        if (display)
+            displayImage(depth, points, normals, settings.depthFactor, settings.lightPose);
+    }
+}
+
+void valid_points_test(bool isHashTSDF)
+{
+    Settings settings(isHashTSDF, true);
+
+    Mat depth = settings.scene->depth(settings.poses[0]);
+    UMat udepth;
+    depth.copyTo(udepth);
+    UMat upoints, unormals, unewPoints, unewNormals;
+    AccessFlag af = ACCESS_READ;
+    Mat  points, normals;
+    int anfas, profile;
+
+    settings.volume->integrate(udepth, settings.depthFactor, settings.poses[0].matrix, settings.intr);
+    settings.volume->raycast(settings.poses[0].matrix, settings.intr, settings.frameSize, upoints, unormals);
+    normals = unormals.getMat(af);
+    points  = upoints.getMat(af);
+    patchNaNs(points);
+    anfas = counterOfValid(points);
+
+    ASSERT_NE(anfas, 0) << "There is no points in anfas";
+
+    if (display)
+        displayImage(depth, points, normals, settings.depthFactor, settings.lightPose);
+
+    settings.volume->raycast(settings.poses[17].matrix, settings.intr, settings.frameSize, unewPoints, unewNormals);
+    normals = unewNormals.getMat(af);
+    points  = unewPoints.getMat(af);
+    patchNaNs(points);
+    profile = counterOfValid(points);
+
+    ASSERT_NE(profile, 0) << "There is no points in profile";
+
+    // TODO: why profile == 2*anfas ?
+    float percentValidity = float(anfas) / float(profile);
+
+    ASSERT_LT(abs(0.5 - percentValidity), 0.3) << "percentValidity out of [0.3; 0.7] (percentValidity=" << percentValidity << ")";
+
+    if (display)
+        displayImage(depth, points, normals, settings.depthFactor, settings.lightPose);
+}
+
+TEST(TSDF_GPU, raycast_normals)      { normal_test(false, true, false, false); }
+TEST(TSDF_GPU, fetch_points_normals) { normal_test(false, false, true, false); }
+TEST(TSDF_GPU, fetch_normals)        { normal_test(false, false, false, true); }
+TEST(TSDF_GPU, valid_points)         { valid_points_test(false); }
+
+TEST(HashTSDF_GPU, raycast_normals)      { normal_test(true, true, false, false); }
+TEST(HashTSDF_GPU, fetch_points_normals) { normal_test(true, false, true, false); }
+TEST(HashTSDF_GPU, fetch_normals)        { normal_test(true, false, false, true); }
+TEST(HashTSDF_GPU, valid_points)         { valid_points_test(true); }
+
+}
+}  // namespace
+
+#endif
diff --git a/modules/3d/test/test_normal.cpp b/modules/3d/test/test_normal.cpp
new file mode 100644
index 0000000000..b4ce20d2f2
--- /dev/null
+++ b/modules/3d/test/test_normal.cpp
@@ -0,0 +1,421 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "test_precomp.hpp"
+#include <opencv2/3d.hpp>
+
+namespace opencv_test { namespace {
+
+#if 0
+Point3f
+rayPlaneIntersection(Point2f uv, const Mat& centroid, const Mat& normal, const Mat_<float>& Kinv)
+{
+    Matx33d dKinv(Kinv);
+    Vec3d dNormal(normal);
+    return rayPlaneIntersection(Vec3d(uv.x, uv.y, 1), centroid.dot(normal), dNormal, dKinv);
+}
+#endif
+
+Vec3f rayPlaneIntersection(const Vec3d& uv1, double centroid_dot_normal, const Vec3d& normal, const Matx33d& Kinv)
+{
+    Matx31d L = Kinv * uv1; //a ray passing through camera optical center
+    //and uv.
+    L = L * (1.0 / cv::norm(L));
+    double LdotNormal = L.dot(normal);
+    double d;
+    if (std::fabs(LdotNormal) > 1e-9)
+    {
+        d = centroid_dot_normal / LdotNormal;
+    }
+    else
+    {
+        d = 1.0;
+        std::cout << "warning, LdotNormal nearly 0! " << LdotNormal << std::endl;
+        std::cout << "contents of L, Normal: " << Mat(L) << ", " << Mat(normal) << std::endl;
+    }
+    Vec3f xyz((float)(d * L(0)), (float)(d * L(1)), (float)(d * L(2)));
+    return xyz;
+}
+
+const int W = 640;
+const int H = 480;
+//int window_size = 5;
+float focal_length = 525;
+float cx = W / 2.f + 0.5f;
+float cy = H / 2.f + 0.5f;
+
+Mat K = (Mat_<double>(3, 3) << focal_length, 0, cx, 0, focal_length, cy, 0, 0, 1);
+Mat Kinv = K.inv();
+
+void points3dToDepth16U(const Mat_<Vec3f>& points3d, Mat& depthMap);
+
+void points3dToDepth16U(const Mat_<Vec3f>& points3d, Mat& depthMap)
+{
+    std::vector<Point3f> points3dvec;
+    for (int i = 0; i < H; i++)
+        for (int j = 0; j < W; j++)
+            points3dvec.push_back(Point3f(points3d(i, j)[0], points3d(i, j)[1], points3d(i, j)[2]));
+
+    std::vector<Point2f> img_points;
+    depthMap = Mat::zeros(H, W, CV_32F);
+    Vec3f R(0.0, 0.0, 0.0);
+    Vec3f T(0.0, 0.0, 0.0);
+    cv::projectPoints(points3dvec, R, T, K, Mat(), img_points);
+
+    int index = 0;
+    for (int i = 0; i < H; i++)
+    {
+
+        for (int j = 0; j < W; j++)
+        {
+            float value = (points3d.at<Vec3f>(i, j))[2]; // value is the z
+            depthMap.at<float>(cvRound(img_points[index].y), cvRound(img_points[index].x)) = value;
+            index++;
+        }
+    }
+    depthMap.convertTo(depthMap, CV_16U, 1000);
+}
+
+static RNG rng;
+struct Plane
+{
+    Vec3d n, p;
+    double p_dot_n;
+    Plane()
+    {
+        n[0] = rng.uniform(-0.5, 0.5);
+        n[1] = rng.uniform(-0.5, 0.5);
+        n[2] = -0.3; //rng.uniform(-1.f, 0.5f);
+        n = n / cv::norm(n);
+        set_d((float)rng.uniform(-2.0, 0.6));
+    }
+
+    void
+        set_d(float d)
+    {
+        p = Vec3d(0, 0, d / n[2]);
+        p_dot_n = p.dot(n);
+    }
+
+    Vec3f
+        intersection(float u, float v, const Matx33f& Kinv_in) const
+    {
+        return rayPlaneIntersection(Vec3d(u, v, 1), p_dot_n, n, Kinv_in);
+    }
+};
+
+void gen_points_3d(std::vector<Plane>& planes_out, Mat_<unsigned char> &plane_mask, Mat& points3d, Mat& normals,
+                   int n_planes)
+{
+    std::vector<Plane> planes;
+    for (int i = 0; i < n_planes; i++)
+    {
+        Plane px;
+        for (int j = 0; j < 1; j++)
+        {
+            px.set_d(rng.uniform(-3.f, -0.5f));
+            planes.push_back(px);
+        }
+    }
+    Mat_ < Vec3f > outp(H, W);
+    Mat_ < Vec3f > outn(H, W);
+    plane_mask.create(H, W);
+
+    // n  ( r - r_0) = 0
+    // n * r_0 = d
+    //
+    // r_0 = (0,0,0)
+    // r[0]
+    for (int v = 0; v < H; v++)
+    {
+        for (int u = 0; u < W; u++)
+        {
+            unsigned int plane_index = (unsigned int)((u / float(W)) * planes.size());
+            Plane plane = planes[plane_index];
+            outp(v, u) = plane.intersection((float)u, (float)v, Kinv);
+            outn(v, u) = plane.n;
+            plane_mask(v, u) = (uchar)plane_index;
+        }
+    }
+    planes_out = planes;
+    points3d = outp;
+    normals = outn;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class RgbdNormalsTest
+{
+public:
+    RgbdNormalsTest() { }
+    ~RgbdNormalsTest() { }
+
+    void run()
+    {
+        Mat_<unsigned char> plane_mask;
+        for (unsigned char i = 0; i < 3; ++i)
+        {
+            RgbdNormals::RgbdNormalsMethod method = RgbdNormals::RGBD_NORMALS_METHOD_FALS;;
+            // inner vector: whether it's 1 plane or 3 planes
+            // outer vector: float or double
+            std::vector<std::vector<float> > errors(2);
+            errors[0].resize(4);
+            errors[1].resize(4);
+            switch (i)
+            {
+            case 0:
+                method = RgbdNormals::RGBD_NORMALS_METHOD_FALS;
+                CV_LOG_INFO(NULL, "*** FALS");
+                errors[0][0] = 0.006f;
+                errors[0][1] = 0.03f;
+                errors[1][0] = 0.0001f;
+                errors[1][1] = 0.02f;
+                break;
+            case 1:
+                method = RgbdNormals::RGBD_NORMALS_METHOD_LINEMOD;
+                CV_LOG_INFO(NULL, "*** LINEMOD");
+                errors[0][0] = 0.04f;
+                errors[0][1] = 0.07f;
+                errors[0][2] = 0.04f; // depth 16U 1 plane
+                errors[0][3] = 0.07f; // depth 16U 3 planes
+
+                errors[1][0] = 0.05f;
+                errors[1][1] = 0.08f;
+                errors[1][2] = 0.05f; // depth 16U 1 plane
+                errors[1][3] = 0.08f; // depth 16U 3 planes
+                break;
+            case 2:
+                method = RgbdNormals::RGBD_NORMALS_METHOD_SRI;
+                CV_LOG_INFO(NULL, "*** SRI");
+                errors[0][0] = 0.02f;
+                errors[0][1] = 0.04f;
+                errors[1][0] = 0.02f;
+                errors[1][1] = 0.04f;
+                break;
+            }
+
+            for (unsigned char j = 0; j < 2; ++j)
+            {
+                int depth = (j % 2 == 0) ? CV_32F : CV_64F;
+                if (depth == CV_32F)
+                {
+                    CV_LOG_INFO(NULL, " * float");
+                }
+                else
+                {
+                    CV_LOG_INFO(NULL, " * double");
+                }
+
+                Ptr<RgbdNormals> normals_computer = RgbdNormals::create(H, W, depth, K, 5, method);
+                normals_computer->cache();
+
+                std::vector<Plane> plane_params;
+                Mat points3d, ground_normals;
+                // 1 plane, continuous scene, very low error..
+                CV_LOG_INFO(NULL, "1 plane - input 3d points");
+                float err_mean = 0;
+                for (int ii = 0; ii < 5; ++ii)
+                {
+                    gen_points_3d(plane_params, plane_mask, points3d, ground_normals, 1);
+                    err_mean += testit(points3d, ground_normals, normals_computer);
+                }
+                CV_LOG_INFO(NULL, "mean diff: " << (err_mean / 5));
+                EXPECT_LE(err_mean / 5, errors[j][0]);
+
+                // 3 discontinuities, more error expected.
+                CV_LOG_INFO(NULL, "3 planes");
+                err_mean = 0;
+                for (int ii = 0; ii < 5; ++ii)
+                {
+                    gen_points_3d(plane_params, plane_mask, points3d, ground_normals, 3);
+                    err_mean += testit(points3d, ground_normals, normals_computer);
+                }
+                CV_LOG_INFO(NULL, "mean diff: " << (err_mean / 5));
+                EXPECT_LE(err_mean / 5, errors[j][1]);
+
+                if (method == RgbdNormals::RGBD_NORMALS_METHOD_LINEMOD)
+                {
+                    // depth 16U test
+                    CV_LOG_INFO(NULL, "** depth 16U - 1 plane");
+                    err_mean = 0;
+                    for (int ii = 0; ii < 5; ++ii)
+                    {
+                        gen_points_3d(plane_params, plane_mask, points3d, ground_normals, 1);
+                        Mat depthMap;
+                        points3dToDepth16U(points3d, depthMap);
+                        err_mean += testit(depthMap, ground_normals, normals_computer);
+                    }
+                    CV_LOG_INFO(NULL, "mean diff: " << (err_mean / 5));
+                    EXPECT_LE(err_mean / 5, errors[j][2]);
+
+                    CV_LOG_INFO(NULL, "** depth 16U - 3 plane");
+                    err_mean = 0;
+                    for (int ii = 0; ii < 5; ++ii)
+                    {
+                        gen_points_3d(plane_params, plane_mask, points3d, ground_normals, 3);
+                        Mat depthMap;
+                        points3dToDepth16U(points3d, depthMap);
+                        err_mean += testit(depthMap, ground_normals, normals_computer);
+                    }
+                    CV_LOG_INFO(NULL, "mean diff: " << (err_mean / 5));
+                    EXPECT_LE(err_mean / 5, errors[j][3]);
+                }
+            }
+        }
+
+        //TODO test NaNs in data
+    }
+
+    float testit(const Mat& points3d, const Mat& in_ground_normals, const Ptr<RgbdNormals>& normals_computer)
+    {
+        TickMeter tm;
+        tm.start();
+        Mat in_normals;
+        if (normals_computer->getMethod() == RgbdNormals::RGBD_NORMALS_METHOD_LINEMOD && points3d.channels() == 3)
+        {
+            std::vector<Mat> channels;
+            split(points3d, channels);
+            normals_computer->apply(channels[2], in_normals);
+        }
+        else
+            normals_computer->apply(points3d, in_normals);
+        tm.stop();
+
+        Mat_<Vec3f> normals, ground_normals;
+        in_normals.convertTo(normals, CV_32FC3);
+        in_ground_normals.convertTo(ground_normals, CV_32FC3);
+
+        float err = 0;
+        for (int y = 0; y < normals.rows; ++y)
+            for (int x = 0; x < normals.cols; ++x)
+            {
+                Vec3f vec1 = normals(y, x), vec2 = ground_normals(y, x);
+                vec1 = vec1 / cv::norm(vec1);
+                vec2 = vec2 / cv::norm(vec2);
+
+                float dot = vec1.dot(vec2);
+                // Just for rounding errors
+                if (std::abs(dot) < 1)
+                    err += std::min(std::acos(dot), std::acos(-dot));
+            }
+
+        err /= normals.rows * normals.cols;
+        CV_LOG_INFO(NULL, "Average error: " << err << " Speed: " << tm.getTimeMilli() << " ms");
+        return err;
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class RgbdPlaneTest
+{
+public:
+    RgbdPlaneTest() { }
+    ~RgbdPlaneTest() { }
+
+    void run()
+    {
+        std::vector<Plane> planes;
+        Mat points3d, ground_normals;
+        Mat_<unsigned char> plane_mask;
+        gen_points_3d(planes, plane_mask, points3d, ground_normals, 1);
+        testit(planes, plane_mask, points3d); // 1 plane, continuous scene, very low error..
+        for (int ii = 0; ii < 10; ii++)
+        {
+            gen_points_3d(planes, plane_mask, points3d, ground_normals, 3); //three planes
+            testit(planes, plane_mask, points3d); // 3 discontinuities, more error expected.
+        }
+    }
+
+    void testit(const std::vector<Plane>& gt_planes, const Mat& gt_plane_mask, const Mat& points3d)
+    {
+        for (char i_test = 0; i_test < 2; ++i_test)
+        {
+            TickMeter tm1, tm2;
+            Mat plane_mask;
+            std::vector<Vec4f> plane_coefficients;
+
+            if (i_test == 0)
+            {
+                tm1.start();
+                // First, get the normals
+                int depth = CV_32F;
+                Ptr<RgbdNormals> normals_computer = RgbdNormals::create(H, W, depth, K, 5, RgbdNormals::RGBD_NORMALS_METHOD_FALS);
+                Mat normals;
+                normals_computer->apply(points3d, normals);
+                tm1.stop();
+
+                tm2.start();
+                findPlanes(points3d, normals, plane_mask, plane_coefficients);
+                tm2.stop();
+            }
+            else
+            {
+                tm2.start();
+                findPlanes(points3d, noArray(), plane_mask, plane_coefficients);
+                tm2.stop();
+            }
+
+            // Compare each found plane to each ground truth plane
+            int n_planes = (int)plane_coefficients.size();
+            int n_gt_planes = (int)gt_planes.size();
+            Mat_<int> matching(n_gt_planes, n_planes);
+            for (int j = 0; j < n_gt_planes; ++j)
+            {
+                Mat gt_mask = gt_plane_mask == j;
+                int n_gt = countNonZero(gt_mask);
+                int n_max = 0, i_max = 0;
+                for (int i = 0; i < n_planes; ++i)
+                {
+                    Mat dst;
+                    bitwise_and(gt_mask, plane_mask == i, dst);
+                    matching(j, i) = countNonZero(dst);
+                    if (matching(j, i) > n_max)
+                    {
+                        n_max = matching(j, i);
+                        i_max = i;
+                    }
+                }
+                // Get the best match
+                ASSERT_LE(float(n_max - n_gt) / n_gt, 0.001);
+                // Compare the normals
+                Vec3d normal(plane_coefficients[i_max][0], plane_coefficients[i_max][1], plane_coefficients[i_max][2]);
+                ASSERT_GE(std::abs(gt_planes[j].n.dot(normal)), 0.95);
+            }
+
+            CV_LOG_INFO(NULL, "Speed: ");
+            if (i_test == 0)
+                CV_LOG_INFO(NULL, "normals " << tm1.getTimeMilli() << " ms and ");
+            CV_LOG_INFO(NULL, "plane " << tm2.getTimeMilli() << " ms");
+        }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(RGBD_Normals, compute)
+{
+  RgbdNormalsTest test;
+  test.run();
+}
+
+TEST(RGBD_Plane, compute)
+{
+  RgbdPlaneTest test;
+  test.run();
+}
+
+TEST(RGBD_Plane, regression_2309_valgrind_check)
+{
+    Mat points(640, 480, CV_32FC3, Scalar::all(0));
+    // Note, 640%9 is 1 and 480%9 is 3
+    int blockSize = 9;
+
+    Mat mask;
+    std::vector<cv::Vec4f> planes;
+    // Will corrupt memory; valgrind gets triggered
+    findPlanes(points, noArray(), mask, planes, blockSize);
+}
+
+}} // namespace
diff --git a/modules/3d/test/test_odometry.cpp b/modules/3d/test/test_odometry.cpp
new file mode 100644
index 0000000000..71569f5304
--- /dev/null
+++ b/modules/3d/test/test_odometry.cpp
@@ -0,0 +1,406 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+#define SHOW_DEBUG_IMAGES  0
+
+static
+void warpFrame(const Mat& image, const Mat& depth, const Mat& rvec, const Mat& tvec, const Mat& K,
+               Mat& warpedImage, Mat& warpedDepth)
+{
+    CV_Assert(!image.empty());
+    CV_Assert(image.type() == CV_8UC1);
+
+    CV_Assert(depth.size() == image.size());
+    CV_Assert(depth.type() == CV_32FC1);
+
+    CV_Assert(!rvec.empty());
+    CV_Assert(rvec.total() == 3);
+    CV_Assert(rvec.type() == CV_64FC1);
+
+    CV_Assert(!tvec.empty());
+    CV_Assert(tvec.size() == Size(1, 3));
+    CV_Assert(tvec.type() == CV_64FC1);
+
+    warpedImage.create(image.size(), CV_8UC1);
+    warpedImage = Scalar(0);
+    warpedDepth.create(image.size(), CV_32FC1);
+    warpedDepth = Scalar(FLT_MAX);
+
+    Mat cloud;
+    depthTo3d(depth, K, cloud);
+    Mat Rt = Mat::eye(4, 4, CV_64FC1);
+    {
+        Mat R, dst;
+        cv::Rodrigues(rvec, R);
+
+        dst = Rt(Rect(0,0,3,3));
+        R.copyTo(dst);
+
+        dst = Rt(Rect(3,0,1,3));
+        tvec.copyTo(dst);
+    }
+    Mat warpedCloud, warpedImagePoints;
+    perspectiveTransform(cloud, warpedCloud, Rt);
+    projectPoints(warpedCloud.reshape(3, 1), Mat(3,1,CV_32FC1, Scalar(0)), Mat(3,1,CV_32FC1, Scalar(0)), K, Mat(1,5,CV_32FC1, Scalar(0)), warpedImagePoints);
+    warpedImagePoints = warpedImagePoints.reshape(2, cloud.rows);
+    Rect r(0, 0, image.cols, image.rows);
+    for(int y = 0; y < cloud.rows; y++)
+    {
+        for(int x = 0; x < cloud.cols; x++)
+        {
+            Point p = warpedImagePoints.at<Point2f>(y,x);
+            if(r.contains(p))
+            {
+                float curDepth = warpedDepth.at<float>(p.y, p.x);
+                float newDepth = warpedCloud.at<Point3f>(y, x).z;
+                if(newDepth < curDepth && newDepth > 0)
+                {
+                    warpedImage.at<uchar>(p.y, p.x) = image.at<uchar>(y,x);
+                    warpedDepth.at<float>(p.y, p.x) = newDepth;
+                }
+            }
+        }
+    }
+    warpedDepth.setTo(std::numeric_limits<float>::quiet_NaN(), warpedDepth > 100);
+}
+
+static
+void dilateFrame(Mat& image, Mat& depth)
+{
+    CV_Assert(!image.empty());
+    CV_Assert(image.type() == CV_8UC1);
+
+    CV_Assert(!depth.empty());
+    CV_Assert(depth.type() == CV_32FC1);
+    CV_Assert(depth.size() == image.size());
+
+    Mat mask(image.size(), CV_8UC1, Scalar(255));
+    for(int y = 0; y < depth.rows; y++)
+        for(int x = 0; x < depth.cols; x++)
+            if(cvIsNaN(depth.at<float>(y,x)) || depth.at<float>(y,x) > 10 || depth.at<float>(y,x) <= FLT_EPSILON)
+                mask.at<uchar>(y,x) = 0;
+
+    image.setTo(255, ~mask);
+    Mat minImage;
+    erode(image, minImage, Mat());
+
+    image.setTo(0, ~mask);
+    Mat maxImage;
+    dilate(image, maxImage, Mat());
+
+    depth.setTo(FLT_MAX, ~mask);
+    Mat minDepth;
+    erode(depth, minDepth, Mat());
+
+    depth.setTo(0, ~mask);
+    Mat maxDepth;
+    dilate(depth, maxDepth, Mat());
+
+    Mat dilatedMask;
+    dilate(mask, dilatedMask, Mat(), Point(-1,-1), 1);
+    for(int y = 0; y < depth.rows; y++)
+        for(int x = 0; x < depth.cols; x++)
+            if(!mask.at<uchar>(y,x) && dilatedMask.at<uchar>(y,x))
+            {
+                image.at<uchar>(y,x) = static_cast<uchar>(0.5f * (static_cast<float>(minImage.at<uchar>(y,x)) +
+                                                                  static_cast<float>(maxImage.at<uchar>(y,x))));
+                depth.at<float>(y,x) = 0.5f * (minDepth.at<float>(y,x) + maxDepth.at<float>(y,x));
+            }
+}
+
+class OdometryTest
+{
+public:
+    OdometryTest(const Ptr<Odometry>& _odometry,
+                 double _maxError1,
+                 double _maxError5,
+                 double _idError = DBL_EPSILON) :
+        odometry(_odometry),
+        maxError1(_maxError1),
+        maxError5(_maxError5),
+        idError(_idError)
+    { }
+
+    void readData(Mat& image, Mat& depth) const;
+    static Mat getCameraMatrix()
+    {
+        float fx = 525.0f, // default
+              fy = 525.0f,
+              cx = 319.5f,
+              cy = 239.5f;
+        Matx33f K(fx,  0, cx,
+                   0, fy, cy,
+                   0,  0,  1);
+        return Mat(K);
+    }
+    static void generateRandomTransformation(Mat& R, Mat& t);
+
+    void run();
+    void checkUMats();
+
+    Ptr<Odometry> odometry;
+    double maxError1;
+    double maxError5;
+    double idError;
+};
+
+
+void OdometryTest::readData(Mat& image, Mat& depth) const
+{
+    std::string dataPath = cvtest::TS::ptr()->get_data_path();
+    std::string imageFilename = dataPath + "/cv/rgbd/rgb.png";
+    std::string depthFilename = dataPath + "/cv/rgbd/depth.png";
+
+    image = imread(imageFilename,  0);
+    depth = imread(depthFilename, -1);
+
+    if(image.empty())
+    {
+        FAIL() << "Image " << imageFilename.c_str() << " can not be read" << std::endl;
+    }
+    if(depth.empty())
+    {
+        FAIL() << "Depth" << depthFilename.c_str() << "can not be read" << std::endl;
+    }
+
+    CV_DbgAssert(image.type() == CV_8UC1);
+    CV_DbgAssert(depth.type() == CV_16UC1);
+    {
+        Mat depth_flt;
+        depth.convertTo(depth_flt, CV_32FC1, 1.f/5000.f);
+        depth_flt.setTo(std::numeric_limits<float>::quiet_NaN(), depth_flt < FLT_EPSILON);
+        depth = depth_flt;
+    }
+}
+
+void OdometryTest::generateRandomTransformation(Mat& rvec, Mat& tvec)
+{
+    const float maxRotation = (float)(3.f / 180.f * CV_PI); //rad
+    const float maxTranslation = 0.02f; //m
+
+    RNG& rng = theRNG();
+    rvec.create(3, 1, CV_64FC1);
+    tvec.create(3, 1, CV_64FC1);
+
+    randu(rvec, Scalar(-1000), Scalar(1000));
+    normalize(rvec, rvec, rng.uniform(0.007f, maxRotation));
+
+    randu(tvec, Scalar(-1000), Scalar(1000));
+    normalize(tvec, tvec, rng.uniform(0.008f, maxTranslation));
+}
+
+void OdometryTest::checkUMats()
+{
+    Mat K = getCameraMatrix();
+
+    Mat image, depth;
+    readData(image, depth);
+
+    odometry->setCameraMatrix(K);
+
+    Mat calcRt;
+
+    UMat uimage, udepth, umask;
+    image.copyTo(uimage);
+    depth.copyTo(udepth);
+    Mat(image.size(), CV_8UC1, Scalar(255)).copyTo(umask);
+
+    bool isComputed = odometry->compute(uimage, udepth, umask,
+                                        uimage, udepth, umask,
+                                        calcRt);
+    ASSERT_TRUE(isComputed);
+    double diff = cv::norm(calcRt, Mat::eye(4, 4, CV_64FC1));
+    if (diff > idError)
+    {
+        FAIL() << "Incorrect transformation between the same frame (not the identity matrix), diff = " << diff << std::endl;
+    }
+}
+
+void OdometryTest::run()
+{
+    Mat K = getCameraMatrix();
+
+    Mat image, depth;
+    readData(image, depth);
+
+    odometry->setCameraMatrix(K);
+
+    Mat calcRt;
+
+    // 1. Try to find Rt between the same frame (try masks also).
+    Mat mask(image.size(), CV_8UC1, Scalar(255));
+    bool isComputed = odometry->compute(image, depth, mask, image, depth, mask, calcRt);
+    if(!isComputed)
+    {
+        FAIL() << "Can not find Rt between the same frame" << std::endl;
+    }
+    double diff = cv::norm(calcRt, Mat::eye(4,4,CV_64FC1));
+    if(diff > idError)
+    {
+        FAIL() << "Incorrect transformation between the same frame (not the identity matrix), diff = " << diff << std::endl;
+    }
+
+    // 2. Generate random rigid body motion in some ranges several times (iterCount).
+    // On each iteration an input frame is warped using generated transformation.
+    // Odometry is run on the following pair: the original frame and the warped one.
+    // Comparing a computed transformation with an applied one we compute 2 errors:
+    // better_1time_count - count of poses which error is less than ground truth pose,
+    // better_5times_count - count of poses which error is 5 times less than ground truth pose.
+    int iterCount = 100;
+    int better_1time_count = 0;
+    int better_5times_count = 0;
+    for(int iter = 0; iter < iterCount; iter++)
+    {
+        Mat rvec, tvec;
+        generateRandomTransformation(rvec, tvec);
+        Mat warpedImage, warpedDepth;
+        warpFrame(image, depth, rvec, tvec, K, warpedImage, warpedDepth);
+        dilateFrame(warpedImage, warpedDepth); // due to inaccuracy after warping
+
+        isComputed = odometry->compute(image, depth, mask, warpedImage, warpedDepth, mask, calcRt);
+        if(!isComputed)
+            continue;
+
+        Mat calcR = calcRt(Rect(0,0,3,3)), calcRvec;
+        cv::Rodrigues(calcR, calcRvec);
+        calcRvec = calcRvec.reshape(rvec.channels(), rvec.rows);
+        Mat calcTvec = calcRt(Rect(3,0,1,3));
+
+#if SHOW_DEBUG_IMAGES
+        imshow("image", image);
+        imshow("warpedImage", warpedImage);
+        Mat resultImage, resultDepth;
+        warpFrame(image, depth, calcRvec, calcTvec, K, resultImage, resultDepth);
+        imshow("resultImage", resultImage);
+        waitKey();
+#endif
+
+        // compare rotation
+        double rdiffnorm = cv::norm(rvec - calcRvec),
+               rnorm = cv::norm(rvec);
+        double tdiffnorm = cv::norm(tvec - calcTvec),
+               tnorm = cv::norm(tvec);
+        if(rdiffnorm < rnorm &&  tdiffnorm < tnorm)
+            better_1time_count++;
+        if(5. * rdiffnorm < rnorm && 5 * tdiffnorm < tnorm)
+            better_5times_count++;
+
+        CV_LOG_INFO(NULL, "Iter " << iter);
+        CV_LOG_INFO(NULL, "rdiffnorm " << rdiffnorm << "; rnorm " << rnorm);
+        CV_LOG_INFO(NULL, "tdiffnorm " << tdiffnorm << "; tnorm " << tnorm);
+
+        CV_LOG_INFO(NULL, "better_1time_count " << better_1time_count << "; better_5time_count " << better_5times_count);
+    }
+
+    if(static_cast<double>(better_1time_count) < maxError1 * static_cast<double>(iterCount))
+    {
+        FAIL() << "Incorrect count of accurate poses [1st case]: "
+            << static_cast<double>(better_1time_count) << " / "
+            << maxError1 * static_cast<double>(iterCount) << std::endl;
+    }
+
+    if(static_cast<double>(better_5times_count) < maxError5 * static_cast<double>(iterCount))
+    {
+        FAIL() << "Incorrect count of accurate poses [2nd case]: "
+            << static_cast<double>(better_5times_count) << " / "
+            << maxError5 * static_cast<double>(iterCount) << std::endl;
+    }
+}
+
+/****************************************************************************************\
+*                                Tests registrations                                     *
+\****************************************************************************************/
+
+TEST(RGBD_Odometry_Rgbd, algorithmic)
+{
+    OdometryTest test(cv::Odometry::createFromName("RgbdOdometry"), 0.99, 0.89);
+    test.run();
+}
+
+TEST(RGBD_Odometry_ICP, algorithmic)
+{
+    OdometryTest test(cv::Odometry::createFromName("ICPOdometry"), 0.99, 0.99);
+    test.run();
+}
+
+TEST(RGBD_Odometry_RgbdICP, algorithmic)
+{
+    OdometryTest test(cv::Odometry::createFromName("RgbdICPOdometry"), 0.99, 0.99);
+    test.run();
+}
+
+TEST(RGBD_Odometry_FastICP, algorithmic)
+{
+    OdometryTest test(cv::Odometry::createFromName("FastICPOdometry"), 0.99, 0.99, FLT_EPSILON);
+    test.run();
+}
+
+TEST(RGBD_Odometry_Rgbd, UMats)
+{
+    OdometryTest test(cv::Odometry::createFromName("RgbdOdometry"), 0.99, 0.89);
+    test.checkUMats();
+}
+
+TEST(RGBD_Odometry_ICP, UMats)
+{
+    OdometryTest test(cv::Odometry::createFromName("ICPOdometry"), 0.99, 0.99);
+    test.checkUMats();
+}
+
+TEST(RGBD_Odometry_RgbdICP, UMats)
+{
+    OdometryTest test(cv::Odometry::createFromName("RgbdICPOdometry"), 0.99, 0.99);
+    test.checkUMats();
+}
+
+TEST(RGBD_Odometry_FastICP, UMats)
+{
+    OdometryTest test(cv::Odometry::createFromName("FastICPOdometry"), 0.99, 0.99, FLT_EPSILON);
+    test.checkUMats();
+}
+
+
+/****************************************************************************************\
+*                                Depth to 3d tests                                       *
+\****************************************************************************************/
+
+TEST(RGBD_DepthTo3d, compute)
+{
+    // K from a VGA Kinect
+    Mat K = OdometryTest::getCameraMatrix();
+
+    // Create a random depth image
+    RNG rng;
+    Mat_<float> depth(480, 640);
+    rng.fill(depth, RNG::UNIFORM, 0, 100);
+
+    // Create some 3d points on the plane
+    int rows = depth.rows, cols = depth.cols;
+    Mat_<Vec3f> points3d;
+    depthTo3d(depth, K, points3d);
+
+    // Make sure the points belong to the plane
+    Mat points = points3d.reshape(1, rows * cols);
+    Mat image_points;
+    Mat rvec;
+    cv::Rodrigues(Mat::eye(3, 3, CV_32F), rvec);
+    Mat tvec = (Mat_<float>(1, 3) << 0, 0, 0);
+    projectPoints(points, rvec, tvec, K, Mat(), image_points);
+    image_points = image_points.reshape(2, rows);
+
+    float avg_diff = 0;
+    for (int y = 0; y < rows; ++y)
+        for (int x = 0; x < cols; ++x)
+            avg_diff += (float)cv::norm(image_points.at<Vec2f>(y, x) - Vec2f((float)x, (float)y));
+
+    // Verify the function works
+    ASSERT_LE(avg_diff / rows / cols, 1e-4) << "Average error for ground truth is: " << (avg_diff / rows / cols);
+}
+
+
+}} // namespace
diff --git a/modules/3d/test/test_pose_graph.cpp b/modules/3d/test/test_pose_graph.cpp
new file mode 100644
index 0000000000..1480cdfd4d
--- /dev/null
+++ b/modules/3d/test/test_pose_graph.cpp
@@ -0,0 +1,152 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "test_precomp.hpp"
+#include <opencv2/3d/detail/pose_graph.hpp>
+
+namespace opencv_test { namespace {
+
+using namespace cv;
+
+static Affine3d readAffine(std::istream& input)
+{
+    Vec3d p;
+    Vec4d q;
+    input >> p[0] >> p[1] >> p[2];
+    input >> q[1] >> q[2] >> q[3] >> q[0];
+    // Normalize the quaternion to account for precision loss due to
+    // serialization.
+    return Affine3d(Quatd(q).toRotMat3x3(), p);
+};
+
+// Rewritten from Ceres pose graph demo: https://ceres-solver.org/
+static Ptr<detail::PoseGraph> readG2OFile(const std::string& g2oFileName)
+{
+    Ptr<detail::PoseGraph> pg = detail::PoseGraph::create();
+
+    // for debugging purposes
+    size_t minId = 0, maxId = 1 << 30;
+
+    std::ifstream infile(g2oFileName.c_str());
+    if (!infile)
+    {
+        CV_Error(cv::Error::StsError, "failed to open file");
+    }
+
+    while (infile.good())
+    {
+        std::string data_type;
+        // Read whether the type is a node or a constraint
+        infile >> data_type;
+        if (data_type == "VERTEX_SE3:QUAT")
+        {
+            size_t id;
+            infile >> id;
+            Affine3d pose = readAffine(infile);
+
+            if (id < minId || id >= maxId)
+                continue;
+
+            bool fixed = (id == minId);
+
+            // Ensure we don't have duplicate poses
+            if (pg->isNodeExist(id))
+            {
+                CV_LOG_INFO(NULL, "duplicated node, id=" << id);
+            }
+            pg->addNode(id, pose, fixed);
+        }
+        else if (data_type == "EDGE_SE3:QUAT")
+        {
+            size_t startId, endId;
+            infile >> startId >> endId;
+            Affine3d pose = readAffine(infile);
+
+            Matx66d info;
+            for (int i = 0; i < 6 && infile.good(); ++i)
+            {
+                for (int j = i; j < 6 && infile.good(); ++j)
+                {
+                    infile >> info(i, j);
+                    if (i != j)
+                    {
+                        info(j, i) = info(i, j);
+                    }
+                }
+            }
+
+            if ((startId >= minId && startId < maxId) && (endId >= minId && endId < maxId))
+            {
+                pg->addEdge(startId, endId, pose, info);
+            }
+        }
+        else
+        {
+            CV_Error(cv::Error::StsError, "unknown tag");
+        }
+
+        // Clear any trailing whitespace from the line
+        infile >> std::ws;
+    }
+
+    return pg;
+}
+
+
+TEST( PoseGraph, sphereG2O )
+{
+    // Test takes 15+ sec in Release mode and 400+ sec in Debug mode
+    applyTestTag(CV_TEST_TAG_LONG, CV_TEST_TAG_DEBUG_VERYLONG);
+
+    // The dataset was taken from here: https://lucacarlone.mit.edu/datasets/
+    // Connected paper:
+    // L.Carlone, R.Tron, K.Daniilidis, and F.Dellaert.
+    // Initialization Techniques for 3D SLAM : a Survey on Rotation Estimation and its Use in Pose Graph Optimization.
+    // In IEEE Intl.Conf.on Robotics and Automation(ICRA), pages 4597 - 4604, 2015.
+
+    std::string filename = cvtest::TS::ptr()->get_data_path() + "/cv/rgbd/sphere_bignoise_vertex3.g2o";
+    Ptr<detail::PoseGraph> pg = readG2OFile(filename);
+
+#ifdef HAVE_EIGEN
+    // You may change logging level to view detailed optimization report
+    // For example, set env. variable like this: OPENCV_LOG_LEVEL=INFO
+
+    int iters = pg->optimize();
+
+    ASSERT_GE(iters, 0);
+    ASSERT_LE(iters, 36); // should converge in 36 iterations
+
+    double energy = pg->calcEnergy();
+
+    ASSERT_LE(energy, 1.47723e+06); // should converge to 1.47722e+06 or less
+
+    // Add the "--test_debug" to arguments to see resulting pose graph nodes positions
+    if (cvtest::debugLevel > 0)
+    {
+        // Write edge-only model of how nodes are located in space
+        std::string fname = "pgout.obj";
+        std::fstream of(fname, std::fstream::out);
+        std::vector<size_t> ids = pg->getNodesIds();
+        for (const size_t& id : ids)
+        {
+            Point3d d = pg->getNodePose(id).translation();
+            of << "v " << d.x << " " << d.y << " " << d.z << std::endl;
+        }
+
+        size_t esz = pg->getNumEdges();
+        for (size_t i = 0; i < esz; i++)
+        {
+            size_t sid = pg->getEdgeStart(i), tid = pg->getEdgeEnd(i);
+            of << "l " << sid + 1 << " " << tid + 1 << std::endl;
+        }
+
+        of.close();
+    }
+#else
+    throw SkipTestException("Build with Eigen required for pose graph optimization");
+#endif
+}
+
+
+}} // namespace
diff --git a/modules/3d/test/test_precomp.hpp b/modules/3d/test/test_precomp.hpp
index b60e846160..6fa258d7d0 100644
--- a/modules/3d/test/test_precomp.hpp
+++ b/modules/3d/test/test_precomp.hpp
@@ -9,6 +9,11 @@
 
 #include "opencv2/ts.hpp"
 #include "opencv2/3d.hpp"
+#include <opencv2/core/utils/logger.hpp>
+
+#ifdef HAVE_OPENCL
+#include <opencv2/core/ocl.hpp>
+#endif
 
 namespace cvtest
 {
diff --git a/modules/3d/test/test_registration.cpp b/modules/3d/test/test_registration.cpp
new file mode 100644
index 0000000000..8d393c6d83
--- /dev/null
+++ b/modules/3d/test/test_registration.cpp
@@ -0,0 +1,107 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This code is also subject to the license terms in the LICENSE_WillowGarage.md file found in this module's directory
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+class RgbdDepthRegistrationTest
+{
+public:
+    RgbdDepthRegistrationTest() { }
+    ~RgbdDepthRegistrationTest() { }
+
+    void run()
+    {
+        // Test all three input types for no-op registrations (where a depth image is registered to itself)
+        noOpRandomRegistrationTest<unsigned short>(100, 2500);
+        noOpRandomRegistrationTest<float>(0.1f, 2.5f);
+        noOpRandomRegistrationTest<double>(0.1, 2.5);
+
+        // Test sentinel value handling, occlusion, and dilation
+        {
+            // K from a VGA Kinect
+            Mat K = (Mat_<float>(3, 3) << 525., 0., 319.5, 0., 525., 239.5, 0., 0., 1.);
+
+            int width = 640, height = 480;
+
+            // All elements are zero except for first two along the diagonal
+            Mat_<unsigned short> vgaDepth(height, width, (unsigned short)0);
+            vgaDepth(0, 0) = 1001;
+            vgaDepth(1, 1) = 1000;
+
+            Mat_<unsigned short> registeredDepth;
+            registerDepth(K, K, Mat(), Matx44f::eye(), vgaDepth, Size(width, height), registeredDepth, true);
+
+            // We expect the closer depth of 1000 to occlude the more distant depth and occupy the
+            // upper four left pixels in the depth image because of dilation
+            Mat_<unsigned short> expectedResult(height, width, (unsigned short)0);
+            expectedResult(0, 0) = 1000;
+            expectedResult(0, 1) = 1000;
+            expectedResult(1, 0) = 1000;
+            expectedResult(1, 1) = 1000;
+
+            Mat ad;
+            absdiff(registeredDepth, expectedResult, ad);
+            ASSERT_GT(std::numeric_limits<double>::min(), abs(sum(ad)[0])) << "Dilation and occlusion";
+        }
+    }
+
+    template <class DepthDepth>
+    void noOpRandomRegistrationTest(DepthDepth minDepth, DepthDepth maxDepth)
+    {
+        // K from a VGA Kinect
+        Mat K = (Mat_<float>(3, 3) << 525., 0., 319.5, 0., 525., 239.5, 0., 0., 1.);
+
+        // Create a random depth image
+        RNG rng;
+        Mat_<DepthDepth> randomVGADepth(480, 640);
+        rng.fill(randomVGADepth, RNG::UNIFORM, minDepth, maxDepth);
+
+        Mat registeredDepth;
+        registerDepth(K, K, Mat(), Matx44f::eye(), randomVGADepth, Size(640, 480), registeredDepth);
+
+        // Check per-pixel relative difference
+
+        Mat ad;
+        absdiff(registeredDepth, randomVGADepth, ad);
+        Mat mmin;
+        cv::min(registeredDepth, randomVGADepth, mmin);
+        Mat rel = ad / mmin;
+        double maxDiff = cv::norm(rel, NORM_INF);
+        ASSERT_GT(1e-07, maxDiff) << "No-op registration";
+    }
+};
+
+TEST(RGBD_DepthRegistration, compute)
+{
+    RgbdDepthRegistrationTest test;
+    test.run();
+}
+
+TEST(RGBD_DepthRegistration, issue_2234)
+{
+    Matx33f intrinsicsDepth(100, 0,  50, 0, 100, 50, 0, 0, 1);
+    Matx33f intrinsicsColor(100, 0, 200, 0, 100, 50, 0, 0, 1);
+
+    Mat_<float> depthMat(100, 100, (float)0.);
+    for(int i = 1; i <= 100; i++)
+    {
+        for(int j = 1; j <= 100; j++)
+            depthMat(i-1,j-1) = (float)j;
+    }
+
+    Mat registeredDepth;
+    registerDepth(intrinsicsDepth, intrinsicsColor, Mat(), Matx44f::eye(), depthMat, Size(400, 100), registeredDepth);
+
+    Rect roi( 150, 0, 100, 100 );
+    Mat subM(registeredDepth,roi);
+
+    EXPECT_EQ(0, cvtest::norm(subM, depthMat, NORM_INF));
+}
+
+
+}} // namespace
diff --git a/modules/3d/test/test_tsdf.cpp b/modules/3d/test/test_tsdf.cpp
new file mode 100644
index 0000000000..91fc45355b
--- /dev/null
+++ b/modules/3d/test/test_tsdf.cpp
@@ -0,0 +1,572 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "test_precomp.hpp"
+
+namespace opencv_test {
+namespace {
+
+using namespace cv;
+
+/** Reprojects screen point to camera space given z coord. */
+struct Reprojector
+{
+    Reprojector() {}
+    inline Reprojector(Matx33f intr)
+    {
+        fxinv = 1.f / intr(0, 0), fyinv = 1.f / intr(1, 1);
+        cx = intr(0, 2), cy = intr(1, 2);
+    }
+    template<typename T>
+    inline cv::Point3_<T> operator()(cv::Point3_<T> p) const
+    {
+        T x = p.z * (p.x - cx) * fxinv;
+        T y = p.z * (p.y - cy) * fyinv;
+        return cv::Point3_<T>(x, y, p.z);
+    }
+
+    float fxinv, fyinv, cx, cy;
+};
+
+template<class Scene>
+struct RenderInvoker : ParallelLoopBody
+{
+    RenderInvoker(Mat_<float>& _frame, Affine3f _pose,
+        Reprojector _reproj, float _depthFactor, bool _onlySemisphere)
+        : ParallelLoopBody(),
+        frame(_frame),
+        pose(_pose),
+        reproj(_reproj),
+        depthFactor(_depthFactor),
+        onlySemisphere(_onlySemisphere)
+    { }
+
+    virtual void operator ()(const cv::Range& r) const
+    {
+        for (int y = r.start; y < r.end; y++)
+        {
+            float* frameRow = frame[y];
+            for (int x = 0; x < frame.cols; x++)
+            {
+                float pix = 0;
+
+                Point3f orig = pose.translation();
+                // direction through pixel
+                Point3f screenVec = reproj(Point3f((float)x, (float)y, 1.f));
+                float xyt = 1.f / (screenVec.x * screenVec.x +
+                    screenVec.y * screenVec.y + 1.f);
+                Point3f dir = normalize(Vec3f(pose.rotation() * screenVec));
+                // screen space axis
+                dir.y = -dir.y;
+
+                const float maxDepth = 20.f;
+                const float maxSteps = 256;
+                float t = 0.f;
+                for (int step = 0; step < maxSteps && t < maxDepth; step++)
+                {
+                    Point3f p = orig + dir * t;
+                    float d = Scene::map(p, onlySemisphere);
+                    if (d < 0.000001f)
+                    {
+                        float depth = std::sqrt(t * t * xyt);
+                        pix = depth * depthFactor;
+                        break;
+                    }
+                    t += d;
+                }
+
+                frameRow[x] = pix;
+            }
+        }
+    }
+
+    Mat_<float>& frame;
+    Affine3f pose;
+    Reprojector reproj;
+    float depthFactor;
+    bool onlySemisphere;
+};
+
+struct Scene
+{
+    virtual ~Scene() {}
+    static Ptr<Scene> create(Size sz, Matx33f _intr, float _depthFactor, bool onlySemisphere);
+    virtual Mat depth(Affine3f pose) = 0;
+    virtual std::vector<Affine3f> getPoses() = 0;
+};
+
+struct SemisphereScene : Scene
+{
+    const int framesPerCycle = 72;
+    const float nCycles = 0.25f;
+    const Affine3f startPose = Affine3f(Vec3f(0.f, 0.f, 0.f), Vec3f(1.5f, 0.3f, -2.1f));
+
+    Size frameSize;
+    Matx33f intr;
+    float depthFactor;
+    bool onlySemisphere;
+
+    SemisphereScene(Size sz, Matx33f _intr, float _depthFactor, bool _onlySemisphere) :
+        frameSize(sz), intr(_intr), depthFactor(_depthFactor), onlySemisphere(_onlySemisphere)
+    { }
+
+    static float map(Point3f p, bool onlySemisphere)
+    {
+        float plane = p.y + 0.5f;
+        Point3f spherePose = p - Point3f(-0.0f, 0.3f, 1.1f);
+        float sphereRadius = 0.5f;
+        float sphere = (float)cv::norm(spherePose) - sphereRadius;
+        float sphereMinusBox = sphere;
+
+        float subSphereRadius = 0.05f;
+        Point3f subSpherePose = p - Point3f(0.3f, -0.1f, -0.3f);
+        float subSphere = (float)cv::norm(subSpherePose) - subSphereRadius;
+
+        float res;
+        if (!onlySemisphere)
+            res = min({ sphereMinusBox, subSphere, plane });
+        else
+            res = sphereMinusBox;
+
+        return res;
+    }
+
+    Mat depth(Affine3f pose) override
+    {
+        Mat_<float> frame(frameSize);
+        Reprojector reproj(intr);
+
+        Range range(0, frame.rows);
+        parallel_for_(range, RenderInvoker<SemisphereScene>(frame, pose, reproj, depthFactor, onlySemisphere));
+
+        return std::move(frame);
+    }
+
+    std::vector<Affine3f> getPoses() override
+    {
+        std::vector<Affine3f> poses;
+        for (int i = 0; i < framesPerCycle * nCycles; i++)
+        {
+            float angle = (float)(CV_2PI * i / framesPerCycle);
+            Affine3f pose;
+            pose = pose.rotate(startPose.rotation());
+            pose = pose.rotate(Vec3f(0.f, -0.5f, 0.f) * angle);
+            pose = pose.translate(Vec3f(startPose.translation()[0] * sin(angle),
+                startPose.translation()[1],
+                startPose.translation()[2] * cos(angle)));
+            poses.push_back(pose);
+        }
+
+        return poses;
+    }
+
+};
+
+Ptr<Scene> Scene::create(Size sz, Matx33f _intr, float _depthFactor, bool _onlySemisphere)
+{
+    return makePtr<SemisphereScene>(sz, _intr, _depthFactor, _onlySemisphere);
+}
+
+// this is a temporary solution
+// ----------------------------
+
+typedef cv::Vec4f ptype;
+typedef cv::Mat_< ptype > Points;
+typedef Points Normals;
+typedef Size2i Size;
+
+template<int p>
+inline float specPow(float x)
+{
+    if (p % 2 == 0)
+    {
+        float v = specPow<p / 2>(x);
+        return v * v;
+    }
+    else
+    {
+        float v = specPow<(p - 1) / 2>(x);
+        return v * v * x;
+    }
+}
+
+template<>
+inline float specPow<0>(float /*x*/)
+{
+    return 1.f;
+}
+
+template<>
+inline float specPow<1>(float x)
+{
+    return x;
+}
+
+inline cv::Vec3f fromPtype(const ptype& x)
+{
+    return cv::Vec3f(x[0], x[1], x[2]);
+}
+
+inline Point3f normalize(const Vec3f& v)
+{
+    double nv = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+    return v * (nv ? 1. / nv : 0.);
+}
+
+void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray image, Affine3f lightPose)
+{
+    Size sz = _points.size();
+    image.create(sz, CV_8UC4);
+
+    Points  points = _points.getMat();
+    Normals normals = _normals.getMat();
+
+    Mat_<Vec4b> img = image.getMat();
+
+    Range range(0, sz.height);
+    const int nstripes = -1;
+    parallel_for_(range, [&](const Range&)
+        {
+            for (int y = range.start; y < range.end; y++)
+            {
+                Vec4b* imgRow = img[y];
+                const ptype* ptsRow = points[y];
+                const ptype* nrmRow = normals[y];
+
+                for (int x = 0; x < sz.width; x++)
+                {
+                    Point3f p = fromPtype(ptsRow[x]);
+                    Point3f n = fromPtype(nrmRow[x]);
+
+                    Vec4b color;
+
+                    if (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z))
+                    {
+                        color = Vec4b(0, 32, 0, 0);
+                    }
+                    else
+                    {
+                        const float Ka = 0.3f;  //ambient coeff
+                        const float Kd = 0.5f;  //diffuse coeff
+                        const float Ks = 0.2f;  //specular coeff
+                        const int   sp = 20;  //specular power
+
+                        const float Ax = 1.f;   //ambient color,  can be RGB
+                        const float Dx = 1.f;   //diffuse color,  can be RGB
+                        const float Sx = 1.f;   //specular color, can be RGB
+                        const float Lx = 1.f;   //light color
+
+                        Point3f l = normalize(lightPose.translation() - Vec3f(p));
+                        Point3f v = normalize(-Vec3f(p));
+                        Point3f r = normalize(Vec3f(2.f * n * n.dot(l) - l));
+
+                        uchar ix = (uchar)((Ax * Ka * Dx + Lx * Kd * Dx * max(0.f, n.dot(l)) +
+                            Lx * Ks * Sx * specPow<sp>(max(0.f, r.dot(v)))) * 255.f);
+                        color = Vec4b(ix, ix, ix, 0);
+                    }
+
+                    imgRow[x] = color;
+                }
+            }
+        }, nstripes);
+}
+// ----------------------------
+
+static const bool display = false;
+static const bool parallelCheck = false;
+
+class Settings
+{
+public:
+    float depthFactor;
+    Matx33f intr;
+    Size frameSize;
+    Vec3f lightPose;
+
+    Ptr<Volume> volume;
+    Ptr<Scene> scene;
+    std::vector<Affine3f> poses;
+
+    Settings(bool useHashTSDF, bool onlySemisphere)
+    {
+        frameSize = Size(640, 480);
+
+        float fx, fy, cx, cy;
+        fx = fy = 525.f;
+        cx = frameSize.width / 2 - 0.5f;
+        cy = frameSize.height / 2 - 0.5f;
+        intr = Matx33f(fx,  0, cx,
+                        0, fy, cy,
+                        0,  0,  1);
+
+        // 5000 for the 16-bit PNG files
+        // 1 for the 32-bit float images in the ROS bag files
+        depthFactor = 5000;
+
+        Vec3i volumeDims = Vec3i::all(512); //number of voxels
+
+        float volSize = 3.f;
+        float voxelSize = volSize / 512.f; //meters
+
+        // default pose of volume cube
+        Affine3f volumePose = Affine3f().translate(Vec3f(-volSize / 2.f, -volSize / 2.f, 0.5f));
+        float tsdf_trunc_dist = 7 * voxelSize; // about 0.04f in meters
+        int tsdf_max_weight = 64;   //frames
+
+        float raycast_step_factor = 0.25f;  //in voxel sizes
+        // gradient delta factor is fixed at 1.0f and is not used
+        //p.gradient_delta_factor = 0.5f; //in voxel sizes
+
+        //p.lightPose = p.volume_pose.translation()/4; //meters
+        lightPose = Vec3f::all(0.f); //meters
+
+        // depth truncation is not used by default but can be useful in some scenes
+        float truncateThreshold = 0.f; //meters
+
+        VolumeParams::VolumeKind volumeKind = VolumeParams::VolumeKind::TSDF;
+
+        if (useHashTSDF)
+        {
+            volumeKind = VolumeParams::VolumeKind::HASHTSDF;
+            truncateThreshold = 4.f;
+        }
+        else
+        {
+            volSize = 3.f;
+            volumeDims = Vec3i::all(128); //number of voxels
+            voxelSize = volSize / 128.f;
+            tsdf_trunc_dist = 2 * voxelSize; // 0.04f in meters
+
+            raycast_step_factor = 0.75f;  //in voxel sizes
+        }
+
+        volume = makeVolume(volumeKind, voxelSize, volumePose.matrix,
+                            raycast_step_factor, tsdf_trunc_dist, tsdf_max_weight,
+                            truncateThreshold, volumeDims[0], volumeDims[1], volumeDims[2]);
+
+        scene = Scene::create(frameSize, intr, depthFactor, onlySemisphere);
+        poses = scene->getPoses();
+    }
+};
+
+void displayImage(Mat depth, Mat points, Mat normals, float depthFactor, Vec3f lightPose)
+{
+    Mat image;
+    patchNaNs(points);
+    imshow("depth", depth * (1.f / depthFactor / 4.f));
+    renderPointsNormals(points, normals, image, lightPose);
+    imshow("render", image);
+    waitKey(2000);
+}
+
+void normalsCheck(Mat normals)
+{
+    Vec4f vector;
+    for (auto pvector = normals.begin<Vec4f>(); pvector < normals.end<Vec4f>(); pvector++)
+    {
+        vector = *pvector;
+        if (!cvIsNaN(vector[0]))
+        {
+            float length = vector[0] * vector[0] +
+                vector[1] * vector[1] +
+                vector[2] * vector[2];
+            ASSERT_LT(abs(1 - length), 0.0001f) << "There is normal with length != 1";
+        }
+    }
+}
+
+int counterOfValid(Mat points)
+{
+    Vec4f* v;
+    int i, j;
+    int count = 0;
+    for (i = 0; i < points.rows; ++i)
+    {
+        v = (points.ptr<Vec4f>(i));
+        for (j = 0; j < points.cols; ++j)
+        {
+            if ((v[j])[0] != 0 ||
+                (v[j])[1] != 0 ||
+                (v[j])[2] != 0)
+            {
+                count++;
+            }
+        }
+    }
+    return count;
+}
+
+void normal_test(bool isHashTSDF, bool isRaycast, bool isFetchPointsNormals, bool isFetchNormals)
+{
+    auto normalCheck = [](Vec4f& vector, const int*)
+    {
+        if (!cvIsNaN(vector[0]))
+        {
+            float length = vector[0] * vector[0] +
+                vector[1] * vector[1] +
+                vector[2] * vector[2];
+            ASSERT_LT(abs(1 - length), 0.0001f) << "There is normal with length != 1";
+        }
+    };
+
+    Settings settings(isHashTSDF, false);
+
+    Mat depth = settings.scene->depth(settings.poses[0]);
+    UMat _points, _normals, _tmpnormals;
+    UMat _newPoints, _newNormals;
+    Mat  points, normals;
+    AccessFlag af = ACCESS_READ;
+
+    settings.volume->integrate(depth, settings.depthFactor, settings.poses[0].matrix, settings.intr);
+
+    if (isRaycast)
+    {
+        settings.volume->raycast(settings.poses[0].matrix, settings.intr, settings.frameSize, _points, _normals);
+    }
+    if (isFetchPointsNormals)
+    {
+        settings.volume->fetchPointsNormals(_points, _normals);
+    }
+    if (isFetchNormals)
+    {
+        settings.volume->fetchPointsNormals(_points, _tmpnormals);
+        settings.volume->fetchNormals(_points, _normals);
+    }
+
+    normals = _normals.getMat(af);
+    points = _points.getMat(af);
+
+    if (parallelCheck)
+        normals.forEach<Vec4f>(normalCheck);
+    else
+        normalsCheck(normals);
+
+    if (isRaycast && display)
+        displayImage(depth, points, normals, settings.depthFactor, settings.lightPose);
+
+    if (isRaycast)
+    {
+        settings.volume->raycast(settings.poses[17].matrix, settings.intr, settings.frameSize, _newPoints, _newNormals);
+        normals = _newNormals.getMat(af);
+        points = _newPoints.getMat(af);
+        normalsCheck(normals);
+
+        if (parallelCheck)
+            normals.forEach<Vec4f>(normalCheck);
+        else
+            normalsCheck(normals);
+
+        if (display)
+            displayImage(depth, points, normals, settings.depthFactor, settings.lightPose);
+    }
+
+    points.release(); normals.release();
+}
+
+void valid_points_test(bool isHashTSDF)
+{
+    Settings settings(isHashTSDF, true);
+
+    Mat depth = settings.scene->depth(settings.poses[0]);
+    UMat _points, _normals, _newPoints, _newNormals;
+    AccessFlag af = ACCESS_READ;
+    Mat  points, normals;
+    int anfas, profile;
+
+    settings.volume->integrate(depth, settings.depthFactor, settings.poses[0].matrix, settings.intr);
+    settings.volume->raycast(settings.poses[0].matrix, settings.intr, settings.frameSize, _points, _normals);
+    normals = _normals.getMat(af);
+    points = _points.getMat(af);
+    patchNaNs(points);
+    anfas = counterOfValid(points);
+
+    if (display)
+        displayImage(depth, points, normals, settings.depthFactor, settings.lightPose);
+
+    settings.volume->raycast(settings.poses[17].matrix, settings.intr, settings.frameSize, _newPoints, _newNormals);
+    normals = _newNormals.getMat(af);
+    points = _newPoints.getMat(af);
+    patchNaNs(points);
+    profile = counterOfValid(points);
+
+    if (display)
+        displayImage(depth, points, normals, settings.depthFactor, settings.lightPose);
+
+    // TODO: why profile == 2*anfas ?
+    float percentValidity = float(anfas) / float(profile);
+
+    ASSERT_NE(profile, 0) << "There is no points in profile";
+    ASSERT_NE(anfas, 0) << "There is no points in anfas";
+    ASSERT_LT(abs(0.5 - percentValidity), 0.3) << "percentValidity out of [0.3; 0.7] (percentValidity=" << percentValidity << ")";
+}
+
+#ifndef HAVE_OPENCL
+TEST(TSDF, raycast_normals) { normal_test(false, true, false, false); }
+TEST(TSDF, fetch_points_normals) { normal_test(false, false, true, false); }
+TEST(TSDF, fetch_normals) { normal_test(false, false, false, true); }
+TEST(TSDF, valid_points) { valid_points_test(false); }
+
+TEST(HashTSDF, raycast_normals) { normal_test(true, true, false, false); }
+TEST(HashTSDF, fetch_points_normals) { normal_test(true, false, true, false); }
+TEST(HashTSDF, fetch_normals) { normal_test(true, false, false, true); }
+TEST(HashTSDF, valid_points) { valid_points_test(true); }
+#else
+TEST(TSDF_CPU, raycast_normals)
+{
+    cv::ocl::setUseOpenCL(false);
+    normal_test(false, true, false, false);
+    cv::ocl::setUseOpenCL(true);
+}
+
+TEST(TSDF_CPU, fetch_points_normals)
+{
+    cv::ocl::setUseOpenCL(false);
+    normal_test(false, false, true, false);
+    cv::ocl::setUseOpenCL(true);
+}
+
+TEST(TSDF_CPU, fetch_normals)
+{
+    cv::ocl::setUseOpenCL(false);
+    normal_test(false, false, false, true);
+    cv::ocl::setUseOpenCL(true);
+}
+
+TEST(TSDF_CPU, valid_points)
+{
+    cv::ocl::setUseOpenCL(false);
+    valid_points_test(false);
+    cv::ocl::setUseOpenCL(true);
+}
+
+TEST(HashTSDF_CPU, raycast_normals)
+{
+    cv::ocl::setUseOpenCL(false);
+    normal_test(true, true, false, false);
+    cv::ocl::setUseOpenCL(true);
+}
+
+TEST(HashTSDF_CPU, fetch_points_normals)
+{
+    cv::ocl::setUseOpenCL(false);
+    normal_test(true, false, true, false);
+    cv::ocl::setUseOpenCL(true);
+}
+
+TEST(HashTSDF_CPU, fetch_normals)
+{
+    cv::ocl::setUseOpenCL(false);
+    normal_test(true, false, false, true);
+    cv::ocl::setUseOpenCL(true);
+}
+
+TEST(HashTSDF_CPU, valid_points)
+{
+    cv::ocl::setUseOpenCL(false);
+    valid_points_test(true);
+    cv::ocl::setUseOpenCL(true);
+}
+#endif
+}
+}  // namespace
diff --git a/modules/calib/src/chessboard.cpp b/modules/calib/src/chessboard.cpp
index 941489a792..53ace9df2b 100644
--- a/modules/calib/src/chessboard.cpp
+++ b/modules/calib/src/chessboard.cpp
@@ -23,7 +23,7 @@ namespace details {
 /////////////////////////////////////////////////////////////////////////////
 static const float CORNERS_SEARCH = 0.5F;                       // percentage of the edge length to the next corner used to find new corners
 static const float MAX_ANGLE = float(48.0/180.0*CV_PI);          // max angle between line segments supposed to be straight
-static const float MIN_COS_ANGLE = float(cos(35.0/180*CV_PI));   // min cos angle between board edges
+static const float MIN_COS_ANGLE = float(std::cos(35.0/180*CV_PI));   // min cos angle between board edges
 static const float MIN_RESPONSE_RATIO = 0.1F;
 static const float ELLIPSE_WIDTH = 0.35F;                       // width of the search ellipse in percentage of its length
 static const float RAD2DEG = float(180.0/CV_PI);
@@ -303,7 +303,7 @@ int testPointSymmetry(const cv::Mat &mat,cv::Point2f pt,float dist,float max_err
     for (int angle_i = 0; angle_i < 10; angle_i++)
     {
         double angle = angle_i * (CV_PI * 0.1);
-        cv::Point2f n(float(cos(angle)),float(-sin(angle)));
+        cv::Point2f n(float(std::cos(angle)),float(-std::sin(angle)));
         center1 = pt+dist*n;
         if(!image_rect.contains(center1))
             return false;
@@ -792,8 +792,8 @@ Ellipse::Ellipse(const cv::Point2f &_center, const cv::Size2f &_axes, float _ang
     center(_center),
     axes(_axes),
     angle(_angle),
-    cosf(cos(-_angle)),
-    sinf(sin(-_angle))
+    cosf(std::cos(-_angle)),
+    sinf(std::sin(-_angle))
 {
 }
 
@@ -1296,18 +1296,18 @@ float Chessboard::Board::getAngle()const
         cv::Point3f ptx(1,0,0);
         val = float(ptx.dot(pt)/cv::norm(pt));
         if(val < 0)
-            val = -acos(val);
+            val = -std::acos(val);
         else
-            val = acos(val);
+            val = std::acos(val);
     }
     else
     {
         cv::Point3f ptx(0,1,0);
         val = float(ptx.dot(pt)/cv::norm(pt));
         if(val < 0)
-            val = float(-acos(val)+CV_PI/2);
+            val = float(-std::acos(val)+CV_PI/2);
         else
-            val = float(acos(val)+CV_PI/2);
+            val = float(std::acos(val)+CV_PI/2);
     }
     return val;
 }
@@ -1942,8 +1942,8 @@ bool Chessboard::Board::init(const std::vector<cv::Point2f> points)
     // set initial cell colors
     Point2f pt1 = *(cells[0]->top_right)-*(cells[0]->bottom_left);
     pt1 /= cv::norm(pt1);
-    cv::Point2f pt2(cos(white_angle),-sin(white_angle));
-    cv::Point2f pt3(cos(black_angle),-sin(black_angle));
+    cv::Point2f pt2(std::cos(white_angle),-std::sin(white_angle));
+    cv::Point2f pt3(std::cos(black_angle),-std::sin(black_angle));
     if(fabs(pt1.dot(pt2)) < fabs(pt1.dot(pt3)))
     {
         cells[0]->black = false;
@@ -2071,7 +2071,7 @@ Ellipse Chessboard::Board::estimateSearchArea(cv::Mat _H,int row, int col,float
     cv::Point2f p02(pt2-pt);
     float norm1 = float(cv::norm(p01));
     float norm2 = float(cv::norm(p02));
-    float angle = float(acos(p01.dot(p02)/norm1/norm2));
+    float angle = float(std::acos(p01.dot(p02)/norm1/norm2));
     cv::Size2f axes(norm1,norm2);
     return Ellipse(pt,axes,angle);
 }
@@ -2095,7 +2095,7 @@ bool Chessboard::Board::estimateSearchArea(const cv::Point2f &p1,const cv::Point
     }
     float norm = float(cv::norm(n));
     n = n/norm;
-    float angle = acos(n.x);
+    float angle = std::acos(n.x);
     if(n.y > 0)
         angle = float(2.0F*CV_PI-angle);
     n = p4-p3;
@@ -3540,8 +3540,8 @@ Chessboard::BState Chessboard::generateBoards(cv::flann::Index &flann_index,cons
 
     // use angles to filter out points
     std::vector<cv::KeyPoint> points;
-    cv::Vec2f n1(cos(white_angle),-sin(white_angle));
-    cv::Vec2f n2(cos(black_angle),-sin(black_angle));
+    cv::Vec2f n1(std::cos(white_angle),-std::sin(white_angle));
+    cv::Vec2f n2(std::cos(black_angle),-std::sin(black_angle));
     std::vector<cv::KeyPoint>::const_iterator iter1 = kpoints.begin()+1; // first point is center
     for(;iter1 != kpoints.end();++iter1)
     {
diff --git a/modules/calib/src/distortion_model.hpp b/modules/calib/src/distortion_model.hpp
index f84cbc6597..308da01e7f 100644
--- a/modules/calib/src/distortion_model.hpp
+++ b/modules/calib/src/distortion_model.hpp
@@ -79,10 +79,10 @@ void computeTiltProjectionMatrix(FLOAT tauX,
     Matx<FLOAT, 3, 3>* dMatTiltdTauY = 0,
     Matx<FLOAT, 3, 3>* invMatTilt = 0)
 {
-    FLOAT cTauX = cos(tauX);
-    FLOAT sTauX = sin(tauX);
-    FLOAT cTauY = cos(tauY);
-    FLOAT sTauY = sin(tauY);
+    FLOAT cTauX = std::cos(tauX);
+    FLOAT sTauX = std::sin(tauX);
+    FLOAT cTauY = std::cos(tauY);
+    FLOAT sTauY = std::sin(tauY);
     Matx<FLOAT, 3, 3> matRotX = Matx<FLOAT, 3, 3>(1,0,0,0,cTauX,sTauX,0,-sTauX,cTauX);
     Matx<FLOAT, 3, 3> matRotY = Matx<FLOAT, 3, 3>(cTauY,0,-sTauY,0,1,0,sTauY,0,cTauY);
     Matx<FLOAT, 3, 3> matRotXY = matRotY * matRotX;
diff --git a/modules/calib/src/fisheye.cpp b/modules/calib/src/fisheye.cpp
index 2d79b8fa94..e850d74dfd 100644
--- a/modules/calib/src/fisheye.cpp
+++ b/modules/calib/src/fisheye.cpp
@@ -137,7 +137,7 @@ void cv::fisheye::projectPoints(InputArray objectPoints, OutputArray imagePoints
         double r = std::sqrt(r2);
 
         // Angle of the incoming ray:
-        double theta = atan(r);
+        double theta = std::atan(r);
 
         double theta2 = theta*theta, theta3 = theta2*theta, theta4 = theta2*theta2, theta5 = theta4*theta,
                 theta6 = theta3*theta3, theta7 = theta6*theta, theta8 = theta4*theta4, theta9 = theta8*theta;
@@ -296,7 +296,7 @@ void cv::fisheye::distortPoints(InputArray undistorted, OutputArray distorted, I
         double r = std::sqrt(r2);
 
         // Angle of the incoming ray:
-        double theta = atan(r);
+        double theta = std::atan(r);
 
         double theta2 = theta*theta, theta3 = theta2*theta, theta4 = theta2*theta2, theta5 = theta4*theta,
                 theta6 = theta3*theta3, theta7 = theta6*theta, theta8 = theta4*theta4, theta9 = theta8*theta;
@@ -526,7 +526,7 @@ void cv::fisheye::initUndistortRectifyMap( InputArray K, InputArray D, InputArra
                 double x = _x/_w, y = _y/_w;
 
                 double r = sqrt(x*x + y*y);
-                double theta = atan(r);
+                double theta = std::atan(r);
 
                 double theta2 = theta*theta, theta4 = theta2*theta2, theta6 = theta4*theta2, theta8 = theta4*theta4;
                 double theta_d = theta * (1 + k[0]*theta2 + k[1]*theta4 + k[2]*theta6 + k[3]*theta8);
@@ -688,7 +688,7 @@ void cv::fisheye::stereoRectify( InputArray K1, InputArray D1, InputArray K2, In
     Vec3d ww = t.cross(uu);
     double nw = norm(ww);
     if (nw > 0.0)
-        ww *= acos(fabs(t[0])/cv::norm(t))/nw;
+        ww *= std::acos(fabs(t[0])/cv::norm(t))/nw;
 
     Matx33d wr;
     Rodrigues(ww, wr);
diff --git a/modules/calib/src/quadsubpix.cpp b/modules/calib/src/quadsubpix.cpp
index 90429815b2..949b544efe 100644
--- a/modules/calib/src/quadsubpix.cpp
+++ b/modules/calib/src/quadsubpix.cpp
@@ -228,7 +228,7 @@ bool find4QuadCornerSubpix(InputArray _img, InputOutputArray _corners, Size regi
         Point2f dir1 = quad_corners[1] - quad_corners[0];
         Point2f origin2 = quad_corners[2];
         Point2f dir2 = quad_corners[3] - quad_corners[2];
-        double angle = acos(dir1.dot(dir2)/(norm(dir1)*norm(dir2)));
+        double angle = std::acos(dir1.dot(dir2)/(norm(dir1)*norm(dir2)));
         if(cvIsNaN(angle) || cvIsInf(angle) || angle < 0.5 || angle > CV_PI - 0.5) continue;
 
         findLinesCrossPoint(origin1, dir1, origin2, dir2, corners[i]);
diff --git a/modules/calib/test/test_chessboardgenerator.cpp b/modules/calib/test/test_chessboardgenerator.cpp
index 7cc5598e3f..da7cd8e094 100644
--- a/modules/calib/test/test_chessboardgenerator.cpp
+++ b/modules/calib/test/test_chessboardgenerator.cpp
@@ -190,14 +190,14 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat
     float av = static_cast<float>(rng.uniform(-fovy/2 * cov, fovy/2 * cov) * CV_PI / 180);
 
     Point3f p;
-    p.z = cos(ah) * d1;
-    p.x = sin(ah) * d1;
-    p.y = p.z * tan(av);
+    p.z = std::cos(ah) * d1;
+    p.x = std::sin(ah) * d1;
+    p.y = p.z * std::tan(av);
 
     Point3f pb1, pb2;
     generateBasis(pb1, pb2);
 
-    float cbHalfWidth = static_cast<float>(norm(p) * sin( std::min(fovx, fovy) * 0.5 * CV_PI / 180));
+    float cbHalfWidth = static_cast<float>(norm(p) * std::sin( std::min(fovx, fovy) * 0.5 * CV_PI / 180));
     float cbHalfHeight = cbHalfWidth * patternSize.height / patternSize.width;
 
     float cbHalfWidthEx  =  cbHalfWidth * ( patternSize.width + 1) / patternSize.width;
@@ -255,9 +255,9 @@ Mat ChessBoardGenerator::operator ()(const Mat& bg, const Mat& camMat, const Mat
     float av = static_cast<float>(rng.uniform(-fovy/2 * cov, fovy/2 * cov) * CV_PI / 180);
 
     Point3f p;
-    p.z = cos(ah) * d1;
-    p.x = sin(ah) * d1;
-    p.y = p.z * tan(av);
+    p.z = std::cos(ah) * d1;
+    p.x = std::sin(ah) * d1;
+    p.y = p.z * std::tan(av);
 
     Point3f pb1, pb2;
     generateBasis(pb1, pb2);
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index ae2fce8423..4d7c82f2c7 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -58,8 +58,12 @@
 namespace cv
 {
 
+CV__DEBUG_NS_BEGIN
+
 class CV_EXPORTS _OutputArray;
 
+CV__DEBUG_NS_END
+
 //! @addtogroup core_basic
 //! @{
 
diff --git a/modules/stereo/src/stereo_geom.cpp b/modules/stereo/src/stereo_geom.cpp
index 752d591661..4e88b1e4a9 100644
--- a/modules/stereo/src/stereo_geom.cpp
+++ b/modules/stereo/src/stereo_geom.cpp
@@ -147,7 +147,7 @@ void stereoRectify( InputArray _cameraMatrix1, InputArray _distCoeffs1,
     Mat ww = t.cross(Mat(3, 1, CV_64F, _uu)), wR;
     double nw = norm(ww, NORM_L2);
     if (nw > 0.0)
-        ww *= acos(fabs(c)/nt)/nw;
+        ww *= std::acos(fabs(c)/nt)/nw;
     Rodrigues(ww, wR);
 
     Mat Ri;
@@ -607,7 +607,7 @@ float rectify3Collinear( InputArray _cameraMatrix1, InputArray _distCoeffs1,
     Mat_<double> ww = t12.cross(uu), wR;
     double nw = norm(ww, NORM_L2);
     CV_Assert(fabs(nw) > 0);
-    ww *= acos(fabs(c)/nt)/nw;
+    ww *= std::acos(fabs(c)/nt)/nw;
     Rodrigues(ww, wR);
 
     // now rotate camera 3 to make its optical axis parallel to cameras 1 and 2.