diff --git a/cxx/isce3/cuda/Sources.cmake b/cxx/isce3/cuda/Sources.cmake index b5ab0817f..ae9472736 100644 --- a/cxx/isce3/cuda/Sources.cmake +++ b/cxx/isce3/cuda/Sources.cmake @@ -29,23 +29,27 @@ geometry/utilities.cu image/gpuResampSlc.cu image/Resample.cu image/ResampSlc.cpp -matchtemplate/pycuampcor/cuAmpcorChunk.cu -matchtemplate/pycuampcor/cuAmpcorController.cu -matchtemplate/pycuampcor/cuAmpcorParameter.cu -matchtemplate/pycuampcor/cuArrays.cu +matchtemplate/pycuampcor/cuAmpcorController.cpp +matchtemplate/pycuampcor/cuAmpcorParameter.cpp +matchtemplate/pycuampcor/cuAmpcorProcessor.cpp +matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp +matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp +matchtemplate/pycuampcor/cuArrays.cpp matchtemplate/pycuampcor/cuArraysCopy.cu matchtemplate/pycuampcor/cuArraysPadding.cu matchtemplate/pycuampcor/cuCorrFrequency.cu matchtemplate/pycuampcor/cuCorrNormalization.cu matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu -matchtemplate/pycuampcor/cuCorrNormalizer.cu +matchtemplate/pycuampcor/cuCorrNormalizer.cpp matchtemplate/pycuampcor/cuCorrTimeDomain.cu matchtemplate/pycuampcor/cuDeramp.cu matchtemplate/pycuampcor/cuEstimateStats.cu matchtemplate/pycuampcor/cuOffset.cu -matchtemplate/pycuampcor/cuOverSampler.cu +matchtemplate/pycuampcor/cuOverSampler.cpp matchtemplate/pycuampcor/cuSincOverSampler.cu -matchtemplate/pycuampcor/GDALImage.cu +matchtemplate/pycuampcor/cudaError.cpp +matchtemplate/pycuampcor/cudaUtil.cpp +matchtemplate/pycuampcor/SlcImage.cpp product/SubSwaths.cu signal/gpuAzimuthFilter.cu signal/gpuCrossMul.cu diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu deleted file mode 100644 index ef54f0386..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu +++ /dev/null @@ -1,163 +0,0 @@ -/** - * @file GDALImage.h - * @brief Implementations of GDALImage class - * - */ - -// my declaration -#include "GDALImage.h" - -// 
dependencies -#include -#include "cudaError.h" - -/** - * Constructor - * @brief Create a GDAL image object - * @param filename a std::string with the raster image file name - * @param band the band number - * @param cacheSizeInGB read buffer size in GigaBytes - * @param useMmap whether to use memory map - */ -GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useMmap) - : _useMmap(useMmap) -{ - // open the file as dataset - _poDataset = (GDALDataset *) GDALOpen(filename.c_str(), GA_ReadOnly); - // if something is wrong, throw an exception - // GDAL reports the error message - if(!_poDataset) - throw; - - // check the band info - int count = _poDataset->GetRasterCount(); - if(band > count) - { - std::cout << "The desired band " << band << " is greater than " << count << " bands available"; - throw; - } - - // get the desired band - _poBand = _poDataset->GetRasterBand(band); - if(!_poBand) - throw; - - // get the width(x), and height(y) - _width = _poBand->GetXSize(); - _height = _poBand->GetYSize(); - - _dataType = _poBand->GetRasterDataType(); - // determine the image type - _isComplex = GDALDataTypeIsComplex(_dataType); - // determine the pixel size in bytes - _pixelSize = GDALGetDataTypeSize(_dataType); - - _bufferSize = 1024*1024*cacheSizeInGB; - - // checking whether using memory map - if(_useMmap) { - - char **papszOptions = NULL; - // if cacheSizeInGB = 0, use default - // else set the option - if(cacheSizeInGB > 0) - papszOptions = CSLSetNameValue( papszOptions, - "CACHE_SIZE", - std::to_string(_bufferSize).c_str()); - - // space between two lines - GIntBig pnLineSpace; - // set up the virtual mem buffer - _poBandVirtualMem = GDALGetVirtualMemAuto( - static_cast(_poBand), - GF_Read, - &_pixelSize, - &pnLineSpace, - papszOptions); - if(!_poBandVirtualMem) - throw; - - // get the starting pointer - _memPtr = CPLVirtualMemGetAddr(_poBandVirtualMem); - } - else { // use a buffer - checkCudaErrors(cudaMallocHost((void **)&_memPtr, 
_bufferSize)); - } - // make sure memPtr is not Null - if (!_memPtr) - { - std::cout << "unable to locate the memory buffer\n"; - throw; - } - // all done -} - - -/** - * Load a tile of data h_tile x w_tile from CPU to GPU - * @param dArray pointer for array in device memory - * @param h_offset Down/Height offset - * @param w_offset Across/Width offset - * @param h_tile Down/Height tile size - * @param w_tile Across/Width tile size - * @param stream CUDA stream for copying - * @note Need to use size_t type to pass the parameters to cudaMemcpy2D correctly - */ -void GDALImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, - size_t h_tile, size_t w_tile, cudaStream_t stream) -{ - - size_t tileStartOffset = (h_offset*_width + w_offset)*_pixelSize; - - char * startPtr = (char *)_memPtr ; - startPtr += tileStartOffset; - - if (_useMmap) { - // direct copy from memory map buffer to device memory - checkCudaErrors(cudaMemcpy2DAsync(dArray, // dst - w_tile*_pixelSize, // dst pitch - startPtr, // src - _width*_pixelSize, // src pitch - w_tile*_pixelSize, // width in Bytes - h_tile, // height - cudaMemcpyHostToDevice,stream)); - } - else { // use a cpu buffer to load image data to gpu - - // get the total tile size in bytes - size_t tileSize = h_tile*w_tile*_pixelSize; - // if the size is bigger than existing buffer, reallocate - if (tileSize > _bufferSize) { - // TODO: fit the pagesize - _bufferSize = tileSize; - checkCudaErrors(cudaFree(_memPtr)); - checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize)); - } - // copy from file to buffer - CPLErr err = _poBand->RasterIO(GF_Read, //eRWFlag - w_offset, h_offset, //nXOff, nYOff - w_tile, h_tile, // nXSize, nYSize - _memPtr, // pData - w_tile*h_tile, 1, // nBufXSize, nBufYSize - _dataType, //eBufType - 0, 0 //nPixelSpace, nLineSpace in pData - ); - if(err != CE_None) - throw; // throw if reading error occurs; message reported by GDAL - - // copy from buffer to gpu - 
checkCudaErrors(cudaMemcpyAsync(dArray, _memPtr, tileSize, cudaMemcpyHostToDevice, stream)); - } - // all done -} - -/// destructor -GDALImage::~GDALImage() -{ - // free the virtual memory - CPLVirtualMemFree(_poBandVirtualMem), - // free the GDAL Dataset, close the file - delete _poDataset; -} - -// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h deleted file mode 100644 index 91c3abead..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h +++ /dev/null @@ -1,83 +0,0 @@ -/** - * @file GDALImage.h - * @brief Interface with GDAL vrt driver - * - * To read image file with the GDAL vrt driver, including SLC, GeoTIFF images - * @warning Only single precision images are supported: complex(pixelOffset=8) or real(pixelOffset=4). - * @warning Only single band file is currently supported. - */ - -// code guard -#ifndef __GDALIMAGE_H -#define __GDALIMAGE_H - -// dependencies -#include -#include -#include - - -class GDALImage{ -public: - // specify the types - using size_t = std::size_t; - -private: - int _height; ///< image height - int _width; ///< image width - - void * _memPtr = NULL; ///< pointer to buffer - - int _pixelSize; ///< pixel size in bytes - - int _isComplex; ///< whether the image is complex - - size_t _bufferSize; ///< buffer size - int _useMmap; ///< whether to use memory map - - // GDAL temporary objects - GDALDataType _dataType; - CPLVirtualMem * _poBandVirtualMem = NULL; - GDALDataset * _poDataset = NULL; - GDALRasterBand * _poBand = NULL; - -public: - //disable default constructor - GDALImage() = delete; - // constructor - GDALImage(std::string fn, int band=1, int cacheSizeInGB=0, int useMmap=1); - // destructor - ~GDALImage(); - - // get class properties - void * getmemPtr() - { - return(_memPtr); - } - - int getHeight() { - return (_height); - } - - int getWidth() - { - return (_width); - } - - int getPixelSize() - { - return _pixelSize; - } - - bool 
isComplex() - { - return _isComplex; - } - - // load data from cpu buffer to gpu - void loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream); - -}; - -#endif //__GDALIMAGE_H -// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp new file mode 100644 index 000000000..d9ebfb84f --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp @@ -0,0 +1,109 @@ +#include +#include + +#include "cuAmpcorController.h" +#include "cuAmpcorParameter.h" + +PYBIND11_MODULE(PyCuAmpcor, m) +{ + m.doc() = "Python module controller for underlying CUDA-Ampcor code"; + + using str = std::string; + using cls = cuAmpcorController; + + pybind11::class_(m, "PyCuAmpcor") + .def(pybind11::init<>()) + + // define a trivial binding for a controller method +#define DEF_METHOD(name) def(#name, &cls::name) + + // define a trivial getter/setter for a controller parameter +#define DEF_PARAM_RENAME(T, pyname, cppname) \ + def_property(#pyname, [](const cls& self) -> T { \ + return self.param->cppname; \ + }, [](cls& self, const T i) { \ + self.param->cppname = i; \ + }) + + // same as above, for even more trivial cases where pyname == cppname +#define DEF_PARAM(T, name) DEF_PARAM_RENAME(T, name, name) + + .DEF_PARAM(int, algorithm) + .DEF_PARAM(int, deviceID) + .DEF_PARAM(int, nStreams) + .DEF_PARAM(int, derampMethod) + .DEF_PARAM(int, workflow) + + .DEF_PARAM(str, referenceImageName) + .DEF_PARAM(int, referenceImageHeight) + .DEF_PARAM(int, referenceImageWidth) + .DEF_PARAM(int, referenceImageDataType) + .DEF_PARAM(str, secondaryImageName) + .DEF_PARAM(int, secondaryImageHeight) + .DEF_PARAM(int, secondaryImageWidth) + .DEF_PARAM(int, secondaryImageDataType) + + .DEF_PARAM(int, numberWindowDown) + .DEF_PARAM(int, numberWindowAcross) + + .DEF_PARAM_RENAME(int, windowSizeHeight, windowSizeHeightRaw) + .DEF_PARAM_RENAME(int, windowSizeWidth, 
windowSizeWidthRaw) + + .DEF_PARAM(str, offsetImageName) + .DEF_PARAM(str, grossOffsetImageName) + .DEF_PARAM(int, mergeGrossOffset) + .DEF_PARAM(str, snrImageName) + .DEF_PARAM(str, covImageName) + .DEF_PARAM(str, peakValueImageName) + + .DEF_PARAM(int, rawDataOversamplingFactor) + .DEF_PARAM(int, corrStatWindowSize) + + .DEF_PARAM(int, numberWindowDownInChunk) + .DEF_PARAM(int, numberWindowAcrossInChunk) + + .DEF_PARAM(int, useMmap) + + .DEF_PARAM_RENAME(int, halfSearchRangeAcross, halfSearchRangeAcrossRaw) + .DEF_PARAM_RENAME(int, halfSearchRangeDown, halfSearchRangeDownRaw) + + .DEF_PARAM_RENAME(int, referenceStartPixelAcrossStatic, referenceStartPixelAcross0) + .DEF_PARAM_RENAME(int, referenceStartPixelDownStatic, referenceStartPixelDown0) + + .DEF_PARAM(int, corrSurfaceOverSamplingMethod) + .DEF_PARAM(int, corrSurfaceOverSamplingFactor) + + .DEF_PARAM_RENAME(int, mmapSize, mmapSizeInGB) + + .DEF_PARAM_RENAME(int, skipSampleDown, skipSampleDownRaw) + .DEF_PARAM_RENAME(int, skipSampleAcross, skipSampleAcrossRaw) + .DEF_PARAM_RENAME(int, corrSurfaceZoomInWindow, zoomWindowSize) + + .DEF_METHOD(runAmpcor) + + .DEF_METHOD(isDoublePrecision) + + .def("checkPixelInImageRange", [](const cls& self) { + self.param->checkPixelInImageRange(); + }) + + .def("setupParams", [](cls& self) { + self.param->setupParameters(); + }) + + .def("setConstantGrossOffset", [](cls& self, const int goDown, + const int goAcross) { + self.param->setStartPixels( + self.param->referenceStartPixelDown0, + self.param->referenceStartPixelAcross0, + goDown, goAcross); + }) + .def("setVaryingGrossOffset", [](cls& self, std::vector vD, + std::vector vA) { + self.param->setStartPixels( + self.param->referenceStartPixelDown0, + self.param->referenceStartPixelAcross0, + vD.data(), vA.data()); + }) + ; +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp new file mode 100644 index 000000000..ed4bf14f5 --- /dev/null +++ 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp @@ -0,0 +1,95 @@ +#include "SlcImage.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cudaError.h" +#include +#include + +SlcImage::SlcImage(const std::string& filepath, size_t img_height, size_t img_width, size_t pixel_size, size_t buffer_size) + : width(img_width), height(img_height), pixel_size(pixel_size), fd(-1), mapped_data(nullptr), + mapped_offset(0), mapped_size(0) +{ + file_size = width * height * pixel_size; + max_map_size = buffer_size*1024*1024*1024; + page_size = sysconf(_SC_PAGE_SIZE); // Get system page size + + // Open the file + fd = open(filepath.c_str(), O_RDONLY); + if (fd == -1) { + throw std::runtime_error("Failed to open file: " + filepath); + } +} + +void SlcImage::remapIfNeeded(size_t required_start, size_t required_end) +{ + + if(required_start < mapped_offset || required_end > mapped_offset + mapped_size) + { + // out of range, remap + // unmap first, if necessary + if(mapped_data!=nullptr) + munmap(mapped_data, mapped_size); + // align new mapping offset + // round to the page size + mapped_offset = (required_start/page_size)*page_size; + // compute the mapped size + mapped_size = file_size - mapped_offset; + // not to exceed the buffer size + if (mapped_size > max_map_size) { + mapped_size = max_map_size; + } + // remap + mapped_data = mmap(nullptr, mapped_size, PROT_READ, MAP_PRIVATE, fd, mapped_offset); + if (mapped_data == MAP_FAILED) { + throw std::runtime_error("Failed to mmap file at offset " + std::to_string(mapped_offset)); + } + } + // else - in range, do nothing +} + + +/// load a tile of data h_tile x w_tile from CPU (mmap) to GPU +/// @param dArray pointer for array in device memory +/// @param h_offset Down/Height offset +/// @param w_offset Across/Width offset +/// @param h_tile Down/Height tile size +/// @param w_tile Across/Width tile size +/// @param stream CUDA stream for copying +void SlcImage::loadToDevice(void 
*dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream) +{ + size_t tileStartAddress = (h_offset*width + w_offset)*pixel_size; + size_t tileLastAddress = tileStartAddress + (h_tile*width + w_tile)*pixel_size; + + remapIfNeeded(tileStartAddress, tileLastAddress); + + char *startPtr = (char *)mapped_data ; + startPtr += tileStartAddress - mapped_offset; + + // @note + // We assume down/across directions as rows/cols. Therefore, SLC mmap and device array both use row major. + // cuBlas assumes both source and target arrays are column major. + // To use cublasSetMatrix, we need to switch w_tile/h_tile for rows/cols + // checkCudaErrors(cublasSetMatrixAsync(w_tile, h_tile, pixelsize, startPtr, width, dArray, w_tile, stream)); + + checkCudaErrors(cudaMemcpy2DAsync(dArray, w_tile*pixel_size, startPtr, width*pixel_size, + w_tile*pixel_size, h_tile, cudaMemcpyHostToDevice,stream)); +} + +SlcImage::~SlcImage() +{ + if (mapped_data!=nullptr) { + munmap(mapped_data, mapped_size); + } + if (fd != -1) { + close(fd); + } +} + +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h new file mode 100644 index 000000000..a22650f1e --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h @@ -0,0 +1,38 @@ +// -*- c++ -*- +// file slcimage.h +// a mmap loader for slc images + +#ifndef __SLCIMAGE_H +#define __SLCIMAGE_H + +#include +#include + +class SlcImage{ +public: + // disable default constructor + SlcImage()=delete; + // constructor + SlcImage(const std::string& fn, size_t image_height, size_t image_width, size_t pixel_size, size_t buffersize); + // interface + void loadToDevice(void* dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream); + // destructor + ~SlcImage(); + +private: + int fd; + size_t file_size; + size_t pixel_size; + size_t height; + size_t width; + + void* mapped_data; + size_t page_size; + size_t 
mapped_offset; + size_t mapped_size; + size_t max_map_size; + + void remapIfNeeded(size_t required_start, size_t required_end); +}; + +#endif //__SLCIMAGE_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h deleted file mode 100644 index 5c990e233..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * @file cuAmpcorChunk.h - * @brief Ampcor processor for a batch of windows - * - * - */ - -#ifndef __CUAMPCORCHUNK_H -#define __CUAMPCORCHUNK_H - -#include "GDALImage.h" -#include "cuArrays.h" -#include "cuAmpcorParameter.h" -#include "cuOverSampler.h" -#include "cuSincOverSampler.h" -#include "cuCorrFrequency.h" -#include "cuCorrNormalizer.h" - - -/** - * cuAmpcor processor for a chunk (a batch of windows) - */ -class cuAmpcorChunk{ -private: - int idxChunkDown; ///< index of the chunk in total batches, down - int idxChunkAcross; ///< index of the chunk in total batches, across - int idxChunk; ///< - int nWindowsDown; ///< number of windows in one chunk, down - int nWindowsAcross; ///< number of windows in one chunk, across - - int devId; ///< GPU device ID to use - cudaStream_t stream; ///< CUDA stream to use - - GDALImage *referenceImage; ///< reference image object - GDALImage *secondaryImage; ///< secondary image object - cuAmpcorParameter *param; ///< reference to the (global) parameters - cuArrays *offsetImage; ///< output offsets image - cuArrays *snrImage; ///< snr image - cuArrays *covImage; ///< cov image - cuArrays *corrImage; ///< corr image - - // local variables and workers - // gpu buffer to load images from file - cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; - cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; - - // windows raw (not oversampled) data, complex and real - cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn; - cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; 
- - // windows oversampled data - cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; - cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; - cuArrays * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust; - - // offset data - cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; - - // oversampling processors for complex images - cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; - - // oversampling processor for correlation surface - cuOverSamplerR2R *corrOverSampler; - cuSincOverSamplerR2R *corrSincOverSampler; - - // cross-correlation processor with frequency domain algorithm - cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; - - // correlation surface normalizer - cuNormalizer *corrNormalizerRaw; - cuNormalizer *corrNormalizerOverSampled; - - // save offset results in different stages - cuArrays *offsetInit; - cuArrays *offsetZoomIn; - cuArrays *offsetFinal; - cuArrays *maxLocShift; // record the maxloc from the extract center - cuArrays *corrMaxValue; - cuArrays *i_maxloc; - cuArrays *r_maxval; - - // SNR estimation - cuArrays *r_corrBatchRawZoomIn; - cuArrays *r_corrBatchSum; - cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; - cuArrays *r_snrValue; - - // Variance estimation - cuArrays *r_covValue; - -public: - // constructor - cuAmpcorChunk(cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, - cuArrays *offsetImage_, cuArrays *snrImage_, - cuArrays *covImage_, cuArrays* corrImage_, cudaStream_t stream_); - // destructor - ~cuAmpcorChunk(); - - // local methods - void setIndex(int idxDown_, int idxAcross_); - void loadReferenceChunk(); - void loadSecondaryChunk(); - void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff); - // run the given chunk - void run(int, int); -}; - - - -#endif diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp similarity index 62% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp index e906ef340..ceb0a6759 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp @@ -7,21 +7,35 @@ #include "cuAmpcorController.h" // dependencies -#include "GDALImage.h" +#include "SlcImage.h" #include "cuArrays.h" #include "cudaUtil.h" -#include "cuAmpcorChunk.h" +#include "cuAmpcorProcessor.h" #include "cuAmpcorUtil.h" +#include #include // constructor cuAmpcorController::cuAmpcorController() { // create a new set of parameters - param.reset(new cuAmpcorParameter()); + param = new cuAmpcorParameter(); } +// destructor +cuAmpcorController::~cuAmpcorController() +{ + delete param; +} +bool cuAmpcorController::isDoublePrecision() +{ +#ifdef CUAMPCOR_DOUBLE + return true; +#else + return false; +#endif +} /** * Run ampcor * @@ -31,66 +45,67 @@ void cuAmpcorController::runAmpcor() { // set the gpu id param->deviceID = gpuDeviceInit(param->deviceID); - // initialize the gdal driver - GDALAllRegister(); - // reference and secondary images; use band=1 as default + + // reference and secondary images // TODO: selecting band std::cout << "Opening reference image " << param->referenceImageName << "...\n"; - GDALImage *referenceImage = new GDALImage(param->referenceImageName, 1, param->mmapSizeInGB); + SlcImage *referenceImage = new SlcImage(param->referenceImageName, param->referenceImageHeight, param->referenceImageWidth, + param->referenceImageDataType*sizeof(float), param->mmapSizeInGB); std::cout << "Opening secondary image " << param->secondaryImageName << "...\n"; - GDALImage *secondaryImage = new GDALImage(param->secondaryImageName, 1, param->mmapSizeInGB); + SlcImage *secondaryImage = new SlcImage(param->secondaryImageName, 
param->secondaryImageHeight, param->secondaryImageWidth, + param->secondaryImageDataType*sizeof(float), param->mmapSizeInGB); - cuArrays *offsetImage, *offsetImageRun; - cuArrays *snrImage, *snrImageRun; - cuArrays *covImage, *covImageRun; - cuArrays *corrImage, *corrImageRun; + cuArrays *offsetImage, *offsetImageRun; + cuArrays *snrImage, *snrImageRun; + cuArrays *covImage, *covImageRun; + cuArrays *peakValueImage, *peakValueImageRun; // nWindowsDownRun is defined as numberChunk * numberWindowInChunk // It may be bigger than the actual number of windows int nWindowsDownRun = param->numberChunkDown * param->numberWindowDownInChunk; int nWindowsAcrossRun = param->numberChunkAcross * param->numberWindowAcrossInChunk; - offsetImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + offsetImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); offsetImageRun->allocate(); - snrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + snrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); snrImageRun->allocate(); - covImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + covImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); covImageRun->allocate(); - corrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); - corrImageRun->allocate(); + peakValueImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + peakValueImageRun->allocate(); // Offset fields. - offsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + offsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); offsetImage->allocate(); // SNR. - snrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + snrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); snrImage->allocate(); // Variance. 
- covImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + covImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); covImage->allocate(); - // Cross-correlation peak - corrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); - corrImage->allocate(); + // Correlation surface peak value + peakValueImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + peakValueImage->allocate(); + + // set up the cuda streams cudaStream_t streams[param->nStreams]; - cuAmpcorChunk *chunk[param->nStreams]; + std::unique_ptr chunk[param->nStreams]; // iterate over cuda streams for(int ist=0; istnStreams; ist++) { // create each stream checkCudaErrors(cudaStreamCreate(&streams[ist])); // create the chunk processor for each stream - chunk[ist]= new cuAmpcorChunk(param.get(), referenceImage, secondaryImage, - offsetImageRun, snrImageRun, covImageRun, corrImageRun, - streams[ist]); - + chunk[ist]= cuAmpcorProcessor::create(param->workflow, param, referenceImage, secondaryImage, + offsetImageRun, snrImageRun, covImageRun, peakValueImageRun, streams[ist]); } int nChunksDown = param->numberChunkDown; @@ -131,17 +146,17 @@ void cuAmpcorController::runAmpcor() cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]); cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]); cuArraysCopyExtract(covImageRun, covImage, make_int2(0,0), streams[0]); - cuArraysCopyExtract(corrImageRun, corrImage, make_int2(0,0), streams[0]); + cuArraysCopyExtract(peakValueImageRun, peakValueImage, make_int2(0,0), streams[0]); /* save the offsets and gross offsets */ // copy the offset to host offsetImage->allocateHost(); offsetImage->copyToHost(streams[0]); // construct the gross offset - cuArrays *grossOffsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + cuArrays *grossOffsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); 
grossOffsetImage->allocateHost(); for(int i=0; i< param->numberWindows; i++) - grossOffsetImage->hostData[i] = make_float2(param->grossOffsetDown[i], param->grossOffsetAcross[i]); + grossOffsetImage->hostData[i] = make_real2(param->grossOffsetDown[i], param->grossOffsetAcross[i]); // check whether to merge gross offset if (param->mergeGrossOffset) @@ -158,25 +173,24 @@ void cuAmpcorController::runAmpcor() // save the snr/cov images snrImage->outputToFile(param->snrImageName, streams[0]); covImage->outputToFile(param->covImageName, streams[0]); - - // save the cross-correlation peak - corrImage->outputToFile(param->corrImageName, streams[0]); + peakValueImage->outputToFile(param->peakValueImageName, streams[0]); // Delete arrays. delete offsetImage; delete snrImage; delete covImage; - delete corrImage; + delete peakValueImage; delete offsetImageRun; delete snrImageRun; delete covImageRun; - delete corrImageRun; + delete peakValueImageRun; for (int ist=0; istnStreams; ist++) { + // cufftplan etc are stream dependent, need to be deleted before stream is destroyed + chunk[ist].release(); checkCudaErrors(cudaStreamDestroy(streams[ist])); - delete chunk[ist]; } delete referenceImage; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h index e807cce32..466fd6859 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h @@ -15,18 +15,19 @@ #ifndef CU_AMPCOR_CONTROLLER_H #define CU_AMPCOR_CONTROLLER_H -#include - // dependencies #include "cuAmpcorParameter.h" class cuAmpcorController { public: - std::unique_ptr param; ///< the parameter set + cuAmpcorParameter *param; ///< the parameter set // constructor cuAmpcorController(); + // destructor + ~cuAmpcorController(); // run interface void runAmpcor(); + bool isDoublePrecision(); }; #endif diff --git 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp similarity index 54% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp index 9acc6b294..0445f9343 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp @@ -5,6 +5,9 @@ #include "cuAmpcorParameter.h" #include +#include +#include + #ifndef IDIVUP #define IDIVUP(i,j) ((i+j-1)/j) @@ -22,7 +25,8 @@ cuAmpcorParameter::cuAmpcorParameter() algorithm = 0; //0 freq; 1 time deviceID = 0; nStreams = 1; - derampMethod = 1; + derampMethod = 0; + workflow = 0; windowSizeWidthRaw = 64; windowSizeHeightRaw = 64; @@ -31,21 +35,24 @@ cuAmpcorParameter::cuAmpcorParameter() skipSampleAcrossRaw = 64; skipSampleDownRaw = 64; - rawDataOversamplingFactor = 2; - zoomWindowSize = 16; - oversamplingFactor = 16; - oversamplingMethod = 0; + rawDataOversamplingFactor = 2; //antialiasing + zoomWindowSize = 16; // correlation surface oversampling zoom + corrSurfaceOverSamplingFactor = 16; // correlation surface oversampling + corrSurfaceOverSamplingMethod = 0; referenceImageName = "reference.slc"; referenceImageWidth = 1000; referenceImageHeight = 1000; + referenceImageDataType = 2; // complex secondaryImageName = "secondary.slc"; secondaryImageWidth = 1000; secondaryImageHeight = 1000; - offsetImageName = "DenseOffset.off"; - grossOffsetImageName = "GrossOffset.off"; - snrImageName = "snr.snr"; - covImageName = "cov.cov"; + secondaryImageDataType = 2; // complex + offsetImageName = "DenseOffset.bip"; + grossOffsetImageName = "GrossOffset.bip"; + snrImageName = "snr.bip"; + covImageName = "cov.bip"; + peakValueImageName = "peakValue.bip"; numberWindowDown = 1; numberWindowAcross = 1; numberWindowDownInChunk = 1; @@ -68,6 +75,34 @@ cuAmpcorParameter::cuAmpcorParameter() */ void 
cuAmpcorParameter::setupParameters() +{ + // workflow specific parameter settings + // + switch (workflow) { + case 0: + _setupParameters_TwoPass(); + break; + case 1: + _setupParameters_OnePass(); + break; + default: + throw std::invalid_argument("Unsupported workflow"); + } + + // common parameter settings + numberWindows = numberWindowDown*numberWindowAcross; + if(numberWindows <=0) { + fprintf(stderr, "Incorrect number of windows! (%d, %d)\n", numberWindowDown, numberWindowAcross); + exit(EXIT_FAILURE); + } + + numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk); + numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk); + numberChunks = numberChunkDown*numberChunkAcross; + allocateArrays(); +} + +void cuAmpcorParameter::_setupParameters_TwoPass() { // Size to extract the raw correlation surface for snr/cov corrRawZoomInHeight = std::min(corrStatWindowSize, 2*halfSearchRangeDownRaw+1); @@ -87,8 +122,8 @@ void cuAmpcorParameter::setupParameters() windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor; - searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; - searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeDownRaw; searchWindowSizeWidthRawZoomIn = windowSizeWidthRaw + 2*halfZoomWindowSizeRaw; searchWindowSizeHeightRawZoomIn = windowSizeHeightRaw + 2*halfZoomWindowSizeRaw; @@ -96,40 +131,126 @@ void cuAmpcorParameter::setupParameters() searchWindowSizeWidth = searchWindowSizeWidthRawZoomIn*rawDataOversamplingFactor; searchWindowSizeHeight = searchWindowSizeHeightRawZoomIn*rawDataOversamplingFactor; - numberWindows = numberWindowDown*numberWindowAcross; - if(numberWindows <=0) { - fprintf(stderr, "Incorrect number of windows! 
(%d, %d)\n", numberWindowDown, numberWindowAcross); - exit(EXIT_FAILURE); - } + windowSizeWidthRawEnlarged = windowSizeWidthRaw; + windowSizeHeightRawEnlarged = windowSizeHeightRaw; + + // loading offsets + referenceLoadingOffsetDown = 0; + referenceLoadingOffsetAcross = 0; + // secondary loading offset relative to reference + secondaryLoadingOffsetDown = -halfSearchRangeDownRaw; + secondaryLoadingOffsetAcross = -halfSearchRangeAcrossRaw; + +} + +void cuAmpcorParameter::_setupParameters_OnePass() +{ + + // template window size (after antialiasing oversampling) + windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // + windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor; + + // serve as extra margin for search range + halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); + + // add extra search range to ensure enough area to oversample correlation surface + halfSearchRangeDownRaw += halfZoomWindowSizeRaw; + halfSearchRangeAcrossRaw += halfZoomWindowSizeRaw; + + // search window size before antialiasing oversampling + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeDownRaw; + + windowSizeHeightRawEnlarged = searchWindowSizeHeightRaw; + windowSizeWidthRawEnlarged = searchWindowSizeWidthRaw; + + // search window size (after antialiasing oversampling) + searchWindowSizeWidth = searchWindowSizeWidthRaw*rawDataOversamplingFactor; + searchWindowSizeHeight = searchWindowSizeHeightRaw*rawDataOversamplingFactor; + + windowSizeHeightEnlarged = searchWindowSizeHeight; + windowSizeWidthEnlarged = searchWindowSizeWidth; + + // loading offsets + // reference size matching the secondary size + referenceLoadingOffsetDown = -halfSearchRangeDownRaw; + referenceLoadingOffsetAcross = -halfSearchRangeAcrossRaw; + // secondary loading offset relative to reference + secondaryLoadingOffsetDown = 0; + secondaryLoadingOffsetAcross = 0; + + // correlation 
surface size + corrWindowSize = make_int2(searchWindowSizeHeight - windowSizeHeight + 1, + searchWindowSizeWidth - windowSizeWidth +1); + // check zoom in window size, if larger, issue a warning + if(zoomWindowSize >= corrWindowSize.x || zoomWindowSize >= corrWindowSize.y) + fprintf(stderr, "Warning: zoomWindowSize %d is bigger than the original correlation surface size (%d, %d)!\n", + zoomWindowSize, corrWindowSize.x, corrWindowSize.y ); + // use the smaller values + corrZoomInSize = make_int2(std::min(zoomWindowSize+1, corrWindowSize.x), + std::min(zoomWindowSize+1, corrWindowSize.y)); + + // oversampled correlation surface size + corrZoomInOversampledSize = make_int2(corrZoomInSize.x * corrSurfaceOverSamplingFactor, + corrZoomInSize.y * corrSurfaceOverSamplingFactor); + + int totalOS = rawDataOversamplingFactor*corrSurfaceOverSamplingFactor; + // search start : image center - 1 pixel * totalOS + corrZoomInOversampledSearchStart = make_int2((corrZoomInSize.x-1)*corrSurfaceOverSamplingFactor/2 - totalOS, + (corrZoomInSize.y-1)*corrSurfaceOverSamplingFactor/2 - totalOS); + // search range \pm 1 pixel * totalOS + corrZoomInOversampledSearchRange = make_int2(2*totalOS, 2*totalOS); + + fprintf(stderr, "zoomWindowSize is (%d, %d)!\n", + corrZoomInSize.x, corrZoomInSize.y ); - numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk); - numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk); - numberChunks = numberChunkDown*numberChunkAcross; - allocateArrays(); } void cuAmpcorParameter::allocateArrays() { - int arraySize = numberWindows; - grossOffsetDown.resize(arraySize); - grossOffsetAcross.resize(arraySize); - referenceStartPixelDown.resize(arraySize); - referenceStartPixelAcross.resize(arraySize); - secondaryStartPixelDown.resize(arraySize); - secondaryStartPixelAcross.resize(arraySize); - - int arraySizeChunk = numberChunks; - referenceChunkStartPixelDown.resize(arraySizeChunk); - referenceChunkStartPixelAcross.resize(arraySizeChunk); 
- secondaryChunkStartPixelDown.resize(arraySizeChunk); - secondaryChunkStartPixelAcross.resize(arraySizeChunk); - referenceChunkHeight.resize(arraySizeChunk); - referenceChunkWidth.resize(arraySizeChunk); - secondaryChunkHeight.resize(arraySizeChunk); - secondaryChunkWidth.resize(arraySizeChunk); + int arraySize = numberWindows*sizeof(int); + grossOffsetDown = (int *)malloc(arraySize); + grossOffsetAcross = (int *)malloc(arraySize); + referenceStartPixelDown = (int *)malloc(arraySize); + referenceStartPixelAcross = (int *)malloc(arraySize); + secondaryStartPixelDown = (int *)malloc(arraySize); + secondaryStartPixelAcross = (int *)malloc(arraySize); + + int arraySizeChunk = numberChunks*sizeof(int); + referenceChunkStartPixelDown = (int *)malloc(arraySizeChunk); + referenceChunkStartPixelAcross = (int *)malloc(arraySizeChunk); + secondaryChunkStartPixelDown = (int *)malloc(arraySizeChunk); + secondaryChunkStartPixelAcross = (int *)malloc(arraySizeChunk); + referenceChunkHeight = (int *)malloc(arraySizeChunk); + referenceChunkWidth = (int *)malloc(arraySizeChunk); + secondaryChunkHeight = (int *)malloc(arraySizeChunk); + secondaryChunkWidth = (int *)malloc(arraySizeChunk); } +void cuAmpcorParameter::deallocateArrays() +{ + free(grossOffsetDown); + free(grossOffsetAcross); + free(referenceStartPixelDown); + free(referenceStartPixelAcross); + free(secondaryStartPixelDown); + free(secondaryStartPixelAcross); + free(referenceChunkStartPixelDown); + free(referenceChunkStartPixelAcross); + free(secondaryChunkStartPixelDown); + free(secondaryChunkStartPixelAcross); + free(referenceChunkHeight); + free(referenceChunkWidth); + free(secondaryChunkHeight); + free(secondaryChunkWidth); +} + + +// **************** +// make reference window the same as secondary for oversampling +// **************** + /// Set starting pixels for reference and secondary windows from arrays /// set also gross offsets between reference and secondary windows /// @@ -137,12 +258,12 @@ void 
cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD { for(int i=0; i referenceImageHeight) mChunkED = referenceImageHeight; + mChunkEA += windowSizeWidthRawEnlarged; + if (mChunkEA > referenceImageWidth) mChunkED = referenceImageWidth; + sChunkED += searchWindowSizeHeightRaw; + if (sChunkED > secondaryImageHeight) sChunkED = secondaryImageHeight; + sChunkEA += searchWindowSizeWidthRaw; + if (sChunkEA > secondaryImageWidth) sChunkED = secondaryImageWidth; + // set the starting pixel and size of the chunk referenceChunkStartPixelDown[idxChunk] = mChunkSD; referenceChunkStartPixelAcross[idxChunk] = mChunkSA; secondaryChunkStartPixelDown[idxChunk] = sChunkSD; secondaryChunkStartPixelAcross[idxChunk] = sChunkSA; - referenceChunkHeight[idxChunk] = mChunkED - mChunkSD + windowSizeHeightRaw; - referenceChunkWidth[idxChunk] = mChunkEA - mChunkSA + windowSizeWidthRaw; - secondaryChunkHeight[idxChunk] = sChunkED - sChunkSD + searchWindowSizeHeightRaw; - secondaryChunkWidth[idxChunk] = sChunkEA - sChunkSA + searchWindowSizeWidthRaw; + referenceChunkHeight[idxChunk] = mChunkED - mChunkSD; + referenceChunkWidth[idxChunk] = mChunkEA - mChunkSA; + secondaryChunkHeight[idxChunk] = sChunkED - sChunkSD; + secondaryChunkWidth[idxChunk] = sChunkEA - sChunkSA; + // search the max chunk size, used to determine the allocated read buffer size if(maxReferenceChunkHeight < referenceChunkHeight[idxChunk]) maxReferenceChunkHeight = referenceChunkHeight[idxChunk]; if(maxReferenceChunkWidth < referenceChunkWidth[idxChunk] ) maxReferenceChunkWidth = referenceChunkWidth[idxChunk]; if(maxSecondaryChunkHeight < secondaryChunkHeight[idxChunk]) maxSecondaryChunkHeight = secondaryChunkHeight[idxChunk]; @@ -253,8 +394,11 @@ void cuAmpcorParameter::setChunkStartPixels() } /// check whether reference and secondary windows are within the image range +// now issue warning rather than errors void cuAmpcorParameter::checkPixelInImageRange() { +// check range is no longer required, but 
offered as an option in DEBUG +#ifdef CUAMPCOR_DEBUG int endPixel; for(int row=0; row= referenceImageHeight) { - fprintf(stderr, "Reference Window end pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Warning: Reference Window end pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); } endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw; if(endPixel >= referenceImageWidth) { - fprintf(stderr, "Reference Window end pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Reference Window end pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); } //secondary if(secondaryStartPixelDown[i] <0) { - fprintf(stderr, "Secondary Window start pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window start pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]); } if(secondaryStartPixelAcross[i] <0) { - fprintf(stderr, "Secondary Window start pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window start pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]); } endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw; if(endPixel >= secondaryImageHeight) { - fprintf(stderr, "Secondary Window end pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window end pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); } endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw; if(endPixel >= secondaryImageWidth) { - fprintf(stderr, "Secondary Window end pixel out of range in Across, window (%d,%d), 
pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window end pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); } } } +#endif // CUAMPCOR_DEBUG } -cuAmpcorParameter::~cuAmpcorParameter() {} +cuAmpcorParameter::~cuAmpcorParameter() +{ + deallocateArrays(); +} // end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h index 357f36ba9..6a6d83a12 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h @@ -10,7 +10,7 @@ #define __CUAMPCORPARAMETER_H #include -#include +#include // for int2 /// Class container for all parameters /// @@ -32,40 +32,34 @@ class cuAmpcorParameter{ public: - - // TODO this is to avoid memory issues due to copying the non-owning pointers - // we should use RAII ownership so these can be trivially defaulted - cuAmpcorParameter(const cuAmpcorParameter&) = delete; - cuAmpcorParameter& operator=(const cuAmpcorParameter&) = delete; - cuAmpcorParameter(cuAmpcorParameter&&) = delete; - cuAmpcorParameter& operator=(cuAmpcorParameter&&) = delete; - int algorithm; ///< Cross-correlation algorithm: 0=freq domain (default) 1=time domain int deviceID; ///< Targeted GPU device ID: use -1 to auto select int nStreams; ///< Number of streams to asynchonize data transfers and compute kernels int derampMethod; ///< Method for deramping 0=None, 1=average + int workflow; ///< Workflow 0: two passes, first pass without antialiasing oversampling, 1: one pass with antialiasing oversampling // chip or window size for raw data int windowSizeHeightRaw; ///< Template window height (original size) int windowSizeWidthRaw; ///< Template window width (original size) + + int windowSizeHeightRawEnlarged; ///< Template window height Enlarged to search window size for oversampling + int windowSizeWidthRawEnlarged; ///< Template window width 
Enlarged to search window size for oversampling + int searchWindowSizeHeightRaw; ///< Search window height (original size) int searchWindowSizeWidthRaw; ///< Search window width (original size) int halfSearchRangeDownRaw; ///< (searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 int halfSearchRangeAcrossRaw; ///< (searchWindowSizeWidthRaw-windowSizeWidthRaw)/2 // search range is (-halfSearchRangeRaw, halfSearchRangeRaw) - - int searchWindowSizeHeightRawZoomIn; ///< search window height used for zoom in - int searchWindowSizeWidthRawZoomIn; ///< search window width used for zoom in - - int corrStatWindowSize; ///< correlation surface size used to estimate snr - int corrRawZoomInHeight; ///< correlation surface height used for oversampling - int corrRawZoomInWidth; ///< correlation surface width used for oversampling + // note the search range now includes extra margin for the correlation surface extraction // chip or window size after oversampling - int rawDataOversamplingFactor; ///< Raw data overampling factor (from original size to oversampled size) + int rawDataOversamplingFactor; ///< Raw data oversampling factor (from original size to oversampled size) int windowSizeHeight; ///< Template window length (oversampled size) int windowSizeWidth; ///< Template window width (original size) + int windowSizeHeightEnlarged; ///< Template window length enlarged (oversampled size) + int windowSizeWidthEnlarged; ///< Template window width enlarged (original size) + int searchWindowSizeHeight; ///< Search window height (oversampled size) int searchWindowSizeWidth; ///< Search window width (oversampled size) @@ -77,21 +71,35 @@ class cuAmpcorParameter{ int zoomWindowSize; ///< Zoom-in window size in correlation surface (same for down and across directions) int halfZoomWindowSizeRaw; ///< half of zoomWindowSize/rawDataOversamplingFactor - int oversamplingFactor; ///< Oversampling factor for interpolating correlation surface - int oversamplingMethod; ///< correlation surface 
oversampling method 0 = fft (default) 1 = sinc + int corrSurfaceOverSamplingFactor; ///< Oversampling factor for interpolating correlation surface + int corrSurfaceOverSamplingMethod; ///< correlation surface oversampling method 0 = fft (default) 1 = sinc + + // correlation surface + int2 corrWindowSize; // 2*halfSearchRange + 1 + int2 corrZoomInSize; // zoomWindowSize+1 + int2 corrZoomInOversampledSize; // corrZoomInSize * oversamplingFactor + int2 corrZoomInOversampledSearchStart; // search start position in the oversampled correlation surface + int2 corrZoomInOversampledSearchRange; // search range in the oversampled correlation surface + + int corrStatWindowSize; ///< correlation surface size used to estimate snr + // parameters used in the first pass in two-pass workflow + // @TODO move them to workflow specific files + int searchWindowSizeHeightRawZoomIn; + int searchWindowSizeWidthRawZoomIn; + int corrRawZoomInHeight; ///< correlation surface height used for oversampling + int corrRawZoomInWidth; ///< correlation surface width used for oversampling - float thresholdSNR; ///< Threshold of Signal noise ratio to remove noisy data //reference image std::string referenceImageName; ///< reference SLC image name - int imageDataType1; ///< reference image data type, 2=cfloat=complex=float2 1=float + int referenceImageDataType; ///< reference image data type, 2=cfloat=complex=float2 1=float int referenceImageHeight; ///< reference image height int referenceImageWidth; ///< reference image width //secondary image std::string secondaryImageName; ///< secondary SLC image name - int imageDataType2; ///< secondary image data type, 2=cfloat=complex=float2 1=float + int secondaryImageDataType; ///< secondary image data type, 2=cfloat=complex=float2 1=float int secondaryImageHeight; ///< secondary image height int secondaryImageWidth; ///< secondary image width @@ -113,32 +121,35 @@ class cuAmpcorParameter{ int referenceStartPixelDown0; ///< first starting pixel in reference 
image (down) int referenceStartPixelAcross0; ///< first starting pixel in reference image (across) - std::vector referenceStartPixelDown; ///< reference starting pixels for each window (down) - std::vector referenceStartPixelAcross; ///< reference starting pixels for each window (across) - std::vector secondaryStartPixelDown; ///< secondary starting pixels for each window (down) - std::vector secondaryStartPixelAcross; ///< secondary starting pixels for each window (across) + int *referenceStartPixelDown; ///< reference starting pixels for each window (down) + int *referenceStartPixelAcross; ///< reference starting pixels for each window (across) + int *secondaryStartPixelDown; ///< secondary starting pixels for each window (down) + int *secondaryStartPixelAcross; ///< secondary starting pixels for each window (across) int grossOffsetDown0; ///< gross offset static component (down) int grossOffsetAcross0; ///< gross offset static component (across) - std::vector grossOffsetDown; ///< Gross offsets between reference and secondary windows (down) - std::vector grossOffsetAcross; ///< Gross offsets between reference and secondary windows (across) + int *grossOffsetDown; ///< Gross offsets between reference and secondary windows (down) + int *grossOffsetAcross; ///< Gross offsets between reference and secondary windows (across) int mergeGrossOffset; ///< whether to merge gross offsets into the final offsets - std::vector referenceChunkStartPixelDown; ///< reference starting pixels for each chunk (down) - std::vector referenceChunkStartPixelAcross; ///< reference starting pixels for each chunk (across) - std::vector secondaryChunkStartPixelDown; ///< secondary starting pixels for each chunk (down) - std::vector secondaryChunkStartPixelAcross; ///< secondary starting pixels for each chunk (across) - std::vector referenceChunkHeight; ///< reference chunk height - std::vector referenceChunkWidth; ///< reference chunk width - std::vector secondaryChunkHeight; ///< secondary 
chunk height - std::vector secondaryChunkWidth; ///< secondary chunk width + int *referenceChunkStartPixelDown; ///< reference starting pixels for each chunk (down) + int *referenceChunkStartPixelAcross; ///< reference starting pixels for each chunk (across) + int *secondaryChunkStartPixelDown; ///< secondary starting pixels for each chunk (down) + int *secondaryChunkStartPixelAcross; ///< secondary starting pixels for each chunk (across) + int *referenceChunkHeight; ///< reference chunk height + int *referenceChunkWidth; ///< reference chunk width + int *secondaryChunkHeight; ///< secondary chunk height + int *secondaryChunkWidth; ///< secondary chunk width int maxReferenceChunkHeight, maxReferenceChunkWidth; ///< max reference chunk size int maxSecondaryChunkHeight, maxSecondaryChunkWidth; ///< max secondary chunk size + int referenceLoadingOffsetDown, referenceLoadingOffsetAcross; ///< reference loading offset, depending on workflow (e.g, matching secondary) + int secondaryLoadingOffsetDown, secondaryLoadingOffsetAcross; ///< secondary loading offset, depending on workflow (e.g., with extra pads) + std::string grossOffsetImageName; ///< gross offset output filename std::string offsetImageName; ///< Offset fields output filename std::string snrImageName; ///< Output SNR filename std::string covImageName; ///< Output variance filename - std::string corrImageName; ///< Output cross-correlation peak filename + std::string peakValueImageName; ///< Output normalized correlation surface peak value filename // Class constructor and default parameters setter cuAmpcorParameter(); @@ -147,6 +158,9 @@ class cuAmpcorParameter{ // Allocate various arrays after the number of Windows is given void allocateArrays(); + // Deallocate arrays on exit + void deallocateArrays(); + // Three methods to set reference/secondary starting pixels and gross offsets from input reference start pixel(s) and gross offset(s) // 1 (int *, int *, int *, int *): varying reference start pixels and 
gross offsets @@ -161,6 +175,8 @@ class cuAmpcorParameter{ void checkPixelInImageRange(); // Process other parameters after Python Input void setupParameters(); + void _setupParameters_TwoPass(); + void _setupParameters_OnePass(); }; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp new file mode 100644 index 000000000..d96ff8f57 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp @@ -0,0 +1,81 @@ +#include "cuAmpcorProcessor.h" +#include "cuAmpcorProcessorTwoPass.h" +#include "cuAmpcorProcessorOnePass.h" + +#include + +// Factory method implementation +// create the batch processor for a given {workflow} +std::unique_ptr cuAmpcorProcessor::create(int workflow, + cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) +{ + if (workflow == 0) { + return std::unique_ptr(new cuAmpcorProcessorTwoPass( + param_, reference_, secondary_, offsetImage_, + snrImage_, covImage_, peakValueImage_, stream_)); + } else if (workflow == 1) { + return std::unique_ptr(new cuAmpcorProcessorOnePass( + param_, reference_, secondary_, offsetImage_, + snrImage_, covImage_, peakValueImage_, stream_)); + } else { + throw std::invalid_argument("Unsupported workflow"); + } +} + +// constructor +cuAmpcorProcessor::cuAmpcorProcessor(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) + : param(param_), referenceImage(reference_), secondaryImage(secondary_), + offsetImage(offsetImage_), snrImage(snrImage_), covImage(covImage_), + peakValueImage(peakValueImage_), stream(stream_) +{ +} + +/// set chunk index +void cuAmpcorProcessor::setIndex(int idxDown_, int idxAcross_) +{ + idxChunkDown = idxDown_; + 
idxChunkAcross = idxAcross_; + idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; + + if(idxChunkDown == param->numberChunkDown -1) { + nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); + } + else { + nWindowsDown = param->numberWindowDownInChunk; + } + + if(idxChunkAcross == param->numberChunkAcross -1) { + nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); + } + else { + nWindowsAcross = param->numberWindowAcrossInChunk; + } +} + +/// obtain the starting pixels for each chip +/// @param[in] oStartPixel start pixel locations for all chips +/// @param[out] rstartPixel start pixel locations for chips within the chunk +void cuAmpcorProcessor::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) +{ + for(int i=0; inumberWindowDownInChunk; ++i) { + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; + for(int j=0; jnumberWindowAcrossInChunk; ++j){ + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; + int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; + } + } +} + diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h new file mode 100644 index 000000000..2e084a67a --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h @@ -0,0 +1,73 @@ +/* + * @file cuAmpcorChunk.h + * @brief Ampcor processor for a batch of windows + * + * + */ + +#ifndef __CUAMPCORPROCESSOR_H +#define __CUAMPCORPROCESSOR_H + +#include "SlcImage.h" +#include "data_types.h" +#include "cuArrays.h" +#include "cuAmpcorParameter.h" +#include "cuOverSampler.h" +#include "cuSincOverSampler.h" +#include 
"cuCorrFrequency.h" +#include "cuCorrNormalizer.h" +#include + + +/** + * cuAmpcor batched processor (virtual class) + */ +class cuAmpcorProcessor{ +// shared variables +protected: + int idxChunkDown; ///< index of the chunk in total batches, down + int idxChunkAcross; ///< index of the chunk in total batches, across + int idxChunk; ///< + int nWindowsDown; ///< number of windows in one chunk, down + int nWindowsAcross; ///< number of windows in one chunk, across + + int devId; ///< GPU device ID to use + cudaStream_t stream; ///< CUDA stream to use + + SlcImage *referenceImage; ///< reference image object + SlcImage *secondaryImage; ///< secondary image object + cuAmpcorParameter *param; ///< reference to the (global) parameters + cuArrays *offsetImage; ///< output offsets image + cuArrays *snrImage; ///< snr image + cuArrays *covImage; ///< cov image + cuArrays *peakValueImage; ///< peak value image + + +public: + // default constructor and destructor + cuAmpcorProcessor(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + virtual ~cuAmpcorProcessor() = default; + + // Factory method (virtual constructor) + static std::unique_ptr create(int workflow, + cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + + // workflow specific methods + virtual void run(int, int) = 0; + +protected: + // shared methods + void setIndex(int idxDown_, int idxAcross_); + void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff); + +}; + +#endif //__CUAMPCORPROCESSOR_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp new file mode 100644 index 000000000..060f6f574 --- /dev/null +++ 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp @@ -0,0 +1,527 @@ +#include "cuAmpcorProcessorOnePass.h" + +#include "cuAmpcorUtil.h" +#include +#include + +/** + * Run ampcor process for a batch of images (a chunk) + * @param[in] idxDown_ index of the chunk along Down/Azimuth direction + * @param[in] idxAcross_ index of the chunk along Across/Range direction + */ +void cuAmpcorProcessorOnePass::run(int idxDown_, int idxAcross_) +{ + // set chunk index + setIndex(idxDown_, idxAcross_); + + // load reference image chunk + loadReferenceChunk(); + // oversample reference + // (deramping included in oversampler) + referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod); + // c_referenceBatchRaw and c_referenceBatchOverSampled now have enlarged size + + int2 offset = make_int2((c_referenceBatchOverSampled->height - r_referenceBatchOverSampled->height)/2, + (c_referenceBatchOverSampled->width - r_referenceBatchOverSampled->width)/2); + // extract and take amplitudes + cuArraysCopyExtractAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, offset, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw reference image(s) + c_referenceBatchRaw->outputToFile("c_referenceBatchRaw", stream); + // dump the oversampled reference image(s) + c_referenceBatchOverSampled->outputToFile("c_referenceBatchOverSampled", stream); + r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampled", stream); +#endif + + // compute and subtract the mean value + cuArraysSubtractMean(r_referenceBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled reference image(s) with mean subtracted + r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream); +#endif + + // load secondary image chunk to c_secondaryBatchRaw + loadSecondaryChunk(); + // oversampling the secondary image(s) + secondaryBatchOverSampler->execute(c_secondaryBatchRaw, 
c_secondaryBatchOverSampled, param->derampMethod); + // take amplitudes + cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw secondary image + c_secondaryBatchRaw->outputToFile("c_secondaryBatchRaw", stream); + // dump the oversampled secondary image(s) + c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream); + r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream); +#endif + + // correlate oversampled images + if(param->algorithm == 0) { + cuCorrFreqDomain_OverSampled->execute(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatch); + } + else { + cuCorrTimeDomain(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatch, stream); + } + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface (un-normalized) + r_corrBatch->outputToFile("r_corrBatch", stream); +#endif + + // normalize the correlation surface + corrNormalizerOverSampled->execute(r_corrBatch, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface (normalized) + r_corrBatch->outputToFile("r_corrBatchNormed", stream); +#endif + + // find the maximum location of the correlation surface, in a rectangle area {range} from {start} + int extraPadSize = param->halfZoomWindowSizeRaw*param->rawDataOversamplingFactor; + int2 start = make_int2(extraPadSize, extraPadSize); + int2 range = make_int2(r_corrBatch->height-2*extraPadSize, r_corrBatch->width-2*extraPadSize); + cuArraysMaxloc2D(r_corrBatch, start, range, offsetInit, r_maxval, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the max location and value + offsetInit->outputToFile("i_offsetInit", stream); + r_maxval->outputToFile("r_maxvalInit", stream); +#endif + + // extract a smaller chip around the peak {offsetInit} + // with extra pads, i_corrBatchValidCount is no longer needed + cuArraysCopyExtractCorr(r_corrBatch, 
r_corrBatchZoomIn, offsetInit, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the extracted correlation Surface + r_corrBatchZoomIn->outputToFile("r_corrBatchZoomIn", stream); +#endif + + // statistics of correlation surface + // estimate variance on r_corrBatch + cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, param->rawDataOversamplingFactor, r_covValue, stream); + + // snr on the extracted surface r_corrBatchZoomIn + cuArraysSumSquare(r_corrBatchZoomIn, r_corrBatchSum, stream); + int corrSurfaceSize = r_corrBatch->height*r_corrBatch->width; + cuEstimateSnr(r_corrBatchSum, r_maxval, r_snrValue, corrSurfaceSize, stream); + +#ifdef CUAMPCOR_DEBUG + r_snrValue->outputToFile("r_snrValue", stream); + r_covValue->outputToFile("r_covValue", stream); +#endif + + // oversample the correlation surface + if(param->corrSurfaceOverSamplingMethod) { + // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor + // we need the max loc as the center if shifted + corrSincOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled, + maxLocShift, param->corrSurfaceOverSamplingFactor*param->rawDataOversamplingFactor + ); + + } + else { + corrOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled); + } + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface + r_corrBatchZoomInOverSampled->outputToFile("r_corrBatchZoomInOverSampled", stream); +#endif + + //find the max again, within the range of \pm 1 pixel * totalOS + cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, param->corrZoomInOversampledSearchStart, param->corrZoomInOversampledSearchRange, offsetZoomIn, corrMaxValue, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the max location on oversampled correlation surface + offsetZoomIn->outputToFile("i_offsetZoomIn", stream); + corrMaxValue->outputToFile("r_maxvalZoomInOversampled", stream); +#endif + + // determine the final offset from initial (pixel/2) and oversampled (sub-pixel) + 
cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, + make_int2(param->corrWindowSize.x/2, param->corrWindowSize.y/2), // init offset origin + param->rawDataOversamplingFactor, // init offset factor + make_int2(param->corrZoomInSize.x/2*param->corrSurfaceOverSamplingFactor, param->corrZoomInSize.y/2*param->corrSurfaceOverSamplingFactor), + param->rawDataOversamplingFactor*param->corrSurfaceOverSamplingFactor, + stream); + +#ifdef CUAMPCOR_DEBUG + // dump the final offset + offsetFinal->outputToFile("i_offsetFinal", stream); +#endif + + // Insert the chunk results to final images + cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // snr + cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // Variance. + cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // peak value. 
+ cuArraysCopyInsert(r_maxval, peakValueImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // all done +} + + + +/// constructor +cuAmpcorProcessorOnePass::cuAmpcorProcessorOnePass(cuAmpcorParameter *param_, SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) + : cuAmpcorProcessor(param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_) +{ + + // allocate the SLC image read buffer + ChunkOffsetDown = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetDown->allocate(); + ChunkOffsetDown->allocateHost(); + ChunkOffsetAcross = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetAcross->allocate(); + ChunkOffsetAcross->allocateHost(); + + c_referenceBatchRaw = new cuArrays ( + param->windowSizeHeightRawEnlarged, param->windowSizeWidthRawEnlarged, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchRaw->allocate(); + + c_secondaryBatchRaw = new cuArrays ( + param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchRaw->allocate(); + + c_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeightEnlarged, param->windowSizeWidthEnlarged, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchOverSampled->allocate(); + + c_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchOverSampled->allocate(); + + r_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeight, param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + 
r_referenceBatchOverSampled->allocate(); + + r_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_secondaryBatchOverSampled->allocate(); + + referenceBatchOverSampler = new cuOverSamplerC2C( + c_referenceBatchRaw->height, c_referenceBatchRaw->width, //orignal size + c_referenceBatchOverSampled->height, c_referenceBatchOverSampled->width, //oversampled size + c_referenceBatchRaw->count, stream); + + secondaryBatchOverSampler = new cuOverSamplerC2C( + c_secondaryBatchRaw->height, c_secondaryBatchRaw->width, + c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, + c_secondaryBatchRaw->count, stream); + + r_corrBatch = new cuArrays ( + param->corrWindowSize.x, + param->corrWindowSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatch->allocate(); + + r_corrBatchZoomIn = new cuArrays ( + param->corrZoomInSize.x, + param->corrZoomInSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomIn->allocate(); + + r_corrBatchZoomInOverSampled = new cuArrays ( + param->corrZoomInOversampledSize.x, + param->corrZoomInOversampledSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomInOverSampled->allocate(); + + offsetInit = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetInit->allocate(); + + offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetZoomIn->allocate(); + + offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetFinal->allocate(); + + // the sinc interpolation center is at the correlation surface center + // E.g. (10, 10) for 21x21. 
+ maxLocShift = make_int2(param->corrWindowSize.x/2, param->corrWindowSize.y/2); + + corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + corrMaxValue->allocate(); + + r_corrBatchSum = new cuArrays ( + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchSum->allocate(); + + i_maxloc = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + i_maxloc->allocate(); + + r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_maxval->allocate(); + + r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_snrValue->allocate(); + + r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_covValue->allocate(); + + // end of new arrays + + if(param->corrSurfaceOverSamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->corrSurfaceOverSamplingFactor, stream); + } + else { + corrOverSampler= new cuOverSamplerR2R( + r_corrBatchZoomIn->height, r_corrBatchZoomIn->width, + r_corrBatchZoomInOverSampled->height, r_corrBatchZoomInOverSampled->width, + param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + stream); + } + if(param->algorithm == 0) { + cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, + stream); + } + + + corrNormalizerOverSampled = + std::unique_ptr(newCuNormalizer( + param->searchWindowSizeHeight, + param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + )); + + +#ifdef CUAMPCOR_DEBUG + std::cout << "all objects in chunk are created ...\n"; +#endif +} + +void cuAmpcorProcessorOnePass::loadReferenceChunk() +{ + + // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw + // then copy to a batch of windows with 
(nImages, height, width) (leading dimension on the right) + + // get the chunk size to be loaded to gpu + int startD = param->referenceChunkStartPixelDown[idxChunk]; //start pixel down (along height) + int startA = param->referenceChunkStartPixelAcross[idxChunk]; // start pixel across (along width) + int height = param->referenceChunkHeight[idxChunk]; // number of pixels along height + int width = param->referenceChunkWidth[idxChunk]; // number of pixels along width + +#ifdef CUAMPCOR_DEBUG + std::cout << "loading reference chunk ...\n " + << " index: " << idxChunk << " " + << "starting pixel: (" << startD << ", " << startA << ") " + << "size : (" << height << ", " << width << ")" + << "\n"; +#endif + + + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_referenceBatchRaw->setZero(stream); + } + else + { + // use cpu to compute the starting positions for each window + getRelativeOffset(ChunkOffsetDown->hostData, param->referenceStartPixelDown, param->referenceChunkStartPixelDown[idxChunk]); + // copy the positions to gpu + ChunkOffsetDown->copyToDevice(stream); + // same for the across direction + getRelativeOffset(ChunkOffsetAcross->hostData, param->referenceStartPixelAcross, param->referenceChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + +#ifdef CUAMPCOR_DEBUG + std::cout << "loading reference windows from chunk debug ... 
\n"; + auto * startPixelDownToChunk = ChunkOffsetDown->hostData; + auto * startPixelAcrossToChunk = ChunkOffsetAcross->hostData; + + for(int i=0; inumberWindowDownInChunk; ++i) { + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; + for(int j=0; jnumberWindowAcrossInChunk; ++j){ + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; + int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + std::cout << "Window index in chuck: (" << iDown << ", " << iAcross << ") \n"; + std::cout << " Staring pixel location from raw: (" << param->referenceStartPixelDown[idxInAll] << ", " + << param->referenceStartPixelAcross[idxInAll] <<")\n"; + std::cout << " Staring pixel location from chunk: (" << startPixelDownToChunk[idxInChunk] << ", " + << startPixelAcrossToChunk[idxInChunk] <<")\n"; + + } + } +#endif + + // check whether the image is complex (e.g., SLC) or real( e.g. 
TIFF) + if(param->referenceImageDataType==2) + { + // allocate a gpu buffer to load data from cpu/file + // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 + c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + c_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); + + //copy the chunk to a batch format (nImages, height, width) + // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + // deallocate the gpu buffer + delete c_referenceChunkRaw; + } + // if the image is real + else { + r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + r_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); + + // copy the chunk (real) to a batch format (complex) + cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // deallocate the gpu buffer + delete r_referenceChunkRaw; + } // end of if complex + } // end of if all pixels out of range +} + +void cuAmpcorProcessorOnePass::loadSecondaryChunk() +{ + // get the chunk size to be loaded to gpu + int height = 
param->secondaryChunkHeight[idxChunk]; // number of pixels along height + int width = param->secondaryChunkWidth[idxChunk]; // number of pixels along width + + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_secondaryBatchRaw->setZero(stream); + } + else + { + //copy to a batch format (nImages, height, width) + getRelativeOffset(ChunkOffsetDown->hostData, param->secondaryStartPixelDown, param->secondaryChunkStartPixelDown[idxChunk]); + ChunkOffsetDown->copyToDevice(stream); + getRelativeOffset(ChunkOffsetAcross->hostData, param->secondaryStartPixelAcross, param->secondaryChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + if(param->secondaryImageDataType==2) + { + c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + c_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + delete c_secondaryChunkRaw; + } + else { //real image + //allocate the gpu buffer + r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + r_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + 
secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + // convert to the batch format + cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + delete r_secondaryChunkRaw; + } + } +} + +// destructor +cuAmpcorProcessorOnePass::~cuAmpcorProcessorOnePass() +{ + corrNormalizerOverSampled.release(); + + if(param->corrSurfaceOverSamplingMethod) { + delete corrSincOverSampler; + } + else { + delete corrOverSampler; + } + if(param->algorithm == 0) { + delete cuCorrFreqDomain_OverSampled; + } + + delete ChunkOffsetDown ; + delete ChunkOffsetAcross ; + delete c_referenceBatchRaw; + delete c_secondaryBatchRaw; + delete c_referenceBatchOverSampled; + delete c_secondaryBatchOverSampled; + delete r_referenceBatchOverSampled; + delete r_secondaryBatchOverSampled; + delete referenceBatchOverSampler; + delete secondaryBatchOverSampler; + + delete r_corrBatch; + delete r_corrBatchZoomIn; + delete r_corrBatchZoomInOverSampled; + delete offsetInit; + delete offsetZoomIn; + delete offsetFinal; + delete corrMaxValue; + + delete r_corrBatchSum; + delete i_maxloc; + delete r_maxval; + delete r_snrValue; + delete r_covValue; + + // end of deletions + +} + +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h new file mode 100644 index 000000000..fb5f1e8b4 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h @@ -0,0 +1,88 @@ +/* + * @file cuAmpcorProcessorOnePass.h + * @brief Ampcor processor for a batch of windows with OnePass workflow + * + * + */ + +#ifndef __CUAMPCORPROCESSOROnePass_H 
+#define __CUAMPCORPROCESSOROnePass_H + +#include "cuAmpcorProcessor.h" + + +/** + * cuAmpcor processor for a chunk (a batch of windows) + */ +class cuAmpcorProcessorOnePass : public cuAmpcorProcessor { + +private: + + // local variables and workers + // gpu buffer to load images from file + // image_complex_type uses original image type, + // convert to complex_type when copied to c_referenceBatchRaw + cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; + cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; + + // offset data + cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; + + // windows raw (not oversampled) data, complex and real + cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw; + // cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; + + // windows oversampled data + cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; + cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; + cuArrays * r_corrBatch, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled; + + // oversampling processors for complex images + cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; + + // oversampling processor for correlation surface + cuOverSamplerR2R *corrOverSampler; + cuSincOverSamplerR2R *corrSincOverSampler; + + // cross-correlation processor with frequency domain algorithm + cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + + // correlation surface normalizer + std::unique_ptr corrNormalizerRaw; + std::unique_ptr corrNormalizerOverSampled; + + // save offset results in different stages + cuArrays *offsetInit; + cuArrays *offsetZoomIn; + cuArrays *offsetFinal; + int2 maxLocShift; // record the maxloc from the extract center + cuArrays *corrMaxValue; + cuArrays *i_maxloc; + cuArrays *r_maxval; + + // SNR estimation + cuArrays *r_corrBatchRawZoomIn; + cuArrays *r_corrBatchSum; + cuArrays *r_snrValue; + + // Variance estimation + cuArrays *r_covValue; + +public: + // constructor + 
cuAmpcorProcessorOnePass(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + // destructor + ~cuAmpcorProcessorOnePass() override; + + // run the given chunk + void run(int, int) override; + + void loadReferenceChunk(); + void loadSecondaryChunk(); +}; + +#endif //__CUAMPCORPROCESSOROnePass_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp similarity index 60% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp index 8f35fa075..b43721ca4 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp @@ -1,12 +1,15 @@ -#include "cuAmpcorChunk.h" +#include "cuAmpcorProcessorTwoPass.h" + #include "cuAmpcorUtil.h" +#include +#include /** * Run ampcor process for a batch of images (a chunk) * @param[in] idxDown_ index of the chunk along Down/Azimuth direction * @param[in] idxAcross_ index of the chunk along Across/Range direction */ -void cuAmpcorChunk::run(int idxDown_, int idxAcross_) +void cuAmpcorProcessorTwoPass::run(int idxDown_, int idxAcross_) { // set chunk index setIndex(idxDown_, idxAcross_); @@ -66,7 +69,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream); // estimate variance - cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_referenceBatchRaw->size, r_covValue, stream); + cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_referenceBatchRaw->size, 1, r_covValue, stream); // estimate SNR // step1: extraction of correlation surface around the peak @@ -171,11 +174,11 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) #endif // oversample the correlation 
surface - if(param->oversamplingMethod) { - // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor + if(param->corrSurfaceOverSamplingMethod) { + // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversampling factor // we need the max loc as the center if shifted corrSincOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled, - maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor + maxLocShift, param->corrSurfaceOverSamplingFactor*param->rawDataOversamplingFactor ); } else { @@ -198,8 +201,8 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel) // = (Init-HalfsearchRange) + ZoomIn/(2*ovs) - cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, - param->oversamplingFactor, param->rawDataOversamplingFactor, + cuSubPixelOffset2Pass(offsetInit, offsetZoomIn, offsetFinal, + param->corrSurfaceOverSamplingFactor, param->rawDataOversamplingFactor, param->halfSearchRangeDownRaw, param->halfSearchRangeAcrossRaw, stream); @@ -209,54 +212,13 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); // Variance. cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); - // Cross-correlation peak - cuArraysCopyInsert(corrMaxValue, corrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // peak value. 
+ cuArraysCopyInsert(r_maxval, peakValueImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); // all done } -/// set chunk index -void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) -{ - idxChunkDown = idxDown_; - idxChunkAcross = idxAcross_; - idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; - - if(idxChunkDown == param->numberChunkDown -1) { - nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); - } - else { - nWindowsDown = param->numberWindowDownInChunk; - } - - if(idxChunkAcross == param->numberChunkAcross -1) { - nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); - } - else { - nWindowsAcross = param->numberWindowAcrossInChunk; - } -} - -/// obtain the starting pixels for each chip -/// @param[in] oStartPixel start pixel locations for all chips -/// @param[out] rstartPixel start pixel locations for chips within the chunk -void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) -{ - for(int i=0; inumberWindowDownInChunk; ++i) { - int iDown = i; - if(i>=nWindowsDown) iDown = nWindowsDown-1; - for(int j=0; jnumberWindowAcrossInChunk; ++j){ - int iAcross = j; - if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; - int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; - int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross - + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; - rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; - } - } -} - -void cuAmpcorChunk::loadReferenceChunk() +void cuAmpcorProcessorTwoPass::loadReferenceChunk() { // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw @@ -268,114 +230,139 @@ void cuAmpcorChunk::loadReferenceChunk() int height = param->referenceChunkHeight[idxChunk]; // number of pixels along height int width = 
param->referenceChunkWidth[idxChunk]; // number of pixels along width - //use cpu to compute the starting positions for each window - getRelativeOffset(ChunkOffsetDown->hostData, ¶m->referenceStartPixelDown[0], param->referenceChunkStartPixelDown[idxChunk]); - // copy the positions to gpu - ChunkOffsetDown->copyToDevice(stream); - // same for the across direction - getRelativeOffset(ChunkOffsetAcross->hostData, ¶m->referenceStartPixelAcross[0], param->referenceChunkStartPixelAcross[idxChunk]); - ChunkOffsetAcross->copyToDevice(stream); - - // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) - if(referenceImage->isComplex()) + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_referenceBatchRaw->setZero(stream); + } + else { - // allocate a gpu buffer to load data from cpu/file - // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 - c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); - c_referenceChunkRaw->allocate(); - - // load the data from cpu - referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); - - //copy the chunk to a batch format (nImages, height, width) - // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data - if(param->derampMethod == 0) { - cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // use cpu to compute the starting positions for each window + getRelativeOffset(ChunkOffsetDown->hostData, param->referenceStartPixelDown, param->referenceChunkStartPixelDown[idxChunk]); + // copy the positions to gpu + ChunkOffsetDown->copyToDevice(stream); + // same for the across direction + getRelativeOffset(ChunkOffsetAcross->hostData, param->referenceStartPixelAcross, 
param->referenceChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) + if(param->referenceImageDataType==2) + { + // allocate a gpu buffer to load data from cpu/file + // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 + + c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + c_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); + + //copy the chunk to a batch format (nImages, height, width) + // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + // deallocate the gpu buffer + delete c_referenceChunkRaw; } + // if the image is real else { - cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - } - // deallocate the gpu buffer - delete c_referenceChunkRaw; - } - // if the image is real - else { - r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); - r_referenceChunkRaw->allocate(); - - // load the data from cpu - referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); - - // copy the chunk (real) to a batch format (complex) - 
cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - // deallocate the gpu buffer - delete r_referenceChunkRaw; - } - - + r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + r_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); + + // copy the chunk (real) to a batch format (complex) + cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // deallocate the gpu buffer + delete r_referenceChunkRaw; + } // end of if complex + } // end of if all pixels out of range } -void cuAmpcorChunk::loadSecondaryChunk() +void cuAmpcorProcessorTwoPass::loadSecondaryChunk() { + // get the chunk size to be loaded to gpu + int height = param->secondaryChunkHeight[idxChunk]; // number of pixels along height + int width = param->secondaryChunkWidth[idxChunk]; // number of pixels along width - //copy to a batch format (nImages, height, width) - getRelativeOffset(ChunkOffsetDown->hostData, ¶m->secondaryStartPixelDown[0], param->secondaryChunkStartPixelDown[idxChunk]); - ChunkOffsetDown->copyToDevice(stream); - getRelativeOffset(ChunkOffsetAcross->hostData, ¶m->secondaryStartPixelAcross[0], param->secondaryChunkStartPixelAcross[idxChunk]); - ChunkOffsetAcross->copyToDevice(stream); - - if(secondaryImage->isComplex()) + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) { - c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); - c_secondaryChunkRaw->allocate(); - - //load a chunk from mmap to gpu - 
secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, - param->secondaryChunkStartPixelDown[idxChunk], - param->secondaryChunkStartPixelAcross[idxChunk], - param->secondaryChunkHeight[idxChunk], - param->secondaryChunkWidth[idxChunk], - stream); - - if(param->derampMethod == 0) { - cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], - c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // yes, simply set the image to 0 + c_secondaryBatchRaw->setZero(stream); + } + else + { + //copy to a batch format (nImages, height, width) + getRelativeOffset(ChunkOffsetDown->hostData, param->secondaryStartPixelDown, param->secondaryChunkStartPixelDown[idxChunk]); + ChunkOffsetDown->copyToDevice(stream); + getRelativeOffset(ChunkOffsetAcross->hostData, param->secondaryStartPixelAcross, param->secondaryChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + if(param->secondaryImageDataType==2) + { + c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + c_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + delete c_secondaryChunkRaw; } - else { - 
cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], + else { //real image + //allocate the gpu buffer + r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + r_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + // convert to the batch format + cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + delete r_secondaryChunkRaw; } - delete c_secondaryChunkRaw; - } - else { //real image - //allocate the gpu buffer - r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); - r_secondaryChunkRaw->allocate(); - - //load a chunk from mmap to gpu - secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, - param->secondaryChunkStartPixelDown[idxChunk], - param->secondaryChunkStartPixelAcross[idxChunk], - param->secondaryChunkHeight[idxChunk], - param->secondaryChunkWidth[idxChunk], - stream); - - // convert to the batch format - cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], - c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - delete r_secondaryChunkRaw; } } /// constructor -cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, - cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, - cuArrays *corrImage_, +cuAmpcorProcessorTwoPass::cuAmpcorProcessorTwoPass(cuAmpcorParameter *param_, SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays 
*covImage_, cuArrays *peakValueImage_, cudaStream_t stream_) - + : cuAmpcorProcessor(param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_) { param = param_; referenceImage = reference_; @@ -383,7 +370,6 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G offsetImage = offsetImage_; snrImage = snrImage_; covImage = covImage_; - corrImage = corrImage_; stream = stream_; @@ -394,47 +380,47 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G ChunkOffsetAcross->allocate(); ChunkOffsetAcross->allocateHost(); - c_referenceBatchRaw = new cuArrays ( + c_referenceBatchRaw = new cuArrays ( param->windowSizeHeightRaw, param->windowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_referenceBatchRaw->allocate(); - c_secondaryBatchRaw = new cuArrays ( + c_secondaryBatchRaw = new cuArrays ( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_secondaryBatchRaw->allocate(); - r_referenceBatchRaw = new cuArrays ( + r_referenceBatchRaw = new cuArrays ( param->windowSizeHeightRaw, param->windowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_referenceBatchRaw->allocate(); - r_secondaryBatchRaw = new cuArrays ( + r_secondaryBatchRaw = new cuArrays ( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_secondaryBatchRaw->allocate(); - c_secondaryBatchZoomIn = new cuArrays ( + c_secondaryBatchZoomIn = new cuArrays ( param->searchWindowSizeHeightRawZoomIn, param->searchWindowSizeWidthRawZoomIn, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_secondaryBatchZoomIn->allocate(); - c_referenceBatchOverSampled = new cuArrays ( + c_referenceBatchOverSampled = new cuArrays ( param->windowSizeHeight, param->windowSizeWidth, 
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_referenceBatchOverSampled->allocate(); - c_secondaryBatchOverSampled = new cuArrays ( + c_secondaryBatchOverSampled = new cuArrays ( param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_secondaryBatchOverSampled->allocate(); - r_referenceBatchOverSampled = new cuArrays ( + r_referenceBatchOverSampled = new cuArrays ( param->windowSizeHeight, param->windowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_referenceBatchOverSampled->allocate(); - r_secondaryBatchOverSampled = new cuArrays ( + r_secondaryBatchOverSampled = new cuArrays ( param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_secondaryBatchOverSampled->allocate(); @@ -447,21 +433,21 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G secondaryBatchOverSampler = new cuOverSamplerC2C(c_secondaryBatchZoomIn->height, c_secondaryBatchZoomIn->width, c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream); - r_corrBatchRaw = new cuArrays ( + r_corrBatchRaw = new cuArrays ( param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchRaw->allocate(); - r_corrBatchZoomIn = new cuArrays ( + r_corrBatchZoomIn = new cuArrays ( param->searchWindowSizeHeight - param->windowSizeHeight+1, param->searchWindowSizeWidth - param->windowSizeWidth+1, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchZoomIn->allocate(); - r_corrBatchZoomInAdjust = new cuArrays ( + r_corrBatchZoomInAdjust = new cuArrays ( param->searchWindowSizeHeight - param->windowSizeHeight, param->searchWindowSizeWidth - param->windowSizeWidth, 
param->numberWindowDownInChunk, @@ -469,9 +455,9 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G r_corrBatchZoomInAdjust->allocate(); - r_corrBatchZoomInOverSampled = new cuArrays ( - param->zoomWindowSize * param->oversamplingFactor, - param->zoomWindowSize * param->oversamplingFactor, + r_corrBatchZoomInOverSampled = new cuArrays ( + param->zoomWindowSize * param->corrSurfaceOverSamplingFactor, + param->zoomWindowSize * param->corrSurfaceOverSamplingFactor, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchZoomInOverSampled->allocate(); @@ -482,18 +468,18 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); offsetZoomIn->allocate(); - offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); offsetFinal->allocate(); maxLocShift = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); maxLocShift->allocate(); - corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); corrMaxValue->allocate(); // new arrays due to snr estimation - r_corrBatchRawZoomIn = new cuArrays ( + r_corrBatchRawZoomIn = new cuArrays ( param->corrRawZoomInHeight, param->corrRawZoomInWidth, param->numberWindowDownInChunk, @@ -508,7 +494,7 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G i_corrBatchZoomInValid->allocate(); - r_corrBatchSum = new cuArrays ( + r_corrBatchSum = new cuArrays ( param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchSum->allocate(); @@ -522,27 +508,27 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, 
G i_maxloc->allocate(); - r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_maxval->allocate(); - r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_snrValue->allocate(); - r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_covValue->allocate(); // end of new arrays - if(param->oversamplingMethod) { - corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); + if(param->corrSurfaceOverSamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->corrSurfaceOverSamplingFactor, stream); } else { corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize, - (param->zoomWindowSize)*param->oversamplingFactor, - (param->zoomWindowSize)*param->oversamplingFactor, + (param->zoomWindowSize)*param->corrSurfaceOverSamplingFactor, + (param->zoomWindowSize)*param->corrSurfaceOverSamplingFactor, param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, stream); } @@ -557,17 +543,18 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G stream); } - corrNormalizerRaw = new cuNormalizer( + corrNormalizerRaw = std::unique_ptr(newCuNormalizer( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - ); + )); - corrNormalizerOverSampled = new cuNormalizer( + corrNormalizerOverSampled = + std::unique_ptr(newCuNormalizer( param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - ); + )); #ifdef CUAMPCOR_DEBUG @@ -576,8 +563,57 @@ 
cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G } // destructor -cuAmpcorChunk::~cuAmpcorChunk() +cuAmpcorProcessorTwoPass::~cuAmpcorProcessorTwoPass() { + corrNormalizerOverSampled.release(); + corrNormalizerRaw.release(); + + if(param->corrSurfaceOverSamplingMethod) { + delete corrSincOverSampler; + } + else { + delete corrOverSampler; + } + if(param->algorithm == 0) { + delete cuCorrFreqDomain; + delete cuCorrFreqDomain_OverSampled; + } + + delete ChunkOffsetDown ; + delete ChunkOffsetAcross ; + delete c_referenceBatchRaw; + delete c_secondaryBatchRaw; + delete r_referenceBatchRaw; + delete r_secondaryBatchRaw; + delete c_secondaryBatchZoomIn; + delete c_referenceBatchOverSampled; + delete c_secondaryBatchOverSampled; + delete r_referenceBatchOverSampled; + delete r_secondaryBatchOverSampled; + delete referenceBatchOverSampler; + delete secondaryBatchOverSampler; + + delete r_corrBatchRaw; + delete r_corrBatchZoomIn; + delete r_corrBatchZoomInAdjust; + delete r_corrBatchZoomInOverSampled; + delete offsetInit; + delete offsetZoomIn; + delete offsetFinal; + delete maxLocShift; + delete corrMaxValue; + + delete r_corrBatchRawZoomIn; + delete i_corrBatchZoomInValid; + delete r_corrBatchSum; + delete i_corrBatchValidCount; + delete i_maxloc; + delete r_maxval; + delete r_snrValue; + delete r_covValue; + + // end of deletions + } // end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h new file mode 100644 index 000000000..de2a29d80 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h @@ -0,0 +1,88 @@ +/* + * @file cuAmpcorProcessorTwoPass.h + * @brief Ampcor processor for a batch of windows with TwoPass workflow + * + * + */ + +#ifndef __CUAMPCORPROCESSORTwoPass_H +#define __CUAMPCORPROCESSORTwoPass_H + +#include "cuAmpcorProcessor.h" + + +/** + * cuAmpcor processor for a chunk (a batch of 
windows) + */ +class cuAmpcorProcessorTwoPass : public cuAmpcorProcessor{ +private: + + // local variables and workers + // gpu buffer to load images from file + cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; + cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; + + // windows raw (not oversampled) data, complex and real + cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn; + cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; + + // windows oversampled data + cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; + cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; + cuArrays * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust; + + // offset data + cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; + + // oversampling processors for complex images + cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; + + // oversampling processor for correlation surface + cuOverSamplerR2R *corrOverSampler; + cuSincOverSamplerR2R *corrSincOverSampler; + + // cross-correlation processor with frequency domain algorithm + cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + + // correlation surface normalizer + std::unique_ptr corrNormalizerRaw; + std::unique_ptr corrNormalizerOverSampled; + + // save offset results in different stages + cuArrays *offsetInit; + cuArrays *offsetZoomIn; + cuArrays *offsetFinal; + cuArrays *maxLocShift; // record the maxloc from the extract center + cuArrays *corrMaxValue; + cuArrays *i_maxloc; + cuArrays *r_maxval; + + // SNR estimation + cuArrays *r_corrBatchRawZoomIn; + cuArrays *r_corrBatchSum; + cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; + cuArrays *r_snrValue; + + // Variance estimation + cuArrays *r_covValue; + +public: + // constructor + cuAmpcorProcessorTwoPass(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays 
*snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + // destructor + ~cuAmpcorProcessorTwoPass() override; + + // local methods + void loadReferenceChunk(); + void loadSecondaryChunk(); + // run the given chunk + void run(int, int) override; +}; + + + +#endif //__CUAMPCORPROCESSORTwoPass_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h index d56a01c67..111ab79f2 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h @@ -9,6 +9,7 @@ #ifndef __CUAMPCORUTIL_H #define __CUAMPCORUTIL_H +#include "data_types.h" #include "cuArrays.h" #include "cuAmpcorParameter.h" #include "cudaError.h" @@ -18,14 +19,14 @@ //in cuArraysCopy.cu: various utilities for copy images file in gpu memory -void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); -void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); +void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); +void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); +void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); 
+void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); +void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); // same routine name overloaded for different data type // extract data from a large image @@ -33,67 +34,76 @@ template void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offset, cudaStream_t); template void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t); +void cuArraysCopyExtractAbs(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); template void cuArraysCopyInsert(cuArrays *in, cuArrays *out, int offsetX, int offsetY, cudaStream_t); template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); -void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream); +void cuArraysSetConstant(cuArrays *imageIn, real_type value, cudaStream_t stream); -void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream); // cuDeramp.cu: deramping phase -void cuDeramp(int method, cuArrays *images, cudaStream_t stream); -void cuDerampMethod1(cuArrays *images, cudaStream_t stream); +void cuDeramp(int method, cuArrays *images, cudaStream_t stream); +void cuDerampMethod1(cuArrays *images, cudaStream_t stream); // cuArraysPadding.cu: various utilities for oversampling padding -void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysPaddingMany(cuArrays *image1, 
cuArrays *image2, cudaStream_t stream); +void cuArraysFFTPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); //in cuCorrNormalization.cu: utilities to normalize the cross correlation function -void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); -void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); +void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); + +// in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); template -void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); // in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table -void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays 
*secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); //in cuOffset.cu: utitilies for determining the max location of cross correlations or the offset -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream); -void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, - int OverSampleRatioZoomin, int OverSampleRatioRaw, - int xHalfRangeInit, int yHalfRangeInit, - cudaStream_t stream); - -void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, - int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); +void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); +void cuArraysMaxloc2D(cuArrays *images, const int2 start, const int2 range, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); +void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetRoomIn, cuArrays *offsetFinal, const int2 initOrigin, const int initFactor, const int2 zoomInOrigin, const int zoomInFactor, cudaStream_t stream); +void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream); +void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); //in cuCorrTimeDomain.cu: cross correlation in time domain -void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); //in cuCorrFrequency.cu: cross correlation in freq domain, also include fft correlatior class -void cuArraysElementMultiply(cuArrays *image1, cuArrays 
*image2, cudaStream_t stream); -void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, float coef, cudaStream_t stream); +void cuArraysElementMultiply(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, real_type coef, cudaStream_t stream); // For SNR estimation on Correlation surface (Minyan Zhong) // implemented in cuArraysCopy.cu -void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream); +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *maxloc, cudaStream_t stream); +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream); // implemented in cuCorrNormalization.cu -void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream); +void cuArraysSumSquare(cuArrays *images, cuArrays *imagesSum, cudaStream_t stream); +void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream); + // implemented in cuEstimateStats.cu -void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream); +void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cuArrays *snrValue, const int size, cudaStream_t stream); +void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream); // implemented in cuEstimateStats.cu -void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream); +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, const int templateSize, const int distance, cuArrays *covValue, cudaStream_t stream); #endif diff --git 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp similarity index 60% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp index f7cc88837..bb08c3d56 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp @@ -7,6 +7,11 @@ // dependencies #include "cuArrays.h" #include "cudaError.h" +#include "float2.h" +#include "data_types.h" +#include +#include +#include // allocate arrays in device memory template @@ -62,6 +67,34 @@ void cuArrays::setZero(cudaStream_t stream) checkCudaErrors(cudaMemsetAsync(devData, 0, getByteSize(), stream)); } + +// Overloaded << operator for composite type +std::ostream& operator<<(std::ostream& os, const float2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + +std::ostream& operator<<(std::ostream& os, const float3& p) { + + return os << "(" << p.x << ", " << p.y << ", " << p.z << ")"; +} + +std::ostream& operator<<(std::ostream& os, const double2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + +std::ostream& operator<<(std::ostream& os, const double3& p) { + + return os << "(" << p.x << ", " << p.y << ", " << p.z << ")"; +} + +std::ostream& operator<<(std::ostream& os, const int2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + + // output (partial) data when debugging template void cuArrays::debuginfo(cudaStream_t stream) { @@ -87,63 +120,6 @@ void cuArrays::debuginfo(cudaStream_t stream) { } } -// need specializations for x,y components -template<> -void cuArrays::debuginfo(cudaStream_t stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i -void cuArrays::debuginfo(cudaStream_t 
stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i -void cuArrays::debuginfo(cudaStream_t stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i @@ -163,12 +139,16 @@ void cuArrays::outputHostToFile(std::string fn) file.open(fn.c_str(), std::ios_base::binary); file.write((char *)hostData, getByteSize()); file.close(); + std::cout << fn << "size: height " << height << " width "<< width << " count " << count << "\n"; } // instantiations, required by python extensions template class cuArrays; template class cuArrays; template class cuArrays; +template class cuArrays; +template class cuArrays; +template class cuArrays; template class cuArrays; template class cuArrays; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h index b823a6b9d..5fb7878dc 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h @@ -12,14 +12,9 @@ #define __CUARRAYS_H // cuda dependencies -#include #include - #include -#include -#include -#include - +#include template class cuArrays{ @@ -109,5 +104,11 @@ class cuArrays{ }; +std::ostream& operator<<(std::ostream& os, const float2& p); +std::ostream& operator<<(std::ostream& os, const float3& p); +std::ostream& operator<<(std::ostream& os, const double2& p); +std::ostream& operator<<(std::ostream& os, const float3& p); +std::ostream& operator<<(std::ostream& os, const int2& p); + #endif //__CUARRAYS_H //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu index d9e36461b..cb86fd3c6 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu @@ -25,10 +25,11 @@ #include "cudaUtil.h" #include "cudaError.h" #include "float2.h" +#include "data_types.h" // cuda kernel for cuArraysCopyToBatch -__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, - float2 *imageOut, const int outNX, const int outNY, +__global__ void cuArraysCopyToBatch_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImagesX, const int nImagesY, const int strideX, const int strideY) { @@ -40,7 +41,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX int idxImageX = idxImage/nImagesY; int idxImageY = idxImage%nImagesY; int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy; - imageOut[idxOut] = imageIn[idxIn]; + imageOut[idxOut] = make_complex_type(imageIn[idxIn].x, imageIn[idxIn].y); } /** @@ -52,7 +53,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX * @param strideW stride along width to extract chips * @param stream cudaStream */ -void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, +void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream) { const int nthreads = NTHREADS2D; @@ -68,54 +69,113 @@ void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, // kernel for cuArraysCopyToBatchWithOffset template -__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNY, +__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNX, const int inNY, T_out *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { + // get image index int idxImage = 
blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = T_out{imageIn[idxIn]}; + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny +__global__ void cuArraysCopyToBatchWithOffset_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImages, + const int *offsetX, const int *offsetY) +{ + // get image index + int idxImage = blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const 
int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); cuArraysCopyToBatchWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); } // same as above, but from complex to real(take amplitudes) -__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, +__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { + + // get image index int idxImage = blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0); + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, 
cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); cuArraysCopyToBatchAbsWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); @@ -151,24 +211,25 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, * @param strideW (varying) offsets along width to extract chips * @param stream cudaStream */ -void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + cuArraysCopyToBatchWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel"); } //copy a chunk into a series of chips, from complex to real -__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, +__global__ void cuArraysCopyC2R_kernel(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImagesX, const int nImagesY, - const int strideX, const int strideY, const float factor) + const int strideX, const int strideY, const real_type factor) { int idxImage = blockIdx.z; int outx = threadIdx.x + blockDim.x*blockIdx.x; @@ -190,13 +251,13 @@ __global__ void 
cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, co * @param strideW offsets along width to extract chips * @param stream cudaStream */ -void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, +void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - float factor = 1.0f/image1->size; //the FFT factor + real_type factor = 1.0f/image1->size; //the FFT factor cuArraysCopyC2R_kernel<<>> ( image1->devData, image1->height, image1->width, image2->devData, image2->height, image2->width, @@ -242,12 +303,59 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays } // instantiate the above template for the data types we need -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); // correlation surface extraction (Minyan Zhong) -__global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, +__global__ void cuArraysCopyExtractVaryingOffsetCorr(const real_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, + const int2 *maxloc) +{ + + // get the image index + int idxImage = blockIdx.z; + + // One thread per out point. Find the coordinates within the current image. 
+ int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + + // check whether thread is within output image range + if (outx < outNX && outy < outNY) + { + // Find the corresponding input. + int inx = outx + maxloc[idxImage].x - outNX/2; + int iny = outy + maxloc[idxImage].y - outNY/2; + + // Find the location in flattened array. + int idxOut = (idxImage * outNX + outx) * outNY + outy; + int idxIn = (idxImage * inNX + inx) * inNY + iny; + imageOut[idxOut] = imageIn[idxIn]; + } +} + +/** + * copy a tile of images to another image, with starting pixels offsets accouting for boundary + * @param[in] imageIn inut images + * @param[out] imageOut output images of dimension nImages*outNX*outNY + */ +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *maxloc, cudaStream_t stream) +{ + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = 16; + + dim3 threadsperblock(nthreads, nthreads,1); + + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + + cuArraysCopyExtractVaryingOffsetCorr<<>>( + imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, + imagesOut->count, maxloc->devData); + getLastCudaError("cuArraysCopyExtract error"); +} + +__global__ void cuArraysCopyExtractVaryingOffsetCorr(const real_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, const int2 *maxloc) { @@ -289,7 +397,7 @@ __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const * @param[in] imageIn input images * @param[out] imageOut output images of dimension nImages*outNX*outNY */ -void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream) +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, 
cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream) { //assert(imagesIn->height >= imagesOut && inNY >= outNY); const int nthreads = 16; @@ -303,6 +411,7 @@ void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesO getLastCudaError("cuArraysCopyExtract error"); } + // end of correlation surface extraction (Minyan Zhong) @@ -323,8 +432,8 @@ __global__ void cuArraysCopyExtractFixedOffset(const T *imageIn, const int inNX, } } -__global__ void cuArraysCopyExtractFixedOffset(const float2 *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, const int nImages, +__global__ void cuArraysCopyExtractFixedOffset(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { int outx = threadIdx.x + blockDim.x*blockIdx.x; @@ -356,10 +465,43 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, i } // instantiate the above template for the data types we need -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); + + +__global__ void cuArraysCopyExtractAbsFixedOffset(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, + const int offsetX, 
const int offsetY) +{ + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + + if(outx < outNX && outy < outNY) + { + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + imageOut[idxOut] = complexAbs(imageIn[idxIn]); + } +} +/** + * copy/extract images from a large size to + * a smaller size from the location (offsetX, offsetY), take amplitude + */ +void cuArraysCopyExtractAbs(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) +{ + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = NTHREADS2D; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + cuArraysCopyExtractAbsFixedOffset<<>> + (imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); + getLastCudaError("cuArraysCopyExtractAbs error"); +} + + template __global__ void cuArraysCopyInsert_kernel(const T* imageIn, const int inNX, const int inNY, @@ -389,9 +531,9 @@ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX } // instantiate the above template for the data types we need -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int 
offX, int offY, cudaStream_t); template @@ -430,12 +572,12 @@ void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cuda } // instantiate the above template for the data types we need -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); // cuda kernel for setting a constant value -__global__ void cuArraysSetConstant_kernel(float *image, int size, float value) +__global__ void cuArraysSetConstant_kernel(real_type *image, int size, real_type value) { int idx = threadIdx.x + blockDim.x*blockIdx.x; @@ -449,7 +591,7 @@ __global__ void cuArraysSetConstant_kernel(float *image, int size, float value) * Set real images to a constant value * @note use setZero if value=0 because cudaMemset is faster */ -void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream) +void cuArraysSetConstant(cuArrays *imageIn, real_type value, cudaStream_t stream) { const int nthreads = 256; int size = imageIn->getSize(); @@ -460,14 +602,14 @@ void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t str } -// convert float to float2(complex) -__global__ void cuArraysR2C_kernel(float *image1, float2 *image2, int size) +// convert real_type to complex_type(complex) +__global__ void cuArraysR2C_kernel(real_type *image1, complex_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) { image2[idx].x = image1[idx]; - image2[idx].y = 0.0f; + image2[idx].y = 0.0; } } @@ -476,7 +618,7 @@ __global__ void cuArraysR2C_kernel(float *image1, float2 *image2, int size) * 
@param[in] image1 input images * @param[out] image2 output images */ -void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysR2C_kernel<<>>(image1->devData, image2->devData, size); @@ -484,8 +626,8 @@ void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t } -// take real part of float2 to float -__global__ void cuArraysC2R_kernel(float2 *image1, float *image2, int size) +// take real part of complex_type to real_type +__global__ void cuArraysC2R_kernel(complex_type *image1, real_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) @@ -499,7 +641,7 @@ __global__ void cuArraysC2R_kernel(float2 *image1, float *image2, int size) * @param[in] image1 input images * @param[out] image2 output images */ -void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysC2R_kernel<<>>(image1->devData, image2->devData, size); @@ -507,7 +649,7 @@ void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t } // cuda kernel for cuArraysAbs -__global__ void cuArraysAbs_kernel(float2 *image1, float *image2, int size) +__global__ void cuArraysAbs_kernel(complex_type *image1, real_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) @@ -521,7 +663,7 @@ __global__ void cuArraysAbs_kernel(float2 *image1, float *image2, int size) * @param[in] image1 input images * @param[out] image2 output images */ -void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysAbs_kernel<<>>(image1->devData, image2->devData, size); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu index af3927056..c3feacf30 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu @@ -1,83 +1,121 @@ /* * @file cuArraysPadding.cu - * @brief Utilities for padding zeros to cuArrays + * @brief Utilities for padding zeros to cuArrays for FFT (zero padding in the middle) */ #include "cuAmpcorUtil.h" #include "float2.h" -// cuda kernel for cuArraysPadding -__global__ void cuArraysPadding_kernel( - const float2 *image1, const int height1, const int width1, - float2 *image2, const int height2, const int width2) -{ - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ty = threadIdx.y + blockDim.y*blockIdx.y; - if(tx < height1/2 && ty < width1/2) - { - int tx1 = height1 - 1 - tx; - int ty1 = width1 -1 -ty; - int tx2 = height2 -1 -tx; - int ty2 = width2 -1 -ty; - - image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)]; - image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)]; - image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)]; - image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)]; - - } -} - -/** - * Padding zeros in the middle, move quads to corners - * @param[in] image1 input images - * @param[out] image2 output images - * @note This routine is for a single image, no longer used - */ -void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream) -{ - int ThreadsPerBlock = NTHREADS2D; - int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); - int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); - dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock); - dim3 dimGrid(BlockPerGridx, BlockPerGridy); - // set output image to 0 - checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); - // copy the quads of input images to four corners of the output images - cuArraysPadding_kernel<<>>( - image1->devData, image1->height, image1->width, 
- image2->devData, image2->height, image2->width); - getLastCudaError("cuArraysPadding_kernel"); -} -inline __device__ float2 cmplxMul(float2 c, float a) +// cuda kernel for zero padding for FFT oversampling, +// for both even and odd sequences +// @param[in] image1 input images +// @param[inout] image2 output images - memset to 0 in prior +// @note siding Nyquist frequency for even length with negative frequency +// for even N - positive f[0, ..., N/2-1], +// zeros 0...0, +// negative f[N/2], f[N/2+1, ..., N-1] +// for odd N - positive f[0, ..., (N-1)/2], +// zeros 0...0, +// negative f [(N+1)/2, ..., N-1] +__global__ void cuArraysPaddingMany_kernel( + const complex_type *image1, const int height1, const int width1, const int size1, + complex_type *image2, const int height2, const int width2, const int size2, const real_type factor ) { - return make_float2(c.x*a, c.y*a); + // thread indices are for input image1 + int x1 = threadIdx.x + blockDim.x*blockIdx.x; + int y1 = threadIdx.y + blockDim.y*blockIdx.y; + int imageIdx = blockIdx.z; + + // ensure threads are in the range of image1 + if (x1 >= height1 || y1 >= width1) + return; + + // determine the quadrants + // divup the length to be consistent with both even and odd lengths + int x2 = (x1 < (height1+1)/2) ? x1 : height2 - height1 + x1; + int y2 = (y1 < (width1+1)/2) ? 
y1 : width2 - width1 + y1; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] + = image1[IDX2R(x1, y1, width1)+imageIdx*size1]*factor; + return; } -// cuda kernel for -__global__ void cuArraysPaddingMany_kernel( - const float2 *image1, const int height1, const int width1, const int size1, - float2 *image2, const int height2, const int width2, const int size2, const float factor ) +// cuda kernel for zero padding for FFT oversampling, +// for both even and odd sequences +// @param[in] image1 input images +// @param[inout] image2 output images - memset to 0 in prior +// @note for even length (height, width or both), +// the Nyquist frequency (N/2) component is split between positive and negative frequencies. +__global__ void cuArraysPaddingMany_split_Nyquist_kernel( + const complex_type *image1, const int height1, const int width1, const int size1, + complex_type *image2, const int height2, const int width2, const int size2, const real_type factor ) { - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ty = threadIdx.y + blockDim.y*blockIdx.y; - if(tx < height1/2 && ty < width1/2) + // thread indices are for input image1 + int x1 = threadIdx.x + blockDim.x*blockIdx.x; + int y1 = threadIdx.y + blockDim.y*blockIdx.y; + int imageIdx = blockIdx.z; + + // ensure threads are in the range of image1 + if (x1 >= height1 || y1 >= width1) + return; + + int xcenter1 = (height1+1)/2; // divup + int ycenter1 = (width1+1)/2; + + // for even N - positive f[0, ..., N/2-1], f[N/2]/2, + // zeros 0...0, + // negative f[N/2]/2, f[N/2+1, ..., N-1] + // f[N/2] is split to N/2 and M-N/2 + // for odd N - positive f[0, ..., (N-1)/2], + // zeros 0...0, + // negative f [(N+1)/2, ..., N-1] + + // split spectrum for even height1 at the center + if (height1 % 2 ==0 && x1 == xcenter1) { - - int tx1 = height1 - 1 - tx; - int ty1 = width1 -1 -ty; - int tx2 = height2 -1 -tx; - int ty2 = width2 -1 -ty; - - int stride1 = blockIdx.z*size1; - int stride2 = blockIdx.z*size2; - - image2[IDX2R(tx, ty, 
width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor; - image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor); - image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor); - image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor); + // if also for the width + if (width1 % 2 ==0 && y1 == ycenter1) + { + // split into 4 + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.25f*factor; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + int x2 = x1 + height2 - height1; + int y2 = y1 + width2 - width1; + image2[IDX2R(x2, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x1, y2, width2)+imageIdx*size2] = input; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] = input; + // all done for this pixel, so return + return; + } + // odd width or not at the center + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.5f*factor; + int x2 = x1 + height2 - height1; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x2, y1, width2)+imageIdx*size2] = input; + return; } + // split spectrum for even width1 + if (width1 % 2 ==0 && y1 == ycenter1) + { + // even height even and at the center has been considered earlier + // so just odd height or not at the center + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.5f*factor; + int y2 = y1 + width2 - width1; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x1, y2, width2)+imageIdx*size2] = input; + return; + } + + // all other cases, simply copy + // determine the quadrants + int x2 = (x1 < xcenter1) ? x1 : height2 - height1 + x1; + int y2 = (y1 < ycenter1) ? 
y1 : width2 - width1 + y1; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] + = image1[IDX2R(x1, y1, width1)+imageIdx*size1]*factor; + return; } /** @@ -86,16 +124,17 @@ __global__ void cuArraysPaddingMany_kernel( * @param[out] image2 output images * @note To keep the band center at (0,0), move quads to corners and pad zeros in the middle */ -void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysFFTPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int ThreadsPerBlock = NTHREADS2D; - int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); - int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); + // up to IDIVUP(dim, 2) for odd-length sequences + int BlockPerGridx = IDIVUP (image1->height, ThreadsPerBlock); + int BlockPerGridy = IDIVUP (image1->width, ThreadsPerBlock); dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1); dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count); checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); - float factor = 1.0f/image1->size; + real_type factor = 1.0f/image1->size; cuArraysPaddingMany_kernel<<>>( image1->devData, image1->height, image1->width, image1->size, image2->devData, image2->height, image2->width, image2->size, factor); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu index 79c11fa2a..215caa723 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu @@ -21,6 +21,16 @@ cuFreqCorrelator::cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaSt int n[NRANK] ={imageNX, imageNY}; // set up fft plans +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, + NULL, 1, imageSize, + NULL, 1, fImageSize, + CUFFT_D2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, n, + NULL, 1, fImageSize, + NULL, 1, imageSize, + CUFFT_Z2D, 
nImages)); +#else cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, @@ -29,16 +39,17 @@ cuFreqCorrelator::cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaSt NULL, 1, fImageSize, NULL, 1, imageSize, CUFFT_C2R, nImages)); +#endif stream = stream_; cufftSetStream(forwardPlan, stream); cufftSetStream(backwardPlan, stream); // set up work arrays - workFM = new cuArrays(imageNX, (imageNY/2+1), nImages); + workFM = new cuArrays(imageNX, (imageNY/2+1), nImages); workFM->allocate(); - workFS = new cuArrays(imageNX, (imageNY/2+1), nImages); + workFS = new cuArrays(imageNX, (imageNY/2+1), nImages); workFS->allocate(); - workT = new cuArrays (imageNX, imageNY, nImages); + workT = new cuArrays (imageNX, imageNY, nImages); workT->allocate(); } @@ -60,38 +71,46 @@ cuFreqCorrelator::~cuFreqCorrelator() * @param[out] results the correlation surfaces */ -void cuFreqCorrelator::execute(cuArrays *templates, cuArrays *images, cuArrays *results) +void cuFreqCorrelator::execute(cuArrays *templates, cuArrays *images, cuArrays *results) { // pad the reference windows to the the size of search windows cuArraysCopyPadded(templates, workT, stream); // forward fft to frequency domain +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecD2Z(forwardPlan, workT->devData, workFM->devData)); + cufft_Error(cufftExecD2Z(forwardPlan, images->devData, workFS->devData)); +#else cufft_Error(cufftExecR2C(forwardPlan, workT->devData, workFM->devData)); cufft_Error(cufftExecR2C(forwardPlan, images->devData, workFS->devData)); +#endif + // cufft doesn't normalize, so manually get the image size for normalization - float coef = 1.0/(images->size); + real_type coef = 1.0/(images->size); // multiply reference with secondary windows in frequency domain cuArraysElementMultiplyConjugate(workFM, workFS, coef, stream); // backward fft to get correlation surface in time domain +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2D(backwardPlan, workFM->devData, 
workT->devData)); +#else cufft_Error(cufftExecC2R(backwardPlan, workFM->devData, workT->devData)); +#endif // extract to get proper size of correlation surface cuArraysCopyExtract(workT, results, make_int2(0, 0), stream); // all done } // a = a^* * b -inline __device__ float2 cuMulConj(float2 a, float2 b) +inline __device__ complex_type cuMulConj(complex_type a, complex_type b, real_type f) { - return make_float2(a.x*b.x + a.y*b.y, -a.y*b.x + a.x*b.y); + return make_complex_type(f*(a.x*b.x + a.y*b.y), f*(-a.y*b.x + a.x*b.y)); } // cuda kernel for cuArraysElementMultiplyConjugate -__global__ void cudaKernel_elementMulConjugate(float2 *ainout, float2 *bin, int size, float coef) +__global__ void cudaKernel_elementMulConjugate(complex_type *ainout, complex_type *bin, int size, real_type coef) { int idx = threadIdx.x + blockIdx.x*blockDim.x; if(idx < size) { - cuComplex prod; - prod = cuMulConj(ainout[idx], bin[idx]); - ainout [idx] = prod*coef; + ainout [idx] = cuMulConj(ainout[idx], bin[idx], coef); } } @@ -101,7 +120,7 @@ __global__ void cudaKernel_elementMulConjugate(float2 *ainout, float2 *bin, int * @param[in] image2, the secondary image * @param[in] coef, usually the normalization factor */ -void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, float coef, cudaStream_t stream) +void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, real_type coef, cudaStream_t stream) { int size = image1->getSize(); int threadsperblock = NTHREADS; @@ -109,4 +128,4 @@ void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays cudaKernel_elementMulConjugate<<>>(image1->devData, image2->devData, size, coef ); getLastCudaError("cuArraysElementMultiply error\n"); } -//end of file \ No newline at end of file +//end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h index ef101e686..d05a58860 100644 --- 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h @@ -8,8 +8,9 @@ #define __CUCORRFREQUENCY_H // dependencies -#include "cudaUtil.h" #include "cuArrays.h" +#include "data_types.h" +#include class cuFreqCorrelator { @@ -18,9 +19,9 @@ class cuFreqCorrelator cufftHandle forwardPlan; cufftHandle backwardPlan; // work data - cuArrays *workFM; - cuArrays *workFS; - cuArrays *workT; + cuArrays *workFM; + cuArrays *workFS; + cuArrays *workT; // cuda stream cudaStream_t stream; @@ -30,8 +31,8 @@ class cuFreqCorrelator // destructor ~cuFreqCorrelator(); // executor - void execute(cuArrays *templates, cuArrays *images, cuArrays *results); + void execute(cuArrays *templates, cuArrays *images, cuArrays *results); }; #endif //__CUCORRFREQUENCY_H -// end of file \ No newline at end of file +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu index b0ba6b731..d5135a23b 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu @@ -20,14 +20,13 @@ */ #include "cuAmpcorUtil.h" -#include #include // sum reduction within a block // the following implementation is compatible for sm_20 and above // newer architectures may support faster implementations, such as warp shuffle, cooperative groups template -__device__ float sumReduceBlock(float sum, volatile float *shmem) +__device__ real_type sumReduceBlock(real_type sum, volatile real_type *shmem) { const int tid = threadIdx.x; shmem[tid] = sum; @@ -53,9 +52,9 @@ __device__ float sumReduceBlock(float sum, volatile float *shmem) // cuda kernel to subtract mean value from the images template -__global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSize, float invSize, int nImages) +__global__ void cuArraysMean_kernel(real_type *images, real_type *image_sum, 
int imageSize, real_type invSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -64,9 +63,9 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi const int imageIdx = bid; const int imageOffset = imageIdx * imageSize; - float *imageD = images + imageOffset; + real_type *imageD = images + imageOffset; - float sum = 0.0f; + real_type sum = 0.0; // perform the reduction beyond one block // save the results for each thread in block for (int i = tid; i < imageSize; i += Nthreads) @@ -74,7 +73,7 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi // reduction within the block sum = sumReduceBlock(sum, shmem); - const float mean = sum * invSize; + const real_type mean = sum * invSize; if(tid ==0) image_sum[bid] = mean; } @@ -84,11 +83,11 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi * @param[out] mean Output mean values * @param[in] stream cudaStream */ -void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStream_t stream) +void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; cuArraysMean_kernel <<>>(images->devData, mean->devData, imageSize, invSize, images->count); getLastCudaError("cuArraysMeanValue kernel error\n"); @@ -96,9 +95,9 @@ void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStrea // cuda kernel to compute and subtracts mean value from the images template -__global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float invSize, int nImages) +__global__ void cuArraysSubtractMean_kernel(real_type *images, int imageSize, real_type invSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; 
const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -107,16 +106,16 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float const int imageIdx = bid; const int imageOffset = imageIdx * imageSize; - float *imageD = images + imageOffset; + real_type *imageD = images + imageOffset; // compute the sum - float sum = 0.0f; + real_type sum = 0.0; for (int i = tid; i < imageSize; i += Nthreads) sum += imageD[i]; sum = sumReduceBlock(sum, shmem); // compute the mean - const float mean = sum * invSize; + const real_type mean = sum * invSize; // subtract the mean from each pixel for (int i = tid; i < imageSize; i += Nthreads) imageD[i] -= mean; @@ -128,11 +127,11 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float * @param[out] mean Output mean values * @param[in] stream cudaStream */ -void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) +void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; cuArraysSubtractMean_kernel <<>>(images->devData, imageSize, invSize, images->count); getLastCudaError("cuArraysSubtractMean kernel error\n"); @@ -141,9 +140,9 @@ void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) // cuda kernel to compute summation on extracted correlation surface (Minyan) template -__global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *imagesSum, int *imagesValidCount, int imageSize, int nImages) +__global__ void cuArraysSum_kernel(real_type *images, real_type *imagesSum, int imageSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -152,11 +151,57 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i const int imageIdx = bid; const int 
imageOffset = imageIdx * imageSize; - float* imageD = images + imageOffset; + real_type* imageD = images + imageOffset; + + real_type sum = 0.0; + + for (int i = tid; i < imageSize; i += Nthreads) { + sum += imageD[i] * imageD[i]; + } + + sum = sumReduceBlock(sum, shmem); + + if(tid ==0) { + imagesSum[bid] = sum; + } +} + +/** + * Compute the variance of images (for SNR) + * @param[in] images Input images + * @param[in] imagesValid validity flags for each pixel + * @param[out] imagesSum variance + * @param[out] imagesValidCount count of total valid pixels + * @param[in] stream cudaStream + */ +void cuArraysSumSquare(cuArrays *images, cuArrays *imagesSum, cudaStream_t stream) +{ + const dim3 grid(images->count, 1, 1); + const int imageSize = images->width*images->height; + + cuArraysSum_kernel <<>>(images->devData, + imagesSum->devData, imageSize, images->count); + getLastCudaError("cuArraysSumValue kernel error\n"); +} + +// cuda kernel to compute summation on extracted correlation surface (Minyan) +template +__global__ void cuArraysSumCorr_kernel(real_type *images, int *imagesValid, real_type *imagesSum, int *imagesValidCount, int imageSize, int nImages) +{ + __shared__ real_type shmem[Nthreads]; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + + if (bid >= nImages) return; + + const int imageIdx = bid; + const int imageOffset = imageIdx * imageSize; + real_type* imageD = images + imageOffset; int* imageValidD = imagesValid + imageOffset; - float sum = 0.0f; - int count = 0; + real_type sum = 0.0f; + real_type count = 0; for (int i = tid; i < imageSize; i += Nthreads) { sum += imageD[i] * imageD[i]; @@ -168,7 +213,7 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i if(tid ==0) { imagesSum[bid] = sum; - imagesValidCount[bid] = count; + imagesValidCount[bid] = (int)count; } } @@ -180,7 +225,7 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i * @param[out] imagesValidCount count of 
total valid pixels * @param[in] stream cudaStream */ -void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, +void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); @@ -191,9 +236,10 @@ void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArra getLastCudaError("cuArraysSumValueCorr kernel error\n"); } + // intra-block inclusive prefix sum template -__device__ void inclusive_prefix_sum(float sum, volatile float *shmem) +__device__ void inclusive_prefix_sum(real_type sum, volatile real_type *shmem) { const int tid = threadIdx.x; shmem[tid] = sum; @@ -212,31 +258,31 @@ __device__ void inclusive_prefix_sum(float sum, volatile float *shmem) // prefix sum of pixel value and pixel value^2 template -__device__ float2 partialSums(const float v, volatile float* shmem, const int stride) +__device__ real2_type partialSums(const real_type v, volatile real_type* shmem, const int stride) { const int tid = threadIdx.x; - volatile float *shMem = shmem + 1; - volatile float *shMem2 = shMem + 1 + (1 << Nthreads2); + volatile real_type *shMem = shmem + 1; + volatile real_type *shMem2 = shMem + 1 + (1 << Nthreads2); inclusive_prefix_sum(v, shMem); inclusive_prefix_sum(v*v, shMem2); - const float Sum = shMem [tid-1 + stride] - shMem [tid-1]; - const float Sum2 = shMem2[tid-1 + stride] - shMem2[tid-1]; - return make_float2(Sum, Sum2); + const real_type Sum = shMem [tid-1 + stride] - shMem [tid-1]; + const real_type Sum2 = shMem2[tid-1 + stride] - shMem2[tid-1]; + return make_real2(Sum, Sum2); } // cuda kernel for cuCorrNormalize template __global__ void cuCorrNormalize_kernel( int nImages, - const float *templateIn, int templateNX, int templateNY, int templateSize, - const float *imageIn, int imageNX, int imageNY, int imageSize, - float *resultOut, int resultNX, int resultNY, int resultSize, - float templateCoeff) + const real_type 
*templateIn, int templateNX, int templateNY, int templateSize, + const real_type *imageIn, int imageNX, int imageNY, int imageSize, + real_type *resultOut, int resultNX, int resultNY, int resultSize, + real_type templateCoeff) { const int Nthreads = 1<(templateSum2, shmem); __syncthreads(); // reset shared memory value - shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0f; + shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0; __syncthreads(); // perform the prefix sum and sum^2 for secondary window // see notes above - float imageSum = 0.0f; - float imageSum2 = 0.0f; + real_type imageSum = 0.0; + real_type imageSum2 = 0.0; int iaddr = 0; const int windowSize = templateNX*imageNY; // iterative till reaching the templateNX row of the secondary window @@ -275,7 +321,7 @@ __global__ void cuCorrNormalize_kernel( while (iaddr < windowSize) { // cum sum for each row with a width=templateNY - const float2 res = partialSums(imageD[iaddr + tid], shmem, templateNY); + const real2_type res = partialSums(imageD[iaddr + tid], shmem, templateNY); // add to the total, which keeps track of the sum of area for each window imageSum += res.x; imageSum2 += res.y; @@ -287,17 +333,17 @@ __global__ void cuCorrNormalize_kernel( if (tid < resultNY) { // normalizing factor - const float norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; + const real_type norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; // normalize the correlation surface - resultD[tid] *= rsqrtf(norm2 + FLT_EPSILON); + resultD[tid] *= rsqrt(norm2 + EPSILON); } // iterative over the rest rows while (iaddr < imageSize) { // the prefix sum of the row removed is recomputed, to be subtracted - const float2 res1 = partialSums(imageD[iaddr-windowSize + tid], shmem, templateNY); + const real2_type res1 = partialSums(imageD[iaddr-windowSize + tid], shmem, templateNY); // the prefix sum of the new row, to be added - const float2 res2 = partialSums(imageD[iaddr + 
tid], shmem, templateNY); + const real2_type res2 = partialSums(imageD[iaddr + tid], shmem, templateNY); imageSum += res2.x - res1.x; imageSum2 += res2.y - res1.y; // move to next row @@ -307,8 +353,8 @@ __global__ void cuCorrNormalize_kernel( { const int ix = iaddr/imageNY; // get row index const int addr = (ix-templateNX)*resultNY; // get the correlation surface row index - const float norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; - resultD[addr + tid] *= rsqrtf(norm2 + FLT_EPSILON); + const real_type norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; + resultD[addr + tid] *= rsqrt(norm2 + EPSILON); } } } @@ -322,12 +368,12 @@ __global__ void cuCorrNormalize_kernel( * @warning The current implementation uses one thread for one column, therefore, * the secondary window width is limited to <=1024, the max threads in a block. */ -void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) +void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) { const int nImages = images->count; const int imageNY = images->width; const dim3 grid(1, 1, nImages); - const float invTemplateSize = 1.0f/templates->size; + const real_type invTemplateSize = 1.0f/templates->size; if (imageNY <= 64) { cuCorrNormalize_kernel< 6><<>>(nImages, @@ -385,11 +431,11 @@ template<> struct Log2<512> { static const int value = 9; }; template<> struct Log2<1024> { static const int value = 10; }; template -void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { const int nImages = correlation->count; const dim3 grid(1, 1, nImages); - const float invReferenceSize = 1.0f/reference->size; + const real_type invReferenceSize = 1.0f/reference->size; cuCorrNormalize_kernel::value><<>>(nImages, reference->devData, 
reference->height, reference->width, reference->size, secondary->devData, secondary->height, secondary->width, secondary->size, @@ -398,20 +444,20 @@ void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *referen getLastCudaError("cuCorrNormalize kernel error"); } -template void cuCorrNormalizeFixed<64>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<64>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<128>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<128>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<256>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<256>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<512>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<512>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<1024>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<1024>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); // end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu index 97b4e064b..656e5ebcb 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu @@ -5,11 +5,14 @@ */ #include + +#if __CUDACC_VER_MAJOR__ >= 11 #include +#endif + // my declarations #include "cuAmpcorUtil.h" -// for FLT_EPSILON -#include + // alias for cuda cooperative groups 
namespace cg = cooperative_groups; @@ -25,16 +28,19 @@ namespace cg = cooperative_groups; * @note use one thread block for each image, blockIdx.x is image index **/ -__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) + +#if __CUDACC_VER_MAJOR__ >= 11 +// use cg::reduce for NVCC 11 and above +__global__ void sum_square_kernel(real_type *sum2, const real_type *images, int n, int batch) { // get block id for each image int imageid = blockIdx.x; - const float *image = images + imageid*n; + const real_type *image = images + imageid*n; // get the thread block cg::thread_block cta = cg::this_thread_block(); // get the shared memory - extern float __shared__ sdata[]; + extern real_type __shared__ sdata[]; // get the current thread int tid = cta.thread_rank(); @@ -53,12 +59,12 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); // reduce in each tile with warp - sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); + sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); cg::sync(cta); // reduce all tiles with thread 0 if(tid == 0) { - float sum = 0.0; + real_type sum = 0.0; for (int i = 0; i < cta.size(); i += tile32.size()) sum += sdata[i]; // assign the value to results @@ -66,6 +72,69 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b } } +#else +// use warp-shuffle reduction for NVCC 9 & 10 +__global__ void sum_square_kernel(real_type *sum2, const real_type *images, int n, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + const real_type *image = images + imageid*n; + + // get the thread block + cg::thread_block cta = cg::this_thread_block(); + // get the shared memory + extern real_type __shared__ sdata[]; + + // get the current thread + unsigned int tid = cta.thread_rank(); + unsigned int blockSize = cta.size(); + + // stride over grid and add the values to the shared 
memory + real_type sum = 0; + + for(int i = tid; i < n; i += blockSize ) { + auto value = image[i]; + sum += value*value; + } + sdata[tid] = sum; + cg::sync(cta); + + // do reduction in shared memory in log2 steps + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = sum = sum + sdata[tid + 256]; + } + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = sum = sum + sdata[tid + 128]; + } + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = sum = sum + sdata[tid + 64]; + } + cg::sync(cta); + + // partition thread block into tiles in size 32 (warp) + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // reduce within warp + if(tid < 32) { + if(blockSize >=64) sum += sdata[tid + 32]; + for (int offset = tile32.size()/2; offset >0; offset /=2) { + sum += tile32.shfl_down(sum, offset); + } + } + + // return results with thread 0 + if(tid == 0) { + // assign the value to results + sum2[imageid] = sum; + } +} +#endif // __CUDACC_VER_MAJOR check + + /** * cuda kernel for 2d sum area table * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images. 
@@ -77,7 +146,7 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b * @param[in] batch number of images **/ -__global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx, int ny, int batch) +__global__ void sat2d_kernel(real_type *sat, real_type * sat2, const real_type *data, int nx, int ny, int batch) { // get block id for each image int imageid = blockIdx.x; @@ -89,13 +158,13 @@ __global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx // the number of rows may be bigger than the number of threads, iterate for (int row = tid; row < nx; row += blockDim.x) { // running sum for value and value^2 - float sum = 0.0f; - float sum2 = 0.0f; + real_type sum = 0.0; + real_type sum2 = 0.0; // starting position for this row int index = (imageid*nx+row)*ny; // iterative over column for (int i=0; i 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; - float topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; - float bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; - float bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + real_type topleft = (tx > 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; + real_type topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + real_type bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + real_type bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; // get the sum - float secondarySum = bottomright + topleft - topright - bottomleft; + real_type secondarySum = bottomright + topleft - topright - bottomleft; // sum of value^2 - const float *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; + const real_type *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; // get sat2 values for four corners topleft = (tx > 0 && ty > 0) ? sat2[(tx-1)*secondaryNY+(ty-1)] : 0.0; topright = (tx > 0 ) ? 
sat2[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; bottomleft = (ty > 0) ? sat2[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; bottomright = sat2[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; - float secondarySum2 = bottomright + topleft - topright - bottomleft; + real_type secondarySum2 = bottomright + topleft - topright - bottomleft; // compute the normalization - float norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; + real_type norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; // normalize the correlation surface - correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrtf(norm2 + FLT_EPSILON); + correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrt(norm2 + EPSILON); } } -void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) { // compute the std of reference image // note that the mean is already subtracted int nthreads = 256; - int sMemSize = nthreads*sizeof(float); + int sMemSize = nthreads*sizeof(real_type); int nblocks = reference->count; sum_square_kernel<<>>(referenceSum2->devData, reference->devData, reference->width * reference->height, reference->count); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp similarity index 51% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp index 783080d90..5c157f16a 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp @@ -7,56 +7,41 @@ #include "cuCorrNormalizer.h" #include 
"cuAmpcorUtil.h" -cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count) +cuNormalizeProcessor* +newCuNormalizer(int secondaryNX, int secondaryNY, int count) { // depending on NY, choose different processor if(secondaryNY <= 64) { - processor = new cuNormalizeFixed<64>(); + return new cuNormalizeFixed<64>(); } else if (secondaryNY <= 128) { - processor = new cuNormalizeFixed<128>(); + return new cuNormalizeFixed<128>(); } else if (secondaryNY <= 256) { - processor = new cuNormalizeFixed<256>(); + return new cuNormalizeFixed<256>(); } else if (secondaryNY <= 512) { - processor = new cuNormalizeFixed<512>(); + return new cuNormalizeFixed<512>(); } else if (secondaryNY <= 1024) { - processor = new cuNormalizeFixed<1024>(); + return new cuNormalizeFixed<1024>(); } else { - processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count); + return new cuNormalizeSAT(secondaryNX, secondaryNY, count); } } -cuNormalizer::~cuNormalizer() -{ - delete processor; -} - -void cuNormalizer::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - processor->execute(correlation, reference, secondary, stream); -} - -/** - * - * - **/ - cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count) { // allocate the work array // reference sum square - referenceSum2 = new cuArrays(1, 1, count); + referenceSum2 = new cuArrays(1, 1, count); referenceSum2->allocate(); // secondary sum and sum square - secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); secondarySAT->allocate(); - secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); secondarySAT2->allocate(); }; @@ -67,16 +52,16 @@ cuNormalizeSAT::~cuNormalizeSAT() delete secondarySAT2; } -void cuNormalizeSAT::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void 
cuNormalizeSAT::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { cuCorrNormalizeSAT(correlation, reference, secondary, referenceSum2, secondarySAT, secondarySAT2, stream); } template -void cuNormalizeFixed::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void cuNormalizeFixed::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { cuCorrNormalizeFixed(correlation, reference, secondary, stream); } diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h index 043accd3a..1b6a33b68 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h @@ -13,7 +13,7 @@ #define __CUNORMALIZER_H #include "cuArrays.h" -#include "cudaUtil.h" +#include "data_types.h" /** * Abstract class interface for correlation surface normalization processor @@ -22,46 +22,34 @@ class cuNormalizeProcessor { public: // default constructor and destructor - cuNormalizeProcessor() {} - ~cuNormalizeProcessor() {} + cuNormalizeProcessor() = default; + virtual ~cuNormalizeProcessor() = default; // execute interface - virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; + virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; }; -class cuNormalizer { -private: - cuNormalizeProcessor *processor; -public: - // disable the default constructor - cuNormalizer() = delete; - // constructor with the secondary dimension - cuNormalizer(int secondaryNX, int secondaryNY, int count); - // destructor - ~cuNormalizer(); - // execute correlation surface normalization - void execute(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cudaStream_t stream); -}; +// factory with the secondary dimension 
+cuNormalizeProcessor* newCuNormalizer(int NX, int NY, int count); template class cuNormalizeFixed : public cuNormalizeProcessor { public: - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; }; class cuNormalizeSAT : public cuNormalizeProcessor { private: - cuArrays *referenceSum2; - cuArrays *secondarySAT; - cuArrays *secondarySAT2; + cuArrays *referenceSum2; + cuArrays *secondarySAT; + cuArrays *secondarySAT2; public: cuNormalizeSAT(int secondaryNX, int secondaryNY, int count); ~cuNormalizeSAT(); - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; }; #endif diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu index 116712986..ae166849d 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu @@ -12,11 +12,11 @@ template __global__ void cuArraysCorrTime_kernel( const int nImages, - const float *templateIn, const int templateNX, const int templateNY, const int templateSize, - const float *imageIn, const int imageNX, const int imageNY, const int imageSize, - float *resultOut, const int resultNX, const int resultNY, const int resultSize) + const real_type *templateIn, const int templateNX, const int templateNY, const int templateSize, + const real_type *imageIn, const int imageNX, const int imageNY, const int imageSize, + real_type *resultOut, const int resultNX, const int resultNY, const int resultSize) { - __shared__ float shmem[nthreads*(1+NPT)]; + __shared__ real_type shmem[nthreads*(1+NPT)]; const int tid = threadIdx.x; const int bid = blockIdx.x; const int yc = 
blockIdx.y*NPT; @@ -26,9 +26,9 @@ __global__ void cuArraysCorrTime_kernel( const int templateOffset = imageIdx * templateSize; const int resultOffset = imageIdx * resultSize; - const float * imageD = imageIn + imageOffset + tid; - const float *templateD = templateIn + templateOffset + tid; - float * resultD = resultOut + resultOffset; + const real_type * imageD = imageIn + imageOffset + tid; + const real_type *templateD = templateIn + templateOffset + tid; + real_type * resultD = resultOut + resultOffset; const int q = min(nthreads/resultNY, 4); const int nt = nthreads/q; @@ -39,18 +39,18 @@ __global__ void cuArraysCorrTime_kernel( const int jbeg = templateNYq * ty; const int jend = ty+1 >= q ? templateNY : templateNYq + jbeg; - float *shTemplate = shmem; - float *shImage = shmem + nthreads; - float *shImage1 = shImage + tx; + real_type *shTemplate = shmem; + real_type *shImage = shmem + nthreads; + real_type *shImage1 = shImage + tx; - float corrCoeff[NPT]; + real_type corrCoeff[NPT]; for (int k = 0; k < NPT; k++) - corrCoeff[k] = 0.0f; + corrCoeff[k] = 0.0; int iaddr = yc*imageNY; - float img[NPT]; + real_type img[NPT]; for (int k = 0; k < NPT-1; k++, iaddr += imageNY) img[k] = imageD[iaddr]; for (int taddr = 0; taddr < templateSize; taddr += templateNY, iaddr += imageNY) @@ -98,15 +98,21 @@ __global__ void cuArraysCorrTime_kernel( * @param[out] results Output correlation surface * @param[in] stream cudaStream */ -void cuCorrTimeDomain(cuArrays *templates, - cuArrays *images, - cuArrays *results, +void cuCorrTimeDomain(cuArrays *templates, + cuArrays *images, + cuArrays *results, cudaStream_t stream) { /* compute correlation matrix */ const int nImages = images->count; const int imageNY = images->width; + +#if defined(CUAMPCOR_DOUBLE) && __CUDA_ARCH__ < 800 + // For GPUs older than A100, static shared memory limit is 48K + const int NPT = 4; +#else const int NPT = 8; +#endif const dim3 grid(nImages, (results->width-1)/NPT+1, 1); diff --git 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu index cd46e7893..5de6e65bb 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu @@ -13,8 +13,8 @@ */ #include "cuArrays.h" -#include "float2.h" -#include +#include "float2.h" +#include "data_types.h" #include "cudaError.h" #include "cudaUtil.h" #include "cuAmpcorUtil.h" @@ -24,23 +24,23 @@ #include -// cuda does not have a good support on volatile vector struct, e.g. float2 -// have to use regular float type for shared memory (volatile) data -// the following methods are defined to operate float2/complex objects through float -inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block) +// cuda does not have a good support on volatile vector struct, e.g. real2_type +// have to use regular real_type type for shared memory (volatile) data +// the following methods are defined to operate real2_type/complex objects through real_type +inline static __device__ void copyToShared(volatile real_type *s, const int i, const real2_type x, const int block) { s[i] = x.x; s[i+block] = x.y; } -inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block) +inline static __device__ void copyFromShared(real2_type &x, volatile real_type *s, const int i, const int block) { x.x = s[i]; x.y = s[i+block]; } -inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block) +inline static __device__ void addInShared(volatile real_type *s, const int i, const int j, const int block) { s[i] += s[i+j]; s[i+block] += s[i+j+block];} -// kernel to do sum reduction for float2 within a block +// kernel to do sum reduction for real2_type within a block template -__device__ void complexSumReduceBlock(float2& sum, volatile float *shmem) +__device__ void complexSumReduceBlock(real2_type& sum, volatile 
real_type *shmem) { const int tid = threadIdx.x; copyToShared(shmem, tid, sum, nthreads); @@ -65,50 +65,56 @@ __device__ void complexSumReduceBlock(float2& sum, volatile float *shmem) // cuda kernel for cuDerampMethod1 template -__global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY, - const int imageSize, const int nImages, const float normCoef) +__global__ void cuDerampMethod1_kernel(real2_type *images, const int imageNX, int const imageNY, + const int imageSize, const int nImages, const real_type normCoef) { - __shared__ float shmem[2*nthreads]; + __shared__ real_type shmem[2*nthreads]; int pixelIdx, pixelIdxX, pixelIdxY; const int bid = blockIdx.x; if(bid >= nImages) return; - float2 *image = images+ bid*imageSize; + real2_type *image = images+ bid*imageSize; const int tid = threadIdx.x; - float2 phaseDiffY = make_float2(0.0f, 0.0f); + real2_type phaseDiffY = make_real2(0.0, 0.0); for (int i = tid; i < imageSize; i += nthreads) { pixelIdxY = i % imageNY; if(pixelIdxY < imageNY -1) { pixelIdx = i; - float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]); + real2_type cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]); phaseDiffY += cprod; } } complexSumReduceBlock(phaseDiffY, shmem); //phaseDiffY *= normCoef; - float phaseY=atan2f(phaseDiffY.y, phaseDiffY.x); + real_type phaseY=atan2(phaseDiffY.y, phaseDiffY.x); - float2 phaseDiffX = make_float2(0.0f, 0.0f); + real2_type phaseDiffX = make_real2(0.0, 0.0); for (int i = tid; i < imageSize; i += nthreads) { pixelIdxX = i / imageNY; if(pixelIdxX < imageNX -1) { pixelIdx = i; - float2 cprod = complexMulConj(image[i], image[i+imageNY]); + real2_type cprod = complexMulConj(image[i], image[i+imageNY]); phaseDiffX += cprod; } } - complexSumReduceBlock(phaseDiffX, shmem); - //phaseDiffX *= normCoef; - float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON - + real_type phaseX = atan2(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON + +#ifdef CUAMPCOR_DEBUG + 
// output the phase ramp in debug mode + if (threadIdx.x==0) + { + printf("debug deramp az: %g rg: %g\n", phaseX, phaseY); + } +#endif + for (int i = tid; i < imageSize; i += nthreads) { pixelIdxX = i%imageNY; pixelIdxY = i/imageNY; - float phase = pixelIdxX*phaseX + pixelIdxY*phaseY; - float2 phase_factor = make_float2(cosf(phase), sinf(phase)); + real_type phase = pixelIdxX*phaseX + pixelIdxY*phaseY; + real2_type phase_factor = make_real2(cosf(phase), sinf(phase)); image[i] *= phase_factor; } } @@ -120,12 +126,12 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co * @param[in,out] images input/output complex signals * @param[in] stream cuda stream */ -void cuDerampMethod1(cuArrays *images, cudaStream_t stream) +void cuDerampMethod1(cuArrays *images, cudaStream_t stream) { const dim3 grid(images->count); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; if(imageSize <=64) { cuDerampMethod1_kernel<64> <<>> @@ -147,7 +153,7 @@ void cuDerampMethod1(cuArrays *images, cudaStream_t stream) } -void cuDeramp(int method, cuArrays *images, cudaStream_t stream) +void cuDeramp(int method, cuArrays *images, cudaStream_t stream) { switch(method) { case 1: diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu index 18412bfe1..e8a92070d 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu @@ -7,7 +7,8 @@ #include "cuArrays.h" #include "float2.h" -#include +#include "float.h" +#include "data_types.h" #include "cudaUtil.h" #include "cudaError.h" #include "cuAmpcorUtil.h" @@ -17,14 +18,45 @@ #include // cuda kernel for cuEstimateSnr -__global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrValidCount, const float* maxval, float* snrValue, const int size) +__global__ void 
cudaKernel_estimateSnr(const real_type* corrSum, const real_type* maxval, real_type* snrValue, const int size, const int nImages) +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + + if (idx >= nImages) return; + + real_type peak = maxval[idx]; + peak *= peak; + real_type mean = (corrSum[idx] - peak) / (size - 1); + snrValue[idx] = peak / mean; +#ifdef CUAMPCOR_DEBUG + if(threadIdx.x==0) + printf("debug snr %g %g %g\n", peak, mean, snrValue[idx]); +#endif +} + +/** + * Estimate the signal to noise ratio (SNR) of the correlation surface + * @param[in] corrSum the sum of the correlation surface + * @param[in] corrValidCount the number of valid pixels contributing to sum + * @param[out] snrValue return snr value + * @param[in] stream cuda stream + */ +void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cuArrays *snrValue, const int size, cudaStream_t stream) +{ + int nImages = corrSum->getSize(); + cudaKernel_estimateSnr<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>> + (corrSum->devData, maxval->devData, snrValue->devData, size, nImages); + getLastCudaError("cuda kernel estimate stats error\n"); +} + +__global__ void cudaKernel_estimateSnr(const real_type* corrSum, const int* corrValidCount, const real_type* maxval, real_type* snrValue, const int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; - float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); + real_type mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); snrValue[idx] = maxval[idx] * maxval[idx] / mean; } @@ -36,7 +68,7 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali * @param[out] snrValue return snr value * @param[in] stream cuda stream */ -void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream) +void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t 
stream) { int size = corrSum->getSize(); @@ -46,8 +78,8 @@ void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuAr } // cuda kernel for cuEstimateVariance -__global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY, const int2* maxloc, - const float* maxval, const int templateSize, float3* covValue, const int size) +__global__ void cudaKernel_estimateVar(const real_type* corrBatchRaw, const int NX, const int NY, const int2* maxloc, + const real_type* maxval, const int templateSize, const int distance, real3_type* covValue, const int size) { // Find image id. @@ -58,53 +90,53 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, // Preparation. int px = maxloc[idxImage].x; int py = maxloc[idxImage].y; - float peak = maxval[idxImage]; + real_type peak = maxval[idxImage]; // Check if maxval is on the margin. - if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY) { + if (px-distance < 0 || py-distance <0 || px + distance >=NX || py+distance >=NY) { - covValue[idxImage] = make_float3(99.0, 99.0, 0.0); + covValue[idxImage] = make_real3(99.0, 99.0, 0.0); } else { int offset = NX * NY * idxImage; - int idx00 = offset + (px - 1) * NY + py - 1; - int idx01 = offset + (px - 1) * NY + py ; - int idx02 = offset + (px - 1) * NY + py + 1; - int idx10 = offset + (px ) * NY + py - 1; + int idx00 = offset + (px - distance) * NY + py - distance; + int idx01 = offset + (px - distance) * NY + py ; + int idx02 = offset + (px - distance) * NY + py + distance; + int idx10 = offset + (px ) * NY + py - distance; int idx11 = offset + (px ) * NY + py ; - int idx12 = offset + (px ) * NY + py + 1; - int idx20 = offset + (px + 1) * NY + py - 1; - int idx21 = offset + (px + 1) * NY + py ; - int idx22 = offset + (px + 1) * NY + py + 1; + int idx12 = offset + (px ) * NY + py + distance; + int idx20 = offset + (px + distance) * NY + py - distance; + int idx21 = offset + (px + distance) * NY + py ; + int idx22 = offset + (px 
+ distance) * NY + py + distance; // second-order derivatives - float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2.0*corrBatchRaw[idx11] ); - float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2.0*corrBatchRaw[idx11] ) ; - float dxy = ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25; + real_type dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2.0*corrBatchRaw[idx11] ); + real_type dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2.0*corrBatchRaw[idx11] ) ; + real_type dxy = ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25; - float n2 = fmaxf(1.0 - peak, 0.0); + real_type n2 = max(1.0 - peak, 0.0); dxx = dxx * templateSize; dyy = dyy * templateSize; dxy = dxy * templateSize; - float n4 = n2*n2; + real_type n4 = n2*n2; n2 = n2 * 2; n4 = n4 * 0.5 * templateSize; - float u = dxy * dxy - dxx * dyy; - float u2 = u*u; + real_type u = dxy * dxy - dxx * dyy; + real_type u2 = u*u; // if the Gaussian curvature is too small if (fabsf(u) < 1e-2) { - covValue[idxImage] = make_float3(99.0, 99.0, 0.0); + covValue[idxImage] = make_real3(99.0, 99.0, 0.0); } else { - float cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2; - float cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2; - float cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2; - covValue[idxImage] = make_float3(cov_xx, cov_yy, cov_xy); + real_type cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2; + real_type cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2; + real_type cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2; + covValue[idxImage] = make_real3(cov_xx, cov_yy, cov_xy); } } } @@ -112,18 +144,19 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, /** * Estimate the variance of the correlation surface * @param[in] templateSize size of reference chip + * @param[in] distance distance between a pixel * 
@param[in] corrBatchRaw correlation surface * @param[in] maxloc maximum location * @param[in] maxval maximum value * @param[out] covValue variance value * @param[in] stream cuda stream */ -void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream) +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, const int templateSize, const int distance, cuArrays *covValue, cudaStream_t stream) { int size = corrBatchRaw->count; // One dimensional launching parameters to loop over every correlation surface. cudaKernel_estimateVar<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>> - (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, templateSize, covValue->devData, size); + (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, templateSize, distance, covValue->devData, size); getLastCudaError("cudaKernel_estimateVar error\n"); } //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu index ac77f131f..f6bca24da 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu @@ -6,11 +6,9 @@ // my module dependencies #include "cuAmpcorUtil.h" -// for FLT_MAX -#include // find the max between two elements -inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc, +inline static __device__ void maxPairReduce(volatile real_type* maxval, volatile int* maxloc, size_t gid, size_t strideid) { if(maxval[gid] < maxval[strideid]) { @@ -19,32 +17,38 @@ inline static __device__ void maxPairReduce(volatile float* maxval, volatile int } } -// max reduction kernel +// max reduction kernel for a 2d image +// start from (start.x, start.y) in a rectangle range (range.x, range.y) +// start + range <= imageSize template 
-__device__ void max_reduction(const float* const images, - const size_t imageSize, - const size_t nImages, - volatile float* shval, +__device__ void max_reduction_2d(const real_type* const image, + const int2 imageSize, + const int2 start, const int2 range, + volatile real_type* shval, volatile int* shloc) { int tid = threadIdx.x; - shval[tid] = -FLT_MAX; - int imageStart = blockIdx.x*imageSize; - int imagePixel; + shval[tid] = -REAL_MAX; // reduction for intra-block elements // i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ... - for(int gid = tid; gid < imageSize; gid+=blockDim.x) + for(int gid = tid; gid < range.x*range.y; gid+=blockDim.x) { - imagePixel = imageStart+gid; - if(shval[tid] < images[imagePixel]) { - shval[tid] = images[imagePixel]; - shloc[tid] = gid; + // gid is the flattened pixel id in the search range + // get the pixel 2d coordinate in whole image + int idx = start.x + gid / range.y; + int idy = start.y + gid % range.y; + // get the flattened 1d coordinate + int pixelId = IDX2R(idx, idy, imageSize.y); + real_type pixelValue = image[pixelId]; + if(shval[tid] < pixelValue) { + shval[tid] = pixelValue; + shloc[tid] = pixelId; } } __syncthreads(); - // reduction within a block + // reduction within a block with shared memory if (BLOCKSIZE >=1024){ if (tid < 512) { maxPairReduce(shval, shloc, tid, tid + 512); } __syncthreads(); } if (BLOCKSIZE >=512) { if (tid < 256) { maxPairReduce(shval, shloc, tid, tid + 256); } __syncthreads(); } if (BLOCKSIZE >=256) { if (tid < 128) { maxPairReduce(shval, shloc, tid, tid + 128); } __syncthreads(); } @@ -65,21 +69,24 @@ __device__ void max_reduction(const float* const images, // kernel for 2D array(image), find max location only template -__global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, float* maxval, - const size_t imageNX, const size_t imageNY, const size_t nImages) +__global__ void cudaKernel_maxloc2D(const real_type* const images, int2* maxloc, real_type* maxval, + const 
int2 imageSize, const size_t nImages, const int2 start, const int2 range) { - __shared__ float shval[BLOCKSIZE]; + __shared__ real_type shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; - int bid = blockIdx.x; - if(bid >= nImages) return; + int imageIdx = blockIdx.x; + if(imageIdx >= nImages) return; - const int imageSize = imageNX * imageNY; - max_reduction(images, imageSize, nImages, shval, shloc); + // get the starting pointer for this image + const real_type *image = images + imageIdx*imageSize.x*imageSize.y; + max_reduction_2d(image, imageSize, start, range, shval, shloc); + + // thread 0 contains the final result, convert it to 2d coordinate if (threadIdx.x == 0) { - maxloc[bid] = make_int2(shloc[0]/imageNY, shloc[0]%imageNY); - maxval[bid] = shval[0]; + maxloc[imageIdx] = make_int2(shloc[0]/imageSize.y, shloc[0]%imageSize.y); + maxval[imageIdx] = shval[0]; } } @@ -88,56 +95,92 @@ __global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, fl * @param[in] images input batch of images * @param[out] maxval arrays to hold the max values * @param[out] maxloc arrays to hold the max locations + * @param[in] start starting search pixel + * @param[in] range search range * @param[in] stream cudaStream * @note This routine is overloaded with the routine without maxval */ -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, - cuArrays *maxval, cudaStream_t stream) +void cuArraysMaxloc2D(cuArrays *images, + const int2 start, const int2 range, + cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream) { cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> - (images->devData, maxloc->devData, maxval->devData, images->height, images->width, images->count); + (images->devData, + maxloc->devData, maxval->devData, + make_int2(images->height, images->width), images->count, + start, range + ); getLastCudaError("cudaKernel find max location 2D error\n"); } -//kernel and function for 2D array(image), find max location only, use overload -template -__global__ 
void cudaKernel_maxloc2D(const float* const images, int2* maxloc, const size_t imageNX, const size_t imageNY, const size_t nImages) -{ - __shared__ float shval[BLOCKSIZE]; - __shared__ int shloc[BLOCKSIZE]; - - int bid = blockIdx.x; - if(bid >= nImages) return; - - const int imageSize = imageNX * imageNY; - max_reduction(images, imageSize, nImages, shval, shloc); - - if (threadIdx.x == 0) { - int xloc = shloc[0]/imageNY; - int yloc = shloc[0]%imageNY; - maxloc[bid] = make_int2(xloc, yloc); - } -} - /** - * Find (only) the maximum location for a batch of 2D images + * Find both the maximum value and the location for a batch of 2D images * @param[in] images input batch of images + * @param[out] maxval arrays to hold the max values * @param[out] maxloc arrays to hold the max locations * @param[in] stream cudaStream - * @note This routine is overloaded with the routine with maxval */ -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream) +void cuArraysMaxloc2D(cuArrays *images, + cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream) { + // if no start and range are provided, use the whole image + int2 start = make_int2(0, 0); + int2 imageSize = make_int2(images->height, images->width); cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> - (images->devData, maxloc->devData, images->height, images->width, images->count); + (images->devData, + maxloc->devData, maxval->devData, + imageSize, images->count, + start, imageSize + ); getLastCudaError("cudaKernel find max location 2D error\n"); } // cuda kernel for cuSubPixelOffset __global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, - float2 *offsetFinal, - const float OSratio, - const float xoffset, const float yoffset, const int size) + real2_type *offsetFinal, const int size, + const int2 initOrigin, const real_type initRatio, + const int2 zoomInOrigin, const real_type zoomInRatio) +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if (idx >= size) return; + 
offsetFinal[idx].x = initRatio*(offsetInit[idx].x-initOrigin.x) + zoomInRatio*(offsetZoomIn[idx].x - zoomInOrigin.x); + offsetFinal[idx].y = initRatio*(offsetInit[idx].y-initOrigin.y) + zoomInRatio*(offsetZoomIn[idx].y - zoomInOrigin.y); +} + + +/** + * Determine the final offset value + * @param[in] offsetInit max location (adjusted to the starting location for extraction) determined from + * the cross-correlation before oversampling, in dimensions of pixel + * @param[in] offsetZoomIn max location from the oversampled cross-correlation surface + * @param[out] offsetFinal the combined offset value + */ +void cuSubPixelOffset(cuArrays *offsetInit, + cuArrays *offsetZoomIn, + cuArrays *offsetFinal, + const int2 initOrigin, const int initFactor, + const int2 zoomInOrigin, const int zoomInFactor, + cudaStream_t stream) +{ + int size = offsetInit->getSize(); + + // GPU performs multiplication faster + real_type initRatio = 1.0f/(real_type)(initFactor); + real_type zoomInRatio = 1.0f/(real_type)(zoomInFactor); + + + cuSubPixelOffset_kernel<<>> + (offsetInit->devData, offsetZoomIn->devData, + offsetFinal->devData, size, initOrigin, initRatio, zoomInOrigin, zoomInRatio); + getLastCudaError("cuSubPixelOffset_kernel"); + +} + +// cuda kernel for cuSubPixelOffset +__global__ void cuSubPixelOffset2Pass_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, + real2_type *offsetFinal, + const real_type OSratio, + const real_type xoffset, const real_type yoffset, const int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; @@ -168,8 +211,8 @@ __global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offs * = subpixel offset * Final offset = pixel size offset + subpixel offset */ -void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, - cuArrays *offsetFinal, +void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, + cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int 
xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream) @@ -179,13 +222,14 @@ void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, float xoffset = xHalfRangeInit ; float yoffset = yHalfRangeInit ; - cuSubPixelOffset_kernel<<>> + cuSubPixelOffset2Pass_kernel<<>> (offsetInit->devData, offsetZoomIn->devData, offsetFinal->devData, OSratio, xoffset, yoffset, size); - getLastCudaError("cuSubPixelOffset_kernel"); + getLastCudaError("cuSubPixelOffset2Pass_kernel"); } + // cuda device function to compute the shift of center static inline __device__ int2 dev_adjustOffset( const int oldRange, const int newRange, const int maxloc) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp similarity index 68% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp index 1b6ab6267..9da46c58e 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp @@ -36,9 +36,9 @@ cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int */ // set up work arrays - workIn = new cuArrays(inNXp2, inNYp2, nImages); + workIn = new cuArrays(inNXp2, inNYp2, nImages); workIn->allocate(); - workOut = new cuArrays(outNXp2, outNYp2, nImages); + workOut = new cuArrays(outNXp2, outNYp2, nImages); workOut->allocate(); // set up fft plans @@ -47,8 +47,13 @@ cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int int fImageSize = inNXp2*inNYp2; int nOverSample[NRANK] = {outNXp2, outNYp2}; int fImageOverSampleSize = outNXp2*outNYp2; +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_Z2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nOverSample, NULL, 1, fImageOverSampleSize, NULL, 1, fImageOverSampleSize, CUFFT_Z2Z, nImages)); +#else 
cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_C2C, nImages)); cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nOverSample, NULL, 1, fImageOverSampleSize, NULL, 1, fImageOverSampleSize, CUFFT_C2C, nImages)); +#endif // set cuda stream setStream(stream_); } @@ -69,12 +74,23 @@ void cuOverSamplerC2C::setStream(cudaStream_t stream_) * @param[out] imagesOut output batch of images * @param[in] method phase deramping method */ -void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut, int method) +void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut, int method) { - cuDeramp(method, imagesIn, stream); - cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_INVERSE )); - cuArraysPaddingMany(workIn, workOut, stream); - cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_FORWARD)); + cuDeramp(method, imagesIn, stream); + + #ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_FORWARD)); + #else + cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_FORWARD)); + #endif + + cuArraysFFTPaddingMany(workIn, workOut, stream); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_INVERSE)); +#else + cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_INVERSE)); +#endif } /// destructor @@ -117,12 +133,17 @@ cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int int fImageSize = inNXp2*inNYp2; int nUpSample[NRANK] = {outNXp2, outNYp2}; int fImageUpSampleSize = outNXp2*outNYp2; - workSizeIn = new cuArrays(inNXp2, inNYp2, nImages); + workSizeIn = new cuArrays(inNXp2, inNYp2, nImages); workSizeIn->allocate(); - workSizeOut = new cuArrays(outNXp2, outNYp2, nImages); + workSizeOut = new cuArrays(outNXp2, outNYp2, nImages); workSizeOut->allocate(); +#ifdef 
CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_Z2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nUpSample, NULL, 1, fImageUpSampleSize, NULL, 1, outNX*outNY, CUFFT_Z2Z, nImages)); +#else cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_C2C, nImages)); cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nUpSample, NULL, 1, fImageUpSampleSize, NULL, 1, outNX*outNY, CUFFT_C2C, nImages)); +#endif setStream(stream); } @@ -138,12 +159,25 @@ void cuOverSamplerR2R::setStream(cudaStream_t stream_) * @param[in] imagesIn input batch of images * @param[out] imagesOut output batch of images */ -void cuOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut) +void cuOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut) { + cuArraysCopyPadded(imagesIn, workSizeIn, stream); - cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_INVERSE)); - cuArraysPaddingMany(workSizeIn, workSizeOut, stream); - cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData,CUFFT_FORWARD )); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_FORWARD)); +#else + cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_FORWARD)); +#endif + + cuArraysFFTPaddingMany(workSizeIn, workSizeOut, stream); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(backwardPlan, workSizeOut->devData, workSizeOut->devData, CUFFT_INVERSE)); +#else + cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData, CUFFT_INVERSE)); +#endif + cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream); } @@ -156,9 +190,4 @@ cuOverSamplerR2R::~cuOverSamplerR2R() workSizeOut->deallocate(); } -// end of file - - - - - +// end of file \ No newline at end of file diff --git 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h index 9ddce96b2..cfc202047 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h @@ -12,7 +12,8 @@ #define __CUOVERSAMPLER_H #include "cuArrays.h" -#include "cudaUtil.h" +#include "data_types.h" +#include // FFT Oversampler for complex images class cuOverSamplerC2C @@ -21,8 +22,8 @@ class cuOverSamplerC2C cufftHandle forwardPlan; // forward fft handle cufftHandle backwardPlan; // backward fft handle cudaStream_t stream; // cuda stream - cuArrays *workIn; // work array to hold forward fft data - cuArrays *workOut; // work array to hold padded data + cuArrays *workIn; // work array to hold forward fft data + cuArrays *workOut; // work array to hold padded data public: // disable the default constructor cuOverSamplerC2C() = delete; @@ -31,7 +32,7 @@ class cuOverSamplerC2C // set cuda stream void setStream(cudaStream_t stream_); // execute oversampling - void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); // destructor ~cuOverSamplerC2C(); }; @@ -43,14 +44,14 @@ class cuOverSamplerR2R cufftHandle forwardPlan; cufftHandle backwardPlan; cudaStream_t stream; - cuArrays *workSizeIn; - cuArrays *workSizeOut; + cuArrays *workSizeIn; + cuArrays *workSizeOut; public: cuOverSamplerR2R() = delete; cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_); void setStream(cudaStream_t stream_); - void execute(cuArrays *imagesIn, cuArrays *imagesOut); + void execute(cuArrays *imagesIn, cuArrays *imagesOut); ~cuOverSamplerR2R(); }; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu index e851eefdb..cc32aedb2 100644 --- 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu @@ -24,7 +24,7 @@ cuSincOverSamplerR2R::cuSincOverSamplerR2R(const int i_covs_, cudaStream_t strea stream = stream_; i_intplength = int(r_relfiltlen/r_beta+0.5f); i_filtercoef = i_intplength*i_decfactor; - checkCudaErrors(cudaMalloc((void **)&r_filter, (i_filtercoef+1)*sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&r_filter, (i_filtercoef+1)*sizeof(real_type))); cuSetupSincKernel(); } @@ -35,17 +35,17 @@ cuSincOverSamplerR2R::~cuSincOverSamplerR2R() } // cuda kernel for cuSetupSincKernel -__global__ void cuSetupSincKernel_kernel(float *r_filter_, const int i_filtercoef_, - const float r_soff_, const float r_wgthgt_, const int i_weight_, - const float r_soff_inverse_, const float r_beta_, const float r_decfactor_inverse_) +__global__ void cuSetupSincKernel_kernel(real_type *r_filter_, const int i_filtercoef_, + const real_type r_soff_, const real_type r_wgthgt_, const int i_weight_, + const real_type r_soff_inverse_, const real_type r_beta_, const real_type r_decfactor_inverse_) { int i = threadIdx.x + blockDim.x*blockIdx.x; if(i > i_filtercoef_) return; - float r_wa = i - r_soff_; - float r_wgt = (1.0f - r_wgthgt_) + r_wgthgt_*cos(PI*r_wa*r_soff_inverse_); - float r_s = r_wa*r_beta_*r_decfactor_inverse_*PI; - float r_fct; - if(r_s != 0.0f) { + real_type r_wa = i - r_soff_; + real_type r_wgt = (1.0f - r_wgthgt_) + r_wgthgt_*cos(PI*r_wa*r_soff_inverse_); + real_type r_s = r_wa*r_beta_*r_decfactor_inverse_*PI; + real_type r_fct; + if(r_s != 0.0) { r_fct = sin(r_s)/r_s; } else { @@ -68,10 +68,10 @@ void cuSincOverSamplerR2R::cuSetupSincKernel() const int nblocks = IDIVUP(i_filtercoef+1, nthreads); // compute some commonly used constants at first - float r_wgthgt = (1.0f - r_pedestal)/2.0f; - float r_soff = (i_filtercoef-1.0f)/2.0f; - float r_soff_inverse = 1.0f/r_soff; - float r_decfactor_inverse = 1.0f/i_decfactor; + real_type 
r_wgthgt = (1.0f - r_pedestal)/2.0f; + real_type r_soff = (i_filtercoef-1.0f)/2.0f; + real_type r_soff_inverse = 1.0f/r_soff; + real_type r_decfactor_inverse = 1.0f/i_decfactor; cuSetupSincKernel_kernel<<>> ( r_filter, i_filtercoef, r_soff, r_wgthgt, i_weight, @@ -81,11 +81,12 @@ void cuSincOverSamplerR2R::cuSetupSincKernel() // cuda kernel for cuSincOverSamplerR2R::execute -__global__ void cuSincInterpolation_kernel(const int nImages, - const float * imagesIn, const int inNX, const int inNY, - float * imagesOut, const int outNX, const int outNY, - int2 *centerShift, int factor, - const float * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, +// define a device function to overload the cases with constant or varying center shifts +__device__ void cuSincInterpolation_kernel_common(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, const int i_startX, const int i_startY, const int i_int_size) { // get image index @@ -95,35 +96,33 @@ __global__ void cuSincInterpolation_kernel(const int nImages, int idxY = threadIdx.y + blockDim.y*blockIdx.y; // cuda: to make sure extra allocated threads doing nothing if(idxImage >=nImages || idxX >= i_int_size || idxY >= i_int_size) return; - // decide the center shift - int2 shift = centerShift[idxImage]; // determine the output pixel indices - int outx = idxX + i_startX + shift.x*factor; + int outx = idxX + i_startX + centerShift.x*factor; if (outx >= outNX) outx-=outNX; - int outy = idxY + i_startY + shift.y*factor; + int outy = idxY + i_startY + centerShift.y*factor; if (outy >= outNY) outy-=outNY; // flattened to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; // index in input grids - float r_xout = (float)outx/i_covs_; + real_type r_xout = (real_type)outx/i_covs_; // 
integer part int i_xout = int(r_xout); // factional part - float r_xfrac = r_xout - i_xout; + real_type r_xfrac = r_xout - i_xout; // fractional part in terms of the interpolation kernel grids int i_xfrac = int(r_xfrac*i_decfactor_); // same procedure for y - float r_yout = (float)outy/i_covs_; + real_type r_yout = (real_type)outy/i_covs_; int i_yout = int(r_yout); - float r_yfrac = r_yout - i_yout; + real_type r_yfrac = r_yout - i_yout; int i_yfrac = int(r_yfrac*i_decfactor_); // temp variables - float intpData = 0.0f; // interpolated value - float r_sincwgt = 0.0f; // total filter weight - float r_sinc_coef; // filter weight + real_type intpData = 0.0; // interpolated value + real_type r_sincwgt = 0.0; // total filter weight + real_type r_sinc_coef; // filter weight // iterate over lines of input image // i=0 -> -i_intplength/2 @@ -135,7 +134,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, if(inx < 0) inx+= inNX; if(inx >= inNX) inx-= inNY; - float r_xsinc_coef = r_filter_[i*i_decfactor_+i_xfrac]; + real_type r_xsinc_coef = r_filter_[i*i_decfactor_+i_xfrac]; for(int j=0; j< i_intplength_; j++) { // find the corresponding pixel in input(unsampled) image @@ -143,7 +142,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, if(iny < 0) iny += inNY; if(iny >= inNY) iny -= inNY; - float r_ysinc_coef = r_filter_[j*i_decfactor_+i_yfrac]; + real_type r_ysinc_coef = r_filter_[j*i_decfactor_+i_yfrac]; // multiply the factors from xy r_sinc_coef = r_xsinc_coef*r_ysinc_coef; // add to total sinc weight @@ -156,6 +155,36 @@ __global__ void cuSincInterpolation_kernel(const int nImages, imagesOut[idxOut] = intpData/r_sincwgt; } +//constant shift +__global__ void cuSincInterpolation_kernel(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int 
i_intplength_, + const int i_startX, const int i_startY, const int i_int_size) + { + // pass through + cuSincInterpolation_kernel_common(nImages, imagesIn, inNX, inNY, + imagesOut, outNX, outNY, centerShift, factor, + r_filter_, i_covs_, i_decfactor_, i_intplength_, + i_startX, i_startY, i_int_size); + } + +__global__ void cuSincInterpolation_kernel(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 *centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, + const int i_startX, const int i_startY, const int i_int_size) + { + // get image index + int idxImage = blockIdx.z; + cuSincInterpolation_kernel_common(nImages, imagesIn, inNX, inNY, + imagesOut, outNX, outNY, centerShift[idxImage], factor, + r_filter_, i_covs_, i_decfactor_, i_intplength_, + i_startX, i_startY, i_int_size); + } + /** * Execute sinc interpolation * @param[in] imagesIn input images @@ -165,7 +194,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, * @note rawOversamplingFactor is for the centerShift, not the signal oversampling factor */ -void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, +void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *centerShift, int rawOversamplingFactor) { const int nImages = imagesIn->count; @@ -194,4 +223,33 @@ void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *i getLastCudaError("cuSincInterpolation_kernel"); } +void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, + int2 centerShift, int rawOversamplingFactor) +{ + const int nImages = imagesIn->count; + const int inNX = imagesIn->height; + const int inNY = imagesIn->width; + const int outNX = imagesOut->height; + const int outNY = imagesOut->width; + + // only compute the overampled signals within a window + const int i_int_range = i_sincwindow * 
i_covs; + // set the start pixel, will be shifted by centerShift*oversamplingFactor (from raw image) + const int i_int_startX = outNX/2 - i_int_range; + const int i_int_startY = outNY/2 - i_int_range; + const int i_int_size = 2*i_int_range + 1; + // preset all pixels in out image to 0 + imagesOut->setZero(stream); + + static const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads, 1); + dim3 blockspergrid (IDIVUP(i_int_size, nthreads), IDIVUP(i_int_size, nthreads), nImages); + cuSincInterpolation_kernel<<>>(nImages, + imagesIn->devData, inNX, inNY, + imagesOut->devData, outNX, outNY, + centerShift, rawOversamplingFactor, + r_filter, i_covs, i_decfactor, i_intplength, i_int_startX, i_int_startY, i_int_size); + getLastCudaError("cuSincInterpolation_kernel"); +} + // end of file \ No newline at end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h index 3755c2375..741fb90d8 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h @@ -23,6 +23,7 @@ // dependencites #include "cuArrays.h" #include "cudaUtil.h" +#include "data_types.h" #ifndef PI #define PI 3.14159265359f @@ -34,9 +35,9 @@ class cuSincOverSamplerR2R static const int i_sincwindow = 2; ///< the oversampling is only performed within \pm i_sincwindow*i_covs around the peak static const int i_weight = 1; ///< weight for cos() pedestal - const float r_pedestal = 0.0f; ///< height of pedestal - const float r_beta = 0.75f; ///< a low-band pass - const float r_relfiltlen = 6.0f; ///< relative filter length + const real_type r_pedestal = 0.0; ///< height of pedestal + const real_type r_beta = 0.75f; ///< a low-band pass + const real_type r_relfiltlen = 6.0f; ///< relative filter length static const int i_decfactor = 4096; ///< max decimals between original grid to set up the sinc kernel @@ -44,7 +45,7 @@ class cuSincOverSamplerR2R 
int i_intplength; ///< actual filter length = r_relfiltlen/r_beta int i_filtercoef; //< length of the sinc kernel i_intplength*i_decfactor+1 - float * r_filter; // sinc kernel with size i_filtercoef + real_type * r_filter; // sinc kernel with size i_filtercoef cudaStream_t stream; @@ -54,7 +55,9 @@ class cuSincOverSamplerR2R // set up sinc interpolation coefficients void cuSetupSincKernel(); // execute interface - void execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *center, int oversamplingFactor); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *center, int oversamplingFactor); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, int2 center, int oversamplingFactor); + // destructor ~cuSincOverSamplerR2R(); }; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp new file mode 100644 index 000000000..183fa4cc0 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp @@ -0,0 +1,44 @@ +#include "cudaError.h" + +#include +#include +#include +#include + +#ifdef __DRIVER_TYPES_H__ +#ifndef DEVICE_RESET +#define DEVICE_RESET cudaDeviceReset(); +#endif +#else +#ifndef DEVICE_RESET +#define DEVICE_RESET +#endif +#endif + +template +void check(T result, char const *const func, const char *const file, int const line) +{ + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", + file, line, static_cast(result), func); + DEVICE_RESET + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +template void check(cudaError_t, char const *const, const char *const, int const); +template void check(cufftResult_t, char const *const, const char *const, int const); + +void __getLastCudaError(const char *errorMessage, const char *file, const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + { + fprintf(stderr, "%s(%i) : CUDA error : %s : (%d) %s.\n", + file, line, errorMessage, (int)err, 
cudaGetErrorString(err)); + DEVICE_RESET + exit(EXIT_FAILURE); + } +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h index 5e857cba0..f4f9524be 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h @@ -10,51 +10,13 @@ #pragma once -#include -#include -#include -#include #include "debug.h" -#include - - -#ifdef __DRIVER_TYPES_H__ -#ifndef DEVICE_RESET -#define DEVICE_RESET cudaDeviceReset(); -#endif -#else -#ifndef DEVICE_RESET -#define DEVICE_RESET -#endif -#endif template -void check(T result, char const *const func, const char *const file, int const line) -{ - if (result) - { - - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", - file, line, static_cast(result), func); - DEVICE_RESET - // Make sure we call CUDA Device Reset before exiting - exit(EXIT_FAILURE); - } -} +void check(T result, char const *const func, const char *const file, int const line); // This will output the proper error string when calling cudaGetLastError -inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) -{ - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) - { - fprintf(stderr, "%s(%i) : CUDA error : %s : (%d) %s.\n", - file, line, errorMessage, (int)err, cudaGetErrorString(err)); - DEVICE_RESET - exit(EXIT_FAILURE); - } -} +void __getLastCudaError(const char *errorMessage, const char *file, const int line); // This will output the proper CUDA error strings in the event that a CUDA host call returns an error #ifdef CUDA_ERROR_CHECK diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp new file mode 100644 index 000000000..c96a12947 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp @@ -0,0 +1,58 @@ +#include "cudaUtil.h" + +#include +#include +#include +#include "cudaError.h" + +int gpuDeviceInit(int 
devID) +{ + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0 || devID > device_count - 1) { + fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("Using CUDA Device %d ...\n", devID); + + return devID; +} + +void gpuDeviceList() +{ + int device_count = 0; + int current_device = 0; + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + fprintf(stderr, "Detecting all CUDA devices ...\n"); + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + while (current_device < device_count) { + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + if (deviceProp.computeMode == cudaComputeModeProhibited) { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: " + "device is running in \n", + current_device, deviceProp.name); + } else if (deviceProp.major < 1) { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: " + "device does not support CUDA \n", + current_device, deviceProp.name); + } else { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", + current_device, deviceProp.name); + } + current_device++; + } +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h index cc174e009..7516bc9ca 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h @@ -10,9 +10,6 @@ #ifndef __CUDAUTIL_H #define __CUDAUTIL_H -#include -#include "cudaError.h" - // for 2D FFT #define NRANK 2 @@ -47,12 +44,6 @@ #define MIN(a,b) (a > b ? b: a) #endif -// Float To Int conversion -inline int ftoi(float value) -{ - return (value >= 0 ? 
(int)(value + 0.5) : (int)(value - 0.5)); -} - // compute the next integer in power of 2 inline int nextpower2(int value) { @@ -61,63 +52,11 @@ inline int nextpower2(int value) return r; } - // General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) -{ - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) - { - fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0 || devID > device_count-1) - { - fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("Using CUDA Device %d ...\n", devID); - - return devID; -} +int gpuDeviceInit(int devID); // This function lists all available GPUs -inline void gpuDeviceList() -{ - int device_count = 0; - int current_device = 0; - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - fprintf(stderr, "Detecting all CUDA devices ...\n"); - if (device_count == 0) - { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - while (current_device < device_count) - { - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); - if (deviceProp.computeMode == cudaComputeModeProhibited) - { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device is running in \n", current_device, deviceProp.name); - } - else if (deviceProp.major < 1) - { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device does not support CUDA \n", current_device, deviceProp.name); - } - else { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", current_device, deviceProp.name); - } - current_device++; - } -} +void gpuDeviceList(); #endif //__CUDAUTIL_H //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h new file mode 100644 index 
000000000..134deafdc --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h @@ -0,0 +1,46 @@ +/** + * @file data_types.h + * @brief Define the floating-point data types used by pycuampcor + * + * real_type/complex_type and the make_* helper macros resolve to single- or + * double-precision CUDA types, selected at compile time by CUAMPCOR_DOUBLE. + * image_real_type/image_complex_type stay single precision for image data + * + */ + +// code guard +#ifndef __CUAMPCOR_DATA_TYPES_H +#define __CUAMPCOR_DATA_TYPES_H + +#include <cfloat> + +// disable this for single precision version +// #define CUAMPCOR_DOUBLE + +#ifdef CUAMPCOR_DOUBLE + #define real_type double + #define complex_type double2 + #define real2_type double2 + #define real3_type double3 + #define make_real2 make_double2 + #define make_real3 make_double3 + #define make_complex_type make_double2 + #define EPSILON DBL_EPSILON + #define REAL_MAX DBL_MAX +#else + #define real_type float + #define complex_type float2 + #define real2_type float2 + #define real3_type float3 + #define make_real2 make_float2 + #define make_real3 make_float3 + #define make_complex_type make_float2 + #define EPSILON FLT_EPSILON + #define REAL_MAX FLT_MAX +#endif + +#define image_complex_type float2 +#define image_real_type float + +#endif //__CUAMPCOR_DATA_TYPES_H +//end of file \ No newline at end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h index e43e8e44a..1879bb91e 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h @@ -7,9 +7,10 @@ #ifndef __FLOAT2_H #define __FLOAT2_H -#include +#include +#include -inline __host__ __device__ void zero(float2 &a) { a.x = 0.0f; a.y = 0.0f; } +inline __host__ __device__ void zero(float2 &a) { a.x = 0.0; a.y = 0.0; } // negative inline __host__ __device__ float2 operator-(float2 &a) @@ -125,5 +126,124 @@ inline __host__ __device__ float2 complexExp(float arg) return make_float2(cosf(arg), sinf(arg)); } + +// 
double precision +inline __host__ __device__ void zero(double2 &a) { a.x = 0.0; a.y = 0.0; } + +// negative +inline __host__ __device__ double2 operator-(double2 &a) +{ + return make_double2(-a.x, -a.y); +} + +// complex conjugate +inline __host__ __device__ double2 conjugate(double2 a) +{ + return make_double2(a.x, -a.y); +} + +// addition +inline __host__ __device__ double2 operator+(double2 a, double2 b) +{ + return make_double2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(double2 &a, double2 b) +{ + a.x += b.x; + a.y += b.y; +} + +inline __host__ __device__ double2 operator+(double2 a, double b) +{ + return make_double2(a.x + b, a.y); +} +inline __host__ __device__ void operator+=(double2 &a, double b) +{ + a.x += b; +} + +// subtraction +inline __host__ __device__ double2 operator-(double2 a, double2 b) +{ + return make_double2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(double2 &a, double2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ double2 operator-(double2 a, double b) +{ + return make_double2(a.x - b, a.y); +} +inline __host__ __device__ void operator-=(double2 &a, double b) +{ + a.x -= b; +} + +// multiplication +inline __host__ __device__ double2 operator*(double2 a, double2 b) +{ + return make_double2(a.x*b.x - a.y*b.y, a.y*b.x + a.x*b.y); +} +inline __host__ __device__ void operator*=(double2 &a, double2 b) +{ + const double2 t = make_double2(a.x*b.x - a.y*b.y, a.y*b.x + a.x*b.y); + a = t; +} +inline __host__ __device__ double2 operator*(double2 a, double b) +{ + return make_double2(a.x * b, a.y * b); +} +inline __host__ __device__ void operator*=(double2 &a, double b) +{ + a.x *= b; + a.y *= b; +} +inline __host__ __device__ double2 operator*(double2 a, int b) +{ + return make_double2(a.x * b, a.y * b); +} +inline __host__ __device__ void operator*=(double2 &a, int b) +{ + a.x *= b; + a.y *= b; +} +inline __host__ __device__ double2 complexMul(double2 a, double2 b) +{ + return a*b; +} +inline __host__ 
__device__ double2 complexMulConj(double2 a, double2 b) +{ + return make_double2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y); +} + +inline __host__ __device__ double2 operator/(double2 a, double b) +{ + return make_double2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(double2 &a, double b) +{ + a.x /= b; + a.y /= b; +} + +// abs, arg +inline __host__ __device__ double complexAbs(double2 a) +{ + return sqrt(a.x*a.x+a.y*a.y); +} +inline __host__ __device__ double complexArg(double2 a) +{ + return atan2(a.y, a.x); +} + +// make a complex number from phase +inline __host__ __device__ double2 complexExp(double arg) +{ + return make_double2(cos(arg), sin(arg)); +} + + #endif //__FLOAT2_H -// end of file \ No newline at end of file +// end of file diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp index 7e884ba45..983b78e14 100644 --- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp @@ -32,13 +32,16 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(int, deviceID) .DEF_PARAM(int, nStreams) .DEF_PARAM(int, derampMethod) + .DEF_PARAM(int, workflow) .DEF_PARAM(str, referenceImageName) .DEF_PARAM(int, referenceImageHeight) .DEF_PARAM(int, referenceImageWidth) + .DEF_PARAM(int, referenceImageDataType) .DEF_PARAM(str, secondaryImageName) .DEF_PARAM(int, secondaryImageHeight) .DEF_PARAM(int, secondaryImageWidth) + .DEF_PARAM(int, secondaryImageDataType) .DEF_PARAM(int, numberWindowDown) .DEF_PARAM(int, numberWindowAcross) @@ -51,7 +54,7 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(int, mergeGrossOffset) .DEF_PARAM(str, snrImageName) .DEF_PARAM(str, covImageName) - .DEF_PARAM(str, corrImageName) + .DEF_PARAM(str, peakValueImageName) .DEF_PARAM(int, rawDataOversamplingFactor) .DEF_PARAM(int, corrStatWindowSize) @@ -67,8 +70,8 @@ void 
addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM_RENAME(int, referenceStartPixelAcrossStatic, referenceStartPixelAcross0) .DEF_PARAM_RENAME(int, referenceStartPixelDownStatic, referenceStartPixelDown0) - .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingMethod, oversamplingMethod) - .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingFactor, oversamplingFactor) + .DEF_PARAM(int, corrSurfaceOverSamplingMethod) + .DEF_PARAM(int, corrSurfaceOverSamplingFactor) .DEF_PARAM_RENAME(int, mmapSize, mmapSizeInGB) @@ -78,6 +81,8 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_METHOD(runAmpcor) + .DEF_METHOD(isDoublePrecision) + .def("checkPixelInImageRange", [](const cls& self) { self.param->checkPixelInImageRange(); }) diff --git a/python/packages/nisar/workflows/dense_offsets.py b/python/packages/nisar/workflows/dense_offsets.py index e897b19cd..33d583b2a 100644 --- a/python/packages/nisar/workflows/dense_offsets.py +++ b/python/packages/nisar/workflows/dense_offsets.py @@ -101,7 +101,7 @@ def run(cfg: dict): ampcor.grossOffsetImageName = str(out_dir / 'gross_offset') ampcor.snrImageName = str(out_dir / 'snr') ampcor.covImageName = str(out_dir / 'covariance') - ampcor.corrImageName = str(out_dir / 'correlation_peak') + ampcor.peakValueImageName = str(out_dir / 'correlation_peak') # Create empty ENVI datasets. PyCuAmpcor will overwrite the # binary files. 
Note, use gdal to pass interleave option diff --git a/python/packages/nisar/workflows/offsets_product.py b/python/packages/nisar/workflows/offsets_product.py index ed5ef8358..44d523f2f 100644 --- a/python/packages/nisar/workflows/offsets_product.py +++ b/python/packages/nisar/workflows/offsets_product.py @@ -158,7 +158,7 @@ def run(cfg: dict, output_hdf5: str = None): layer_scratch_path / 'gross_offset') ampcor.snrImageName = str(layer_scratch_path / 'snr') ampcor.covImageName = str(layer_scratch_path / 'covariance') - ampcor.corrImageName = str(layer_scratch_path/ 'correlation_peak') + ampcor.peakValueImageName = str(layer_scratch_path/ 'correlation_peak') create_empty_dataset(str(layer_scratch_path / 'dense_offsets'), ampcor.numberWindowAcross,