diff --git a/cxx/isce3/cuda/Sources.cmake b/cxx/isce3/cuda/Sources.cmake index b5ab0817f..ae9472736 100644 --- a/cxx/isce3/cuda/Sources.cmake +++ b/cxx/isce3/cuda/Sources.cmake @@ -29,23 +29,27 @@ geometry/utilities.cu image/gpuResampSlc.cu image/Resample.cu image/ResampSlc.cpp -matchtemplate/pycuampcor/cuAmpcorChunk.cu -matchtemplate/pycuampcor/cuAmpcorController.cu -matchtemplate/pycuampcor/cuAmpcorParameter.cu -matchtemplate/pycuampcor/cuArrays.cu +matchtemplate/pycuampcor/cuAmpcorController.cpp +matchtemplate/pycuampcor/cuAmpcorParameter.cpp +matchtemplate/pycuampcor/cuAmpcorProcessor.cpp +matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp +matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp +matchtemplate/pycuampcor/cuArrays.cpp matchtemplate/pycuampcor/cuArraysCopy.cu matchtemplate/pycuampcor/cuArraysPadding.cu matchtemplate/pycuampcor/cuCorrFrequency.cu matchtemplate/pycuampcor/cuCorrNormalization.cu matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu -matchtemplate/pycuampcor/cuCorrNormalizer.cu +matchtemplate/pycuampcor/cuCorrNormalizer.cpp matchtemplate/pycuampcor/cuCorrTimeDomain.cu matchtemplate/pycuampcor/cuDeramp.cu matchtemplate/pycuampcor/cuEstimateStats.cu matchtemplate/pycuampcor/cuOffset.cu -matchtemplate/pycuampcor/cuOverSampler.cu +matchtemplate/pycuampcor/cuOverSampler.cpp matchtemplate/pycuampcor/cuSincOverSampler.cu -matchtemplate/pycuampcor/GDALImage.cu +matchtemplate/pycuampcor/cudaError.cpp +matchtemplate/pycuampcor/cudaUtil.cpp +matchtemplate/pycuampcor/SlcImage.cpp product/SubSwaths.cu signal/gpuAzimuthFilter.cu signal/gpuCrossMul.cu diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu deleted file mode 100644 index ef54f0386..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu +++ /dev/null @@ -1,163 +0,0 @@ -/** - * @file GDALImage.h - * @brief Implementations of GDALImage class - * - */ - -// my declaration -#include "GDALImage.h" - -// 
dependencies -#include -#include "cudaError.h" - -/** - * Constructor - * @brief Create a GDAL image object - * @param filename a std::string with the raster image file name - * @param band the band number - * @param cacheSizeInGB read buffer size in GigaBytes - * @param useMmap whether to use memory map - */ -GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useMmap) - : _useMmap(useMmap) -{ - // open the file as dataset - _poDataset = (GDALDataset *) GDALOpen(filename.c_str(), GA_ReadOnly); - // if something is wrong, throw an exception - // GDAL reports the error message - if(!_poDataset) - throw; - - // check the band info - int count = _poDataset->GetRasterCount(); - if(band > count) - { - std::cout << "The desired band " << band << " is greater than " << count << " bands available"; - throw; - } - - // get the desired band - _poBand = _poDataset->GetRasterBand(band); - if(!_poBand) - throw; - - // get the width(x), and height(y) - _width = _poBand->GetXSize(); - _height = _poBand->GetYSize(); - - _dataType = _poBand->GetRasterDataType(); - // determine the image type - _isComplex = GDALDataTypeIsComplex(_dataType); - // determine the pixel size in bytes - _pixelSize = GDALGetDataTypeSize(_dataType); - - _bufferSize = 1024*1024*cacheSizeInGB; - - // checking whether using memory map - if(_useMmap) { - - char **papszOptions = NULL; - // if cacheSizeInGB = 0, use default - // else set the option - if(cacheSizeInGB > 0) - papszOptions = CSLSetNameValue( papszOptions, - "CACHE_SIZE", - std::to_string(_bufferSize).c_str()); - - // space between two lines - GIntBig pnLineSpace; - // set up the virtual mem buffer - _poBandVirtualMem = GDALGetVirtualMemAuto( - static_cast(_poBand), - GF_Read, - &_pixelSize, - &pnLineSpace, - papszOptions); - if(!_poBandVirtualMem) - throw; - - // get the starting pointer - _memPtr = CPLVirtualMemGetAddr(_poBandVirtualMem); - } - else { // use a buffer - checkCudaErrors(cudaMallocHost((void **)&_memPtr, 
_bufferSize)); - } - // make sure memPtr is not Null - if (!_memPtr) - { - std::cout << "unable to locate the memory buffer\n"; - throw; - } - // all done -} - - -/** - * Load a tile of data h_tile x w_tile from CPU to GPU - * @param dArray pointer for array in device memory - * @param h_offset Down/Height offset - * @param w_offset Across/Width offset - * @param h_tile Down/Height tile size - * @param w_tile Across/Width tile size - * @param stream CUDA stream for copying - * @note Need to use size_t type to pass the parameters to cudaMemcpy2D correctly - */ -void GDALImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, - size_t h_tile, size_t w_tile, cudaStream_t stream) -{ - - size_t tileStartOffset = (h_offset*_width + w_offset)*_pixelSize; - - char * startPtr = (char *)_memPtr ; - startPtr += tileStartOffset; - - if (_useMmap) { - // direct copy from memory map buffer to device memory - checkCudaErrors(cudaMemcpy2DAsync(dArray, // dst - w_tile*_pixelSize, // dst pitch - startPtr, // src - _width*_pixelSize, // src pitch - w_tile*_pixelSize, // width in Bytes - h_tile, // height - cudaMemcpyHostToDevice,stream)); - } - else { // use a cpu buffer to load image data to gpu - - // get the total tile size in bytes - size_t tileSize = h_tile*w_tile*_pixelSize; - // if the size is bigger than existing buffer, reallocate - if (tileSize > _bufferSize) { - // TODO: fit the pagesize - _bufferSize = tileSize; - checkCudaErrors(cudaFree(_memPtr)); - checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize)); - } - // copy from file to buffer - CPLErr err = _poBand->RasterIO(GF_Read, //eRWFlag - w_offset, h_offset, //nXOff, nYOff - w_tile, h_tile, // nXSize, nYSize - _memPtr, // pData - w_tile*h_tile, 1, // nBufXSize, nBufYSize - _dataType, //eBufType - 0, 0 //nPixelSpace, nLineSpace in pData - ); - if(err != CE_None) - throw; // throw if reading error occurs; message reported by GDAL - - // copy from buffer to gpu - 
checkCudaErrors(cudaMemcpyAsync(dArray, _memPtr, tileSize, cudaMemcpyHostToDevice, stream)); - } - // all done -} - -/// destructor -GDALImage::~GDALImage() -{ - // free the virtual memory - CPLVirtualMemFree(_poBandVirtualMem), - // free the GDAL Dataset, close the file - delete _poDataset; -} - -// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h deleted file mode 100644 index 91c3abead..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h +++ /dev/null @@ -1,83 +0,0 @@ -/** - * @file GDALImage.h - * @brief Interface with GDAL vrt driver - * - * To read image file with the GDAL vrt driver, including SLC, GeoTIFF images - * @warning Only single precision images are supported: complex(pixelOffset=8) or real(pixelOffset=4). - * @warning Only single band file is currently supported. - */ - -// code guard -#ifndef __GDALIMAGE_H -#define __GDALIMAGE_H - -// dependencies -#include -#include -#include - - -class GDALImage{ -public: - // specify the types - using size_t = std::size_t; - -private: - int _height; ///< image height - int _width; ///< image width - - void * _memPtr = NULL; ///< pointer to buffer - - int _pixelSize; ///< pixel size in bytes - - int _isComplex; ///< whether the image is complex - - size_t _bufferSize; ///< buffer size - int _useMmap; ///< whether to use memory map - - // GDAL temporary objects - GDALDataType _dataType; - CPLVirtualMem * _poBandVirtualMem = NULL; - GDALDataset * _poDataset = NULL; - GDALRasterBand * _poBand = NULL; - -public: - //disable default constructor - GDALImage() = delete; - // constructor - GDALImage(std::string fn, int band=1, int cacheSizeInGB=0, int useMmap=1); - // destructor - ~GDALImage(); - - // get class properties - void * getmemPtr() - { - return(_memPtr); - } - - int getHeight() { - return (_height); - } - - int getWidth() - { - return (_width); - } - - int getPixelSize() - { - return _pixelSize; - } - - bool 
isComplex() - { - return _isComplex; - } - - // load data from cpu buffer to gpu - void loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream); - -}; - -#endif //__GDALIMAGE_H -// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp new file mode 100644 index 000000000..d9ebfb84f --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp @@ -0,0 +1,109 @@ +#include +#include + +#include "cuAmpcorController.h" +#include "cuAmpcorParameter.h" + +PYBIND11_MODULE(PyCuAmpcor, m) +{ + m.doc() = "Python module controller for underlying CUDA-Ampcor code"; + + using str = std::string; + using cls = cuAmpcorController; + + pybind11::class_(m, "PyCuAmpcor") + .def(pybind11::init<>()) + + // define a trivial binding for a controller method +#define DEF_METHOD(name) def(#name, &cls::name) + + // define a trivial getter/setter for a controller parameter +#define DEF_PARAM_RENAME(T, pyname, cppname) \ + def_property(#pyname, [](const cls& self) -> T { \ + return self.param->cppname; \ + }, [](cls& self, const T i) { \ + self.param->cppname = i; \ + }) + + // same as above, for even more trivial cases where pyname == cppname +#define DEF_PARAM(T, name) DEF_PARAM_RENAME(T, name, name) + + .DEF_PARAM(int, algorithm) + .DEF_PARAM(int, deviceID) + .DEF_PARAM(int, nStreams) + .DEF_PARAM(int, derampMethod) + .DEF_PARAM(int, workflow) + + .DEF_PARAM(str, referenceImageName) + .DEF_PARAM(int, referenceImageHeight) + .DEF_PARAM(int, referenceImageWidth) + .DEF_PARAM(int, referenceImageDataType) + .DEF_PARAM(str, secondaryImageName) + .DEF_PARAM(int, secondaryImageHeight) + .DEF_PARAM(int, secondaryImageWidth) + .DEF_PARAM(int, secondaryImageDataType) + + .DEF_PARAM(int, numberWindowDown) + .DEF_PARAM(int, numberWindowAcross) + + .DEF_PARAM_RENAME(int, windowSizeHeight, windowSizeHeightRaw) + .DEF_PARAM_RENAME(int, windowSizeWidth, 
windowSizeWidthRaw) + + .DEF_PARAM(str, offsetImageName) + .DEF_PARAM(str, grossOffsetImageName) + .DEF_PARAM(int, mergeGrossOffset) + .DEF_PARAM(str, snrImageName) + .DEF_PARAM(str, covImageName) + .DEF_PARAM(str, peakValueImageName) + + .DEF_PARAM(int, rawDataOversamplingFactor) + .DEF_PARAM(int, corrStatWindowSize) + + .DEF_PARAM(int, numberWindowDownInChunk) + .DEF_PARAM(int, numberWindowAcrossInChunk) + + .DEF_PARAM(int, useMmap) + + .DEF_PARAM_RENAME(int, halfSearchRangeAcross, halfSearchRangeAcrossRaw) + .DEF_PARAM_RENAME(int, halfSearchRangeDown, halfSearchRangeDownRaw) + + .DEF_PARAM_RENAME(int, referenceStartPixelAcrossStatic, referenceStartPixelAcross0) + .DEF_PARAM_RENAME(int, referenceStartPixelDownStatic, referenceStartPixelDown0) + + .DEF_PARAM(int, corrSurfaceOverSamplingMethod) + .DEF_PARAM(int, corrSurfaceOverSamplingFactor) + + .DEF_PARAM_RENAME(int, mmapSize, mmapSizeInGB) + + .DEF_PARAM_RENAME(int, skipSampleDown, skipSampleDownRaw) + .DEF_PARAM_RENAME(int, skipSampleAcross, skipSampleAcrossRaw) + .DEF_PARAM_RENAME(int, corrSurfaceZoomInWindow, zoomWindowSize) + + .DEF_METHOD(runAmpcor) + + .DEF_METHOD(isDoublePrecision) + + .def("checkPixelInImageRange", [](const cls& self) { + self.param->checkPixelInImageRange(); + }) + + .def("setupParams", [](cls& self) { + self.param->setupParameters(); + }) + + .def("setConstantGrossOffset", [](cls& self, const int goDown, + const int goAcross) { + self.param->setStartPixels( + self.param->referenceStartPixelDown0, + self.param->referenceStartPixelAcross0, + goDown, goAcross); + }) + .def("setVaryingGrossOffset", [](cls& self, std::vector vD, + std::vector vA) { + self.param->setStartPixels( + self.param->referenceStartPixelDown0, + self.param->referenceStartPixelAcross0, + vD.data(), vA.data()); + }) + ; +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp new file mode 100644 index 000000000..ed4bf14f5 --- /dev/null +++ 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp @@ -0,0 +1,95 @@ +#include "SlcImage.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cudaError.h" +#include +#include + +SlcImage::SlcImage(const std::string& filepath, size_t img_height, size_t img_width, size_t pixel_size, size_t buffer_size) + : width(img_width), height(img_height), pixel_size(pixel_size), fd(-1), mapped_data(nullptr), + mapped_offset(0), mapped_size(0) +{ + file_size = width * height * pixel_size; + max_map_size = buffer_size*1024*1024*1024; + page_size = sysconf(_SC_PAGE_SIZE); // Get system page size + + // Open the file + fd = open(filepath.c_str(), O_RDONLY); + if (fd == -1) { + throw std::runtime_error("Failed to open file: " + filepath); + } +} + +void SlcImage::remapIfNeeded(size_t required_start, size_t required_end) +{ + + if(required_start < mapped_offset || required_end > mapped_offset + mapped_size) + { + // out of range, remap + // unmap first, if necessary + if(mapped_data!=nullptr) + munmap(mapped_data, mapped_size); + // align new mapping offset + // round to the page size + mapped_offset = (required_start/page_size)*page_size; + // compute the mapped size + mapped_size = file_size - mapped_offset; + // not to exceed the buffer size + if (mapped_size > max_map_size) { + mapped_size = max_map_size; + } + // remap + mapped_data = mmap(nullptr, mapped_size, PROT_READ, MAP_PRIVATE, fd, mapped_offset); + if (mapped_data == MAP_FAILED) { + throw std::runtime_error("Failed to mmap file at offset " + std::to_string(mapped_offset)); + } + } + // else - in range, do nothing +} + + +/// load a tile of data h_tile x w_tile from CPU (mmap) to GPU +/// @param dArray pointer for array in device memory +/// @param h_offset Down/Height offset +/// @param w_offset Across/Width offset +/// @param h_tile Down/Height tile size +/// @param w_tile Across/Width tile size +/// @param stream CUDA stream for copying +void SlcImage::loadToDevice(void 
*dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream) +{ + size_t tileStartAddress = (h_offset*width + w_offset)*pixel_size; + size_t tileLastAddress = tileStartAddress + (h_tile*width + w_tile)*pixel_size; + + remapIfNeeded(tileStartAddress, tileLastAddress); + + char *startPtr = (char *)mapped_data ; + startPtr += tileStartAddress - mapped_offset; + + // @note + // We assume down/across directions as rows/cols. Therefore, SLC mmap and device array both use row major. + // cuBlas assumes both source and target arrays are column major. + // To use cublasSetMatrix, we need to switch w_tile/h_tile for rows/cols + // checkCudaErrors(cublasSetMatrixAsync(w_tile, h_tile, pixelsize, startPtr, width, dArray, w_tile, stream)); + + checkCudaErrors(cudaMemcpy2DAsync(dArray, w_tile*pixel_size, startPtr, width*pixel_size, + w_tile*pixel_size, h_tile, cudaMemcpyHostToDevice,stream)); +} + +SlcImage::~SlcImage() +{ + if (mapped_data!=nullptr) { + munmap(mapped_data, mapped_size); + } + if (fd != -1) { + close(fd); + } +} + +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h new file mode 100644 index 000000000..a22650f1e --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h @@ -0,0 +1,38 @@ +// -*- c++ -*- +// file slcimage.h +// a mmap loader for slc images + +#ifndef __SLCIMAGE_H +#define __SLCIMAGE_H + +#include +#include + +class SlcImage{ +public: + // disable default constructor + SlcImage()=delete; + // constructor + SlcImage(const std::string& fn, size_t image_height, size_t image_width, size_t pixel_size, size_t buffersize); + // interface + void loadToDevice(void* dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream); + // destructor + ~SlcImage(); + +private: + int fd; + size_t file_size; + size_t pixel_size; + size_t height; + size_t width; + + void* mapped_data; + size_t page_size; + size_t 
mapped_offset; + size_t mapped_size; + size_t max_map_size; + + void remapIfNeeded(size_t required_start, size_t required_end); +}; + +#endif //__SLCIMAGE_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h deleted file mode 100644 index 5c990e233..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * @file cuAmpcorChunk.h - * @brief Ampcor processor for a batch of windows - * - * - */ - -#ifndef __CUAMPCORCHUNK_H -#define __CUAMPCORCHUNK_H - -#include "GDALImage.h" -#include "cuArrays.h" -#include "cuAmpcorParameter.h" -#include "cuOverSampler.h" -#include "cuSincOverSampler.h" -#include "cuCorrFrequency.h" -#include "cuCorrNormalizer.h" - - -/** - * cuAmpcor processor for a chunk (a batch of windows) - */ -class cuAmpcorChunk{ -private: - int idxChunkDown; ///< index of the chunk in total batches, down - int idxChunkAcross; ///< index of the chunk in total batches, across - int idxChunk; ///< - int nWindowsDown; ///< number of windows in one chunk, down - int nWindowsAcross; ///< number of windows in one chunk, across - - int devId; ///< GPU device ID to use - cudaStream_t stream; ///< CUDA stream to use - - GDALImage *referenceImage; ///< reference image object - GDALImage *secondaryImage; ///< secondary image object - cuAmpcorParameter *param; ///< reference to the (global) parameters - cuArrays *offsetImage; ///< output offsets image - cuArrays *snrImage; ///< snr image - cuArrays *covImage; ///< cov image - cuArrays *corrImage; ///< corr image - - // local variables and workers - // gpu buffer to load images from file - cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; - cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; - - // windows raw (not oversampled) data, complex and real - cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn; - cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; 
- - // windows oversampled data - cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; - cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; - cuArrays * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust; - - // offset data - cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; - - // oversampling processors for complex images - cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; - - // oversampling processor for correlation surface - cuOverSamplerR2R *corrOverSampler; - cuSincOverSamplerR2R *corrSincOverSampler; - - // cross-correlation processor with frequency domain algorithm - cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; - - // correlation surface normalizer - cuNormalizer *corrNormalizerRaw; - cuNormalizer *corrNormalizerOverSampled; - - // save offset results in different stages - cuArrays *offsetInit; - cuArrays *offsetZoomIn; - cuArrays *offsetFinal; - cuArrays *maxLocShift; // record the maxloc from the extract center - cuArrays *corrMaxValue; - cuArrays *i_maxloc; - cuArrays *r_maxval; - - // SNR estimation - cuArrays *r_corrBatchRawZoomIn; - cuArrays *r_corrBatchSum; - cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; - cuArrays *r_snrValue; - - // Variance estimation - cuArrays *r_covValue; - -public: - // constructor - cuAmpcorChunk(cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, - cuArrays *offsetImage_, cuArrays *snrImage_, - cuArrays *covImage_, cuArrays* corrImage_, cudaStream_t stream_); - // destructor - ~cuAmpcorChunk(); - - // local methods - void setIndex(int idxDown_, int idxAcross_); - void loadReferenceChunk(); - void loadSecondaryChunk(); - void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff); - // run the given chunk - void run(int, int); -}; - - - -#endif diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp similarity index 62% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp index e906ef340..ceb0a6759 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp @@ -7,21 +7,35 @@ #include "cuAmpcorController.h" // dependencies -#include "GDALImage.h" +#include "SlcImage.h" #include "cuArrays.h" #include "cudaUtil.h" -#include "cuAmpcorChunk.h" +#include "cuAmpcorProcessor.h" #include "cuAmpcorUtil.h" +#include #include // constructor cuAmpcorController::cuAmpcorController() { // create a new set of parameters - param.reset(new cuAmpcorParameter()); + param = new cuAmpcorParameter(); } +// destructor +cuAmpcorController::~cuAmpcorController() +{ + delete param; +} +bool cuAmpcorController::isDoublePrecision() +{ +#ifdef CUAMPCOR_DOUBLE + return true; +#else + return false; +#endif +} /** * Run ampcor * @@ -31,66 +45,67 @@ void cuAmpcorController::runAmpcor() { // set the gpu id param->deviceID = gpuDeviceInit(param->deviceID); - // initialize the gdal driver - GDALAllRegister(); - // reference and secondary images; use band=1 as default + + // reference and secondary images // TODO: selecting band std::cout << "Opening reference image " << param->referenceImageName << "...\n"; - GDALImage *referenceImage = new GDALImage(param->referenceImageName, 1, param->mmapSizeInGB); + SlcImage *referenceImage = new SlcImage(param->referenceImageName, param->referenceImageHeight, param->referenceImageWidth, + param->referenceImageDataType*sizeof(float), param->mmapSizeInGB); std::cout << "Opening secondary image " << param->secondaryImageName << "...\n"; - GDALImage *secondaryImage = new GDALImage(param->secondaryImageName, 1, param->mmapSizeInGB); + SlcImage *secondaryImage = new SlcImage(param->secondaryImageName, 
param->secondaryImageHeight, param->secondaryImageWidth, + param->secondaryImageDataType*sizeof(float), param->mmapSizeInGB); - cuArrays *offsetImage, *offsetImageRun; - cuArrays *snrImage, *snrImageRun; - cuArrays *covImage, *covImageRun; - cuArrays *corrImage, *corrImageRun; + cuArrays *offsetImage, *offsetImageRun; + cuArrays *snrImage, *snrImageRun; + cuArrays *covImage, *covImageRun; + cuArrays *peakValueImage, *peakValueImageRun; // nWindowsDownRun is defined as numberChunk * numberWindowInChunk // It may be bigger than the actual number of windows int nWindowsDownRun = param->numberChunkDown * param->numberWindowDownInChunk; int nWindowsAcrossRun = param->numberChunkAcross * param->numberWindowAcrossInChunk; - offsetImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + offsetImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); offsetImageRun->allocate(); - snrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + snrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); snrImageRun->allocate(); - covImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + covImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); covImageRun->allocate(); - corrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); - corrImageRun->allocate(); + peakValueImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + peakValueImageRun->allocate(); // Offset fields. - offsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + offsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); offsetImage->allocate(); // SNR. - snrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + snrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); snrImage->allocate(); // Variance. 
- covImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + covImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); covImage->allocate(); - // Cross-correlation peak - corrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); - corrImage->allocate(); + // Correlation surface peak value + peakValueImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + peakValueImage->allocate(); + + // set up the cuda streams cudaStream_t streams[param->nStreams]; - cuAmpcorChunk *chunk[param->nStreams]; + std::unique_ptr chunk[param->nStreams]; // iterate over cuda streams for(int ist=0; istnStreams; ist++) { // create each stream checkCudaErrors(cudaStreamCreate(&streams[ist])); // create the chunk processor for each stream - chunk[ist]= new cuAmpcorChunk(param.get(), referenceImage, secondaryImage, - offsetImageRun, snrImageRun, covImageRun, corrImageRun, - streams[ist]); - + chunk[ist]= cuAmpcorProcessor::create(param->workflow, param, referenceImage, secondaryImage, + offsetImageRun, snrImageRun, covImageRun, peakValueImageRun, streams[ist]); } int nChunksDown = param->numberChunkDown; @@ -131,17 +146,17 @@ void cuAmpcorController::runAmpcor() cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]); cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]); cuArraysCopyExtract(covImageRun, covImage, make_int2(0,0), streams[0]); - cuArraysCopyExtract(corrImageRun, corrImage, make_int2(0,0), streams[0]); + cuArraysCopyExtract(peakValueImageRun, peakValueImage, make_int2(0,0), streams[0]); /* save the offsets and gross offsets */ // copy the offset to host offsetImage->allocateHost(); offsetImage->copyToHost(streams[0]); // construct the gross offset - cuArrays *grossOffsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + cuArrays *grossOffsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); 
grossOffsetImage->allocateHost(); for(int i=0; i< param->numberWindows; i++) - grossOffsetImage->hostData[i] = make_float2(param->grossOffsetDown[i], param->grossOffsetAcross[i]); + grossOffsetImage->hostData[i] = make_real2(param->grossOffsetDown[i], param->grossOffsetAcross[i]); // check whether to merge gross offset if (param->mergeGrossOffset) @@ -158,25 +173,24 @@ void cuAmpcorController::runAmpcor() // save the snr/cov images snrImage->outputToFile(param->snrImageName, streams[0]); covImage->outputToFile(param->covImageName, streams[0]); - - // save the cross-correlation peak - corrImage->outputToFile(param->corrImageName, streams[0]); + peakValueImage->outputToFile(param->peakValueImageName, streams[0]); // Delete arrays. delete offsetImage; delete snrImage; delete covImage; - delete corrImage; + delete peakValueImage; delete offsetImageRun; delete snrImageRun; delete covImageRun; - delete corrImageRun; + delete peakValueImageRun; for (int ist=0; istnStreams; ist++) { + // cufftplan etc are stream dependent, need to be deleted before stream is destroyed + chunk[ist].release(); checkCudaErrors(cudaStreamDestroy(streams[ist])); - delete chunk[ist]; } delete referenceImage; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h index e807cce32..466fd6859 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h @@ -15,18 +15,19 @@ #ifndef CU_AMPCOR_CONTROLLER_H #define CU_AMPCOR_CONTROLLER_H -#include - // dependencies #include "cuAmpcorParameter.h" class cuAmpcorController { public: - std::unique_ptr param; ///< the parameter set + cuAmpcorParameter *param; ///< the parameter set // constructor cuAmpcorController(); + // destructor + ~cuAmpcorController(); // run interface void runAmpcor(); + bool isDoublePrecision(); }; #endif diff --git 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp similarity index 54% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp index 9acc6b294..0445f9343 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp @@ -5,6 +5,9 @@ #include "cuAmpcorParameter.h" #include +#include +#include + #ifndef IDIVUP #define IDIVUP(i,j) ((i+j-1)/j) @@ -22,7 +25,8 @@ cuAmpcorParameter::cuAmpcorParameter() algorithm = 0; //0 freq; 1 time deviceID = 0; nStreams = 1; - derampMethod = 1; + derampMethod = 0; + workflow = 0; windowSizeWidthRaw = 64; windowSizeHeightRaw = 64; @@ -31,21 +35,24 @@ cuAmpcorParameter::cuAmpcorParameter() skipSampleAcrossRaw = 64; skipSampleDownRaw = 64; - rawDataOversamplingFactor = 2; - zoomWindowSize = 16; - oversamplingFactor = 16; - oversamplingMethod = 0; + rawDataOversamplingFactor = 2; //antialiasing + zoomWindowSize = 16; // correlation surface oversampling zoom + corrSurfaceOverSamplingFactor = 16; // correlation surface oversampling + corrSurfaceOverSamplingMethod = 0; referenceImageName = "reference.slc"; referenceImageWidth = 1000; referenceImageHeight = 1000; + referenceImageDataType = 2; // complex secondaryImageName = "secondary.slc"; secondaryImageWidth = 1000; secondaryImageHeight = 1000; - offsetImageName = "DenseOffset.off"; - grossOffsetImageName = "GrossOffset.off"; - snrImageName = "snr.snr"; - covImageName = "cov.cov"; + secondaryImageDataType = 2; // complex + offsetImageName = "DenseOffset.bip"; + grossOffsetImageName = "GrossOffset.bip"; + snrImageName = "snr.bip"; + covImageName = "cov.bip"; + peakValueImageName = "peakValue.bip"; numberWindowDown = 1; numberWindowAcross = 1; numberWindowDownInChunk = 1; @@ -68,6 +75,34 @@ cuAmpcorParameter::cuAmpcorParameter() */ void 
cuAmpcorParameter::setupParameters() +{ + // workflow specific parameter settings + // + switch (workflow) { + case 0: + _setupParameters_TwoPass(); + break; + case 1: + _setupParameters_OnePass(); + break; + default: + throw std::invalid_argument("Unsupported workflow"); + } + + // common parameter settings + numberWindows = numberWindowDown*numberWindowAcross; + if(numberWindows <=0) { + fprintf(stderr, "Incorrect number of windows! (%d, %d)\n", numberWindowDown, numberWindowAcross); + exit(EXIT_FAILURE); + } + + numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk); + numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk); + numberChunks = numberChunkDown*numberChunkAcross; + allocateArrays(); +} + +void cuAmpcorParameter::_setupParameters_TwoPass() { // Size to extract the raw correlation surface for snr/cov corrRawZoomInHeight = std::min(corrStatWindowSize, 2*halfSearchRangeDownRaw+1); @@ -87,8 +122,8 @@ void cuAmpcorParameter::setupParameters() windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor; - searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; - searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeDownRaw; searchWindowSizeWidthRawZoomIn = windowSizeWidthRaw + 2*halfZoomWindowSizeRaw; searchWindowSizeHeightRawZoomIn = windowSizeHeightRaw + 2*halfZoomWindowSizeRaw; @@ -96,40 +131,126 @@ void cuAmpcorParameter::setupParameters() searchWindowSizeWidth = searchWindowSizeWidthRawZoomIn*rawDataOversamplingFactor; searchWindowSizeHeight = searchWindowSizeHeightRawZoomIn*rawDataOversamplingFactor; - numberWindows = numberWindowDown*numberWindowAcross; - if(numberWindows <=0) { - fprintf(stderr, "Incorrect number of windows! 
(%d, %d)\n", numberWindowDown, numberWindowAcross); - exit(EXIT_FAILURE); - } + windowSizeWidthRawEnlarged = windowSizeWidthRaw; + windowSizeHeightRawEnlarged = windowSizeHeightRaw; + + // loading offsets + referenceLoadingOffsetDown = 0; + referenceLoadingOffsetAcross = 0; + // secondary loading offset relative to reference + secondaryLoadingOffsetDown = -halfSearchRangeDownRaw; + secondaryLoadingOffsetAcross = -halfSearchRangeAcrossRaw; + +} + +void cuAmpcorParameter::_setupParameters_OnePass() +{ + + // template window size (after antialiasing oversampling) + windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // + windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor; + + // serve as extra margin for search range + halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); + + // add extra search range to ensure enough area to oversample correlation surface + halfSearchRangeDownRaw += halfZoomWindowSizeRaw; + halfSearchRangeAcrossRaw += halfZoomWindowSizeRaw; + + // search window size before antialiasing oversampling + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeDownRaw; + + windowSizeHeightRawEnlarged = searchWindowSizeHeightRaw; + windowSizeWidthRawEnlarged = searchWindowSizeWidthRaw; + + // search window size (after antialiasing oversampling) + searchWindowSizeWidth = searchWindowSizeWidthRaw*rawDataOversamplingFactor; + searchWindowSizeHeight = searchWindowSizeHeightRaw*rawDataOversamplingFactor; + + windowSizeHeightEnlarged = searchWindowSizeHeight; + windowSizeWidthEnlarged = searchWindowSizeWidth; + + // loading offsets + // reference size matching the secondary size + referenceLoadingOffsetDown = -halfSearchRangeDownRaw; + referenceLoadingOffsetAcross = -halfSearchRangeAcrossRaw; + // secondary loading offset relative to reference + secondaryLoadingOffsetDown = 0; + secondaryLoadingOffsetAcross = 0; + + // correlation 
surface size + corrWindowSize = make_int2(searchWindowSizeHeight - windowSizeHeight + 1, + searchWindowSizeWidth - windowSizeWidth +1); + // check zoom in window size, if larger, issue a warning + if(zoomWindowSize >= corrWindowSize.x || zoomWindowSize >= corrWindowSize.y) + fprintf(stderr, "Warning: zoomWindowSize %d is bigger than the original correlation surface size (%d, %d)!\n", + zoomWindowSize, corrWindowSize.x, corrWindowSize.y ); + // use the smaller values + corrZoomInSize = make_int2(std::min(zoomWindowSize+1, corrWindowSize.x), + std::min(zoomWindowSize+1, corrWindowSize.y)); + + // oversampled correlation surface size + corrZoomInOversampledSize = make_int2(corrZoomInSize.x * corrSurfaceOverSamplingFactor, + corrZoomInSize.y * corrSurfaceOverSamplingFactor); + + int totalOS = rawDataOversamplingFactor*corrSurfaceOverSamplingFactor; + // search start : image center - 1 pixel * totalOS + corrZoomInOversampledSearchStart = make_int2((corrZoomInSize.x-1)*corrSurfaceOverSamplingFactor/2 - totalOS, + (corrZoomInSize.y-1)*corrSurfaceOverSamplingFactor/2 - totalOS); + // search range \pm 1 pixel * totalOS + corrZoomInOversampledSearchRange = make_int2(2*totalOS, 2*totalOS); + + fprintf(stderr, "zoomWindowSize is (%d, %d)!\n", + corrZoomInSize.x, corrZoomInSize.y ); - numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk); - numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk); - numberChunks = numberChunkDown*numberChunkAcross; - allocateArrays(); } void cuAmpcorParameter::allocateArrays() { - int arraySize = numberWindows; - grossOffsetDown.resize(arraySize); - grossOffsetAcross.resize(arraySize); - referenceStartPixelDown.resize(arraySize); - referenceStartPixelAcross.resize(arraySize); - secondaryStartPixelDown.resize(arraySize); - secondaryStartPixelAcross.resize(arraySize); - - int arraySizeChunk = numberChunks; - referenceChunkStartPixelDown.resize(arraySizeChunk); - referenceChunkStartPixelAcross.resize(arraySizeChunk); 
- secondaryChunkStartPixelDown.resize(arraySizeChunk); - secondaryChunkStartPixelAcross.resize(arraySizeChunk); - referenceChunkHeight.resize(arraySizeChunk); - referenceChunkWidth.resize(arraySizeChunk); - secondaryChunkHeight.resize(arraySizeChunk); - secondaryChunkWidth.resize(arraySizeChunk); + int arraySize = numberWindows*sizeof(int); + grossOffsetDown = (int *)malloc(arraySize); + grossOffsetAcross = (int *)malloc(arraySize); + referenceStartPixelDown = (int *)malloc(arraySize); + referenceStartPixelAcross = (int *)malloc(arraySize); + secondaryStartPixelDown = (int *)malloc(arraySize); + secondaryStartPixelAcross = (int *)malloc(arraySize); + + int arraySizeChunk = numberChunks*sizeof(int); + referenceChunkStartPixelDown = (int *)malloc(arraySizeChunk); + referenceChunkStartPixelAcross = (int *)malloc(arraySizeChunk); + secondaryChunkStartPixelDown = (int *)malloc(arraySizeChunk); + secondaryChunkStartPixelAcross = (int *)malloc(arraySizeChunk); + referenceChunkHeight = (int *)malloc(arraySizeChunk); + referenceChunkWidth = (int *)malloc(arraySizeChunk); + secondaryChunkHeight = (int *)malloc(arraySizeChunk); + secondaryChunkWidth = (int *)malloc(arraySizeChunk); } +void cuAmpcorParameter::deallocateArrays() +{ + free(grossOffsetDown); + free(grossOffsetAcross); + free(referenceStartPixelDown); + free(referenceStartPixelAcross); + free(secondaryStartPixelDown); + free(secondaryStartPixelAcross); + free(referenceChunkStartPixelDown); + free(referenceChunkStartPixelAcross); + free(secondaryChunkStartPixelDown); + free(secondaryChunkStartPixelAcross); + free(referenceChunkHeight); + free(referenceChunkWidth); + free(secondaryChunkHeight); + free(secondaryChunkWidth); +} + + +// **************** +// make reference window the same as secondary for oversampling +// **************** + /// Set starting pixels for reference and secondary windows from arrays /// set also gross offsets between reference and secondary windows /// @@ -137,12 +258,12 @@ void 
cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD { for(int i=0; i referenceImageHeight) mChunkED = referenceImageHeight; + mChunkEA += windowSizeWidthRawEnlarged; + if (mChunkEA > referenceImageWidth) mChunkED = referenceImageWidth; + sChunkED += searchWindowSizeHeightRaw; + if (sChunkED > secondaryImageHeight) sChunkED = secondaryImageHeight; + sChunkEA += searchWindowSizeWidthRaw; + if (sChunkEA > secondaryImageWidth) sChunkED = secondaryImageWidth; + // set the starting pixel and size of the chunk referenceChunkStartPixelDown[idxChunk] = mChunkSD; referenceChunkStartPixelAcross[idxChunk] = mChunkSA; secondaryChunkStartPixelDown[idxChunk] = sChunkSD; secondaryChunkStartPixelAcross[idxChunk] = sChunkSA; - referenceChunkHeight[idxChunk] = mChunkED - mChunkSD + windowSizeHeightRaw; - referenceChunkWidth[idxChunk] = mChunkEA - mChunkSA + windowSizeWidthRaw; - secondaryChunkHeight[idxChunk] = sChunkED - sChunkSD + searchWindowSizeHeightRaw; - secondaryChunkWidth[idxChunk] = sChunkEA - sChunkSA + searchWindowSizeWidthRaw; + referenceChunkHeight[idxChunk] = mChunkED - mChunkSD; + referenceChunkWidth[idxChunk] = mChunkEA - mChunkSA; + secondaryChunkHeight[idxChunk] = sChunkED - sChunkSD; + secondaryChunkWidth[idxChunk] = sChunkEA - sChunkSA; + // search the max chunk size, used to determine the allocated read buffer size if(maxReferenceChunkHeight < referenceChunkHeight[idxChunk]) maxReferenceChunkHeight = referenceChunkHeight[idxChunk]; if(maxReferenceChunkWidth < referenceChunkWidth[idxChunk] ) maxReferenceChunkWidth = referenceChunkWidth[idxChunk]; if(maxSecondaryChunkHeight < secondaryChunkHeight[idxChunk]) maxSecondaryChunkHeight = secondaryChunkHeight[idxChunk]; @@ -253,8 +394,11 @@ void cuAmpcorParameter::setChunkStartPixels() } /// check whether reference and secondary windows are within the image range +// now issue warning rather than errors void cuAmpcorParameter::checkPixelInImageRange() { +// check range is no longer required, but 
offered as an option in DEBUG +#ifdef CUAMPCOR_DEBUG int endPixel; for(int row=0; row= referenceImageHeight) { - fprintf(stderr, "Reference Window end pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Warning: Reference Window end pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); } endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw; if(endPixel >= referenceImageWidth) { - fprintf(stderr, "Reference Window end pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Reference Window end pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); } //secondary if(secondaryStartPixelDown[i] <0) { - fprintf(stderr, "Secondary Window start pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window start pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]); } if(secondaryStartPixelAcross[i] <0) { - fprintf(stderr, "Secondary Window start pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window start pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]); } endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw; if(endPixel >= secondaryImageHeight) { - fprintf(stderr, "Secondary Window end pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window end pixel out of range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); } endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw; if(endPixel >= secondaryImageWidth) { - fprintf(stderr, "Secondary Window end pixel out of range in Across, window (%d,%d), 
pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window end pixel out of range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); } } } +#endif // CUAMPCOR_DEBUG } -cuAmpcorParameter::~cuAmpcorParameter() {} +cuAmpcorParameter::~cuAmpcorParameter() +{ + deallocateArrays(); +} // end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h index 357f36ba9..6a6d83a12 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h @@ -10,7 +10,7 @@ #define __CUAMPCORPARAMETER_H #include -#include +#include // for int2 /// Class container for all parameters /// @@ -32,40 +32,34 @@ class cuAmpcorParameter{ public: - - // TODO this is to avoid memory issues due to copying the non-owning pointers - // we should use RAII ownership so these can be trivially defaulted - cuAmpcorParameter(const cuAmpcorParameter&) = delete; - cuAmpcorParameter& operator=(const cuAmpcorParameter&) = delete; - cuAmpcorParameter(cuAmpcorParameter&&) = delete; - cuAmpcorParameter& operator=(cuAmpcorParameter&&) = delete; - int algorithm; ///< Cross-correlation algorithm: 0=freq domain (default) 1=time domain int deviceID; ///< Targeted GPU device ID: use -1 to auto select int nStreams; ///< Number of streams to asynchonize data transfers and compute kernels int derampMethod; ///< Method for deramping 0=None, 1=average + int workflow; ///< Workflow 0: two passes, first pass without antialiasing oversampling, 1: one pass with antialiasing oversampling // chip or window size for raw data int windowSizeHeightRaw; ///< Template window height (original size) int windowSizeWidthRaw; ///< Template window width (original size) + + int windowSizeHeightRawEnlarged; ///< Template window height Enlarged to search window size for oversampling + int windowSizeWidthRawEnlarged; ///< Template window width 
Enlarged to search window size for oversampling + int searchWindowSizeHeightRaw; ///< Search window height (original size) int searchWindowSizeWidthRaw; ///< Search window width (original size) int halfSearchRangeDownRaw; ///< (searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 int halfSearchRangeAcrossRaw; ///< (searchWindowSizeWidthRaw-windowSizeWidthRaw)/2 // search range is (-halfSearchRangeRaw, halfSearchRangeRaw) - - int searchWindowSizeHeightRawZoomIn; ///< search window height used for zoom in - int searchWindowSizeWidthRawZoomIn; ///< search window width used for zoom in - - int corrStatWindowSize; ///< correlation surface size used to estimate snr - int corrRawZoomInHeight; ///< correlation surface height used for oversampling - int corrRawZoomInWidth; ///< correlation surface width used for oversampling + // note the search range now includes extra margin for the correlation surface extraction // chip or window size after oversampling - int rawDataOversamplingFactor; ///< Raw data overampling factor (from original size to oversampled size) + int rawDataOversamplingFactor; ///< Raw data oversampling factor (from original size to oversampled size) int windowSizeHeight; ///< Template window length (oversampled size) int windowSizeWidth; ///< Template window width (original size) + int windowSizeHeightEnlarged; ///< Template window length enlarged (oversampled size) + int windowSizeWidthEnlarged; ///< Template window width enlarged (original size) + int searchWindowSizeHeight; ///< Search window height (oversampled size) int searchWindowSizeWidth; ///< Search window width (oversampled size) @@ -77,21 +71,35 @@ class cuAmpcorParameter{ int zoomWindowSize; ///< Zoom-in window size in correlation surface (same for down and across directions) int halfZoomWindowSizeRaw; ///< half of zoomWindowSize/rawDataOversamplingFactor - int oversamplingFactor; ///< Oversampling factor for interpolating correlation surface - int oversamplingMethod; ///< correlation surface 
oversampling method 0 = fft (default) 1 = sinc + int corrSurfaceOverSamplingFactor; ///< Oversampling factor for interpolating correlation surface + int corrSurfaceOverSamplingMethod; ///< correlation surface oversampling method 0 = fft (default) 1 = sinc + + // correlation surface + int2 corrWindowSize; // 2*halfSearchRange + 1 + int2 corrZoomInSize; // zoomWindowSize+1 + int2 corrZoomInOversampledSize; // corrZoomInSize * oversamplingFactor + int2 corrZoomInOversampledSearchStart; // search start position in the oversampled correlation surface + int2 corrZoomInOversampledSearchRange; // search range in the oversampled correlation surface + + int corrStatWindowSize; ///< correlation surface size used to estimate snr + // parameters used in the first pass in two-pass workflow + // @TODO move them to workflow specific files + int searchWindowSizeHeightRawZoomIn; + int searchWindowSizeWidthRawZoomIn; + int corrRawZoomInHeight; ///< correlation surface height used for oversampling + int corrRawZoomInWidth; ///< correlation surface width used for oversampling - float thresholdSNR; ///< Threshold of Signal noise ratio to remove noisy data //reference image std::string referenceImageName; ///< reference SLC image name - int imageDataType1; ///< reference image data type, 2=cfloat=complex=float2 1=float + int referenceImageDataType; ///< reference image data type, 2=cfloat=complex=float2 1=float int referenceImageHeight; ///< reference image height int referenceImageWidth; ///< reference image width //secondary image std::string secondaryImageName; ///< secondary SLC image name - int imageDataType2; ///< secondary image data type, 2=cfloat=complex=float2 1=float + int secondaryImageDataType; ///< secondary image data type, 2=cfloat=complex=float2 1=float int secondaryImageHeight; ///< secondary image height int secondaryImageWidth; ///< secondary image width @@ -113,32 +121,35 @@ class cuAmpcorParameter{ int referenceStartPixelDown0; ///< first starting pixel in reference 
image (down) int referenceStartPixelAcross0; ///< first starting pixel in reference image (across) - std::vector referenceStartPixelDown; ///< reference starting pixels for each window (down) - std::vector referenceStartPixelAcross; ///< reference starting pixels for each window (across) - std::vector secondaryStartPixelDown; ///< secondary starting pixels for each window (down) - std::vector secondaryStartPixelAcross; ///< secondary starting pixels for each window (across) + int *referenceStartPixelDown; ///< reference starting pixels for each window (down) + int *referenceStartPixelAcross; ///< reference starting pixels for each window (across) + int *secondaryStartPixelDown; ///< secondary starting pixels for each window (down) + int *secondaryStartPixelAcross; ///< secondary starting pixels for each window (across) int grossOffsetDown0; ///< gross offset static component (down) int grossOffsetAcross0; ///< gross offset static component (across) - std::vector grossOffsetDown; ///< Gross offsets between reference and secondary windows (down) - std::vector grossOffsetAcross; ///< Gross offsets between reference and secondary windows (across) + int *grossOffsetDown; ///< Gross offsets between reference and secondary windows (down) + int *grossOffsetAcross; ///< Gross offsets between reference and secondary windows (across) int mergeGrossOffset; ///< whether to merge gross offsets into the final offsets - std::vector referenceChunkStartPixelDown; ///< reference starting pixels for each chunk (down) - std::vector referenceChunkStartPixelAcross; ///< reference starting pixels for each chunk (across) - std::vector secondaryChunkStartPixelDown; ///< secondary starting pixels for each chunk (down) - std::vector secondaryChunkStartPixelAcross; ///< secondary starting pixels for each chunk (across) - std::vector referenceChunkHeight; ///< reference chunk height - std::vector referenceChunkWidth; ///< reference chunk width - std::vector secondaryChunkHeight; ///< secondary 
chunk height - std::vector secondaryChunkWidth; ///< secondary chunk width + int *referenceChunkStartPixelDown; ///< reference starting pixels for each chunk (down) + int *referenceChunkStartPixelAcross; ///< reference starting pixels for each chunk (across) + int *secondaryChunkStartPixelDown; ///< secondary starting pixels for each chunk (down) + int *secondaryChunkStartPixelAcross; ///< secondary starting pixels for each chunk (across) + int *referenceChunkHeight; ///< reference chunk height + int *referenceChunkWidth; ///< reference chunk width + int *secondaryChunkHeight; ///< secondary chunk height + int *secondaryChunkWidth; ///< secondary chunk width int maxReferenceChunkHeight, maxReferenceChunkWidth; ///< max reference chunk size int maxSecondaryChunkHeight, maxSecondaryChunkWidth; ///< max secondary chunk size + int referenceLoadingOffsetDown, referenceLoadingOffsetAcross; ///< reference loading offset, depending on workflow (e.g, matching secondary) + int secondaryLoadingOffsetDown, secondaryLoadingOffsetAcross; ///< secondary loading offset, depending on workflow (e.g., with extra pads) + std::string grossOffsetImageName; ///< gross offset output filename std::string offsetImageName; ///< Offset fields output filename std::string snrImageName; ///< Output SNR filename std::string covImageName; ///< Output variance filename - std::string corrImageName; ///< Output cross-correlation peak filename + std::string peakValueImageName; ///< Output normalized correlation surface peak value filename // Class constructor and default parameters setter cuAmpcorParameter(); @@ -147,6 +158,9 @@ class cuAmpcorParameter{ // Allocate various arrays after the number of Windows is given void allocateArrays(); + // Deallocate arrays on exit + void deallocateArrays(); + // Three methods to set reference/secondary starting pixels and gross offsets from input reference start pixel(s) and gross offset(s) // 1 (int *, int *, int *, int *): varying reference start pixels and 
gross offsets @@ -161,6 +175,8 @@ class cuAmpcorParameter{ void checkPixelInImageRange(); // Process other parameters after Python Input void setupParameters(); + void _setupParameters_TwoPass(); + void _setupParameters_OnePass(); }; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp new file mode 100644 index 000000000..d96ff8f57 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp @@ -0,0 +1,81 @@ +#include "cuAmpcorProcessor.h" +#include "cuAmpcorProcessorTwoPass.h" +#include "cuAmpcorProcessorOnePass.h" + +#include + +// Factory method implementation +// create the batch processor for a given {workflow} +std::unique_ptr cuAmpcorProcessor::create(int workflow, + cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) +{ + if (workflow == 0) { + return std::unique_ptr(new cuAmpcorProcessorTwoPass( + param_, reference_, secondary_, offsetImage_, + snrImage_, covImage_, peakValueImage_, stream_)); + } else if (workflow == 1) { + return std::unique_ptr(new cuAmpcorProcessorOnePass( + param_, reference_, secondary_, offsetImage_, + snrImage_, covImage_, peakValueImage_, stream_)); + } else { + throw std::invalid_argument("Unsupported workflow"); + } +} + +// constructor +cuAmpcorProcessor::cuAmpcorProcessor(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) + : param(param_), referenceImage(reference_), secondaryImage(secondary_), + offsetImage(offsetImage_), snrImage(snrImage_), covImage(covImage_), + peakValueImage(peakValueImage_), stream(stream_) +{ +} + +/// set chunk index +void cuAmpcorProcessor::setIndex(int idxDown_, int idxAcross_) +{ + idxChunkDown = idxDown_; + 
idxChunkAcross = idxAcross_; + idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; + + if(idxChunkDown == param->numberChunkDown -1) { + nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); + } + else { + nWindowsDown = param->numberWindowDownInChunk; + } + + if(idxChunkAcross == param->numberChunkAcross -1) { + nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); + } + else { + nWindowsAcross = param->numberWindowAcrossInChunk; + } +} + +/// obtain the starting pixels for each chip +/// @param[in] oStartPixel start pixel locations for all chips +/// @param[out] rstartPixel start pixel locations for chips within the chunk +void cuAmpcorProcessor::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) +{ + for(int i=0; inumberWindowDownInChunk; ++i) { + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; + for(int j=0; jnumberWindowAcrossInChunk; ++j){ + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; + int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; + } + } +} + diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h new file mode 100644 index 000000000..2e084a67a --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h @@ -0,0 +1,73 @@ +/* + * @file cuAmpcorChunk.h + * @brief Ampcor processor for a batch of windows + * + * + */ + +#ifndef __CUAMPCORPROCESSOR_H +#define __CUAMPCORPROCESSOR_H + +#include "SlcImage.h" +#include "data_types.h" +#include "cuArrays.h" +#include "cuAmpcorParameter.h" +#include "cuOverSampler.h" +#include "cuSincOverSampler.h" +#include 
"cuCorrFrequency.h" +#include "cuCorrNormalizer.h" +#include + + +/** + * cuAmpcor batched processor (virtual class) + */ +class cuAmpcorProcessor{ +// shared variables +protected: + int idxChunkDown; ///< index of the chunk in total batches, down + int idxChunkAcross; ///< index of the chunk in total batches, across + int idxChunk; ///< + int nWindowsDown; ///< number of windows in one chunk, down + int nWindowsAcross; ///< number of windows in one chunk, across + + int devId; ///< GPU device ID to use + cudaStream_t stream; ///< CUDA stream to use + + SlcImage *referenceImage; ///< reference image object + SlcImage *secondaryImage; ///< secondary image object + cuAmpcorParameter *param; ///< reference to the (global) parameters + cuArrays *offsetImage; ///< output offsets image + cuArrays *snrImage; ///< snr image + cuArrays *covImage; ///< cov image + cuArrays *peakValueImage; ///< peak value image + + +public: + // default constructor and destructor + cuAmpcorProcessor(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + virtual ~cuAmpcorProcessor() = default; + + // Factory method (virtual constructor) + static std::unique_ptr create(int workflow, + cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + + // workflow specific methods + virtual void run(int, int) = 0; + +protected: + // shared methods + void setIndex(int idxDown_, int idxAcross_); + void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff); + +}; + +#endif //__CUAMPCORPROCESSOR_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp new file mode 100644 index 000000000..060f6f574 --- /dev/null +++ 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp @@ -0,0 +1,527 @@ +#include "cuAmpcorProcessorOnePass.h" + +#include "cuAmpcorUtil.h" +#include +#include + +/** + * Run ampcor process for a batch of images (a chunk) + * @param[in] idxDown_ index of the chunk along Down/Azimuth direction + * @param[in] idxAcross_ index of the chunk along Across/Range direction + */ +void cuAmpcorProcessorOnePass::run(int idxDown_, int idxAcross_) +{ + // set chunk index + setIndex(idxDown_, idxAcross_); + + // load reference image chunk + loadReferenceChunk(); + // oversample reference + // (deramping included in oversampler) + referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod); + // c_referenceBatchRaw and c_referenceBatchOverSampled now have enlarged size + + int2 offset = make_int2((c_referenceBatchOverSampled->height - r_referenceBatchOverSampled->height)/2, + (c_referenceBatchOverSampled->width - r_referenceBatchOverSampled->width)/2); + // extract and take amplitudes + cuArraysCopyExtractAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, offset, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw reference image(s) + c_referenceBatchRaw->outputToFile("c_referenceBatchRaw", stream); + // dump the oversampled reference image(s) + c_referenceBatchOverSampled->outputToFile("c_referenceBatchOverSampled", stream); + r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampled", stream); +#endif + + // compute and subtract the mean value + cuArraysSubtractMean(r_referenceBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled reference image(s) with mean subtracted + r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream); +#endif + + // load secondary image chunk to c_secondaryBatchRaw + loadSecondaryChunk(); + // oversampling the secondary image(s) + secondaryBatchOverSampler->execute(c_secondaryBatchRaw, 
c_secondaryBatchOverSampled, param->derampMethod); + // take amplitudes + cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw secondary image + c_secondaryBatchRaw->outputToFile("c_secondaryBatchRaw", stream); + // dump the oversampled secondary image(s) + c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream); + r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream); +#endif + + // correlate oversampled images + if(param->algorithm == 0) { + cuCorrFreqDomain_OverSampled->execute(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatch); + } + else { + cuCorrTimeDomain(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatch, stream); + } + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface (un-normalized) + r_corrBatch->outputToFile("r_corrBatch", stream); +#endif + + // normalize the correlation surface + corrNormalizerOverSampled->execute(r_corrBatch, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface (normalized) + r_corrBatch->outputToFile("r_corrBatchNormed", stream); +#endif + + // find the maximum location of the correlation surface, in a rectangle area {range} from {start} + int extraPadSize = param->halfZoomWindowSizeRaw*param->rawDataOversamplingFactor; + int2 start = make_int2(extraPadSize, extraPadSize); + int2 range = make_int2(r_corrBatch->height-2*extraPadSize, r_corrBatch->width-2*extraPadSize); + cuArraysMaxloc2D(r_corrBatch, start, range, offsetInit, r_maxval, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the max location and value + offsetInit->outputToFile("i_offsetInit", stream); + r_maxval->outputToFile("r_maxvalInit", stream); +#endif + + // extract a smaller chip around the peak {offsetInit} + // with extra pads, i_corrBatchValidCount is no longer needed + cuArraysCopyExtractCorr(r_corrBatch, 
r_corrBatchZoomIn, offsetInit, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the extracted correlation Surface + r_corrBatchZoomIn->outputToFile("r_corrBatchZoomIn", stream); +#endif + + // statistics of correlation surface + // estimate variance on r_corrBatch + cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, param->rawDataOversamplingFactor, r_covValue, stream); + + // snr on the extracted surface r_corrBatchZoomIn + cuArraysSumSquare(r_corrBatchZoomIn, r_corrBatchSum, stream); + int corrSurfaceSize = r_corrBatch->height*r_corrBatch->width; + cuEstimateSnr(r_corrBatchSum, r_maxval, r_snrValue, corrSurfaceSize, stream); + +#ifdef CUAMPCOR_DEBUG + r_snrValue->outputToFile("r_snrValue", stream); + r_covValue->outputToFile("r_covValue", stream); +#endif + + // oversample the correlation surface + if(param->corrSurfaceOverSamplingMethod) { + // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor + // we need the max loc as the center if shifted + corrSincOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled, + maxLocShift, param->corrSurfaceOverSamplingFactor*param->rawDataOversamplingFactor + ); + + } + else { + corrOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled); + } + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface + r_corrBatchZoomInOverSampled->outputToFile("r_corrBatchZoomInOverSampled", stream); +#endif + + //find the max again, within the range of \pm 1 pixel * totalOS + cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, param->corrZoomInOversampledSearchStart, param->corrZoomInOversampledSearchRange, offsetZoomIn, corrMaxValue, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the max location on oversampled correlation surface + offsetZoomIn->outputToFile("i_offsetZoomIn", stream); + corrMaxValue->outputToFile("r_maxvalZoomInOversampled", stream); +#endif + + // determine the final offset from initial (pixel/2) and oversampled (sub-pixel) + 
cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, + make_int2(param->corrWindowSize.x/2, param->corrWindowSize.y/2), // init offset origin + param->rawDataOversamplingFactor, // init offset factor + make_int2(param->corrZoomInSize.x/2*param->corrSurfaceOverSamplingFactor, param->corrZoomInSize.y/2*param->corrSurfaceOverSamplingFactor), + param->rawDataOversamplingFactor*param->corrSurfaceOverSamplingFactor, + stream); + +#ifdef CUAMPCOR_DEBUG + // dump the final offset + offsetFinal->outputToFile("i_offsetFinal", stream); +#endif + + // Insert the chunk results to final images + cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // snr + cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // Variance. + cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // peak value. 
+ cuArraysCopyInsert(r_maxval, peakValueImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // all done +} + + + +/// constructor +cuAmpcorProcessorOnePass::cuAmpcorProcessorOnePass(cuAmpcorParameter *param_, SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) + : cuAmpcorProcessor(param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_) +{ + + // allocate the SLC image read buffer + ChunkOffsetDown = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetDown->allocate(); + ChunkOffsetDown->allocateHost(); + ChunkOffsetAcross = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetAcross->allocate(); + ChunkOffsetAcross->allocateHost(); + + c_referenceBatchRaw = new cuArrays ( + param->windowSizeHeightRawEnlarged, param->windowSizeWidthRawEnlarged, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchRaw->allocate(); + + c_secondaryBatchRaw = new cuArrays ( + param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchRaw->allocate(); + + c_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeightEnlarged, param->windowSizeWidthEnlarged, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchOverSampled->allocate(); + + c_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchOverSampled->allocate(); + + r_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeight, param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + 
r_referenceBatchOverSampled->allocate(); + + r_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_secondaryBatchOverSampled->allocate(); + + referenceBatchOverSampler = new cuOverSamplerC2C( + c_referenceBatchRaw->height, c_referenceBatchRaw->width, //orignal size + c_referenceBatchOverSampled->height, c_referenceBatchOverSampled->width, //oversampled size + c_referenceBatchRaw->count, stream); + + secondaryBatchOverSampler = new cuOverSamplerC2C( + c_secondaryBatchRaw->height, c_secondaryBatchRaw->width, + c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, + c_secondaryBatchRaw->count, stream); + + r_corrBatch = new cuArrays ( + param->corrWindowSize.x, + param->corrWindowSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatch->allocate(); + + r_corrBatchZoomIn = new cuArrays ( + param->corrZoomInSize.x, + param->corrZoomInSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomIn->allocate(); + + r_corrBatchZoomInOverSampled = new cuArrays ( + param->corrZoomInOversampledSize.x, + param->corrZoomInOversampledSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomInOverSampled->allocate(); + + offsetInit = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetInit->allocate(); + + offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetZoomIn->allocate(); + + offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetFinal->allocate(); + + // the sinc interpolation center is at the correlation surface center + // E.g. (10, 10) for 21x21. 
+ maxLocShift = make_int2(param->corrWindowSize.x/2, param->corrWindowSize.y/2); + + corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + corrMaxValue->allocate(); + + r_corrBatchSum = new cuArrays ( + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchSum->allocate(); + + i_maxloc = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + i_maxloc->allocate(); + + r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_maxval->allocate(); + + r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_snrValue->allocate(); + + r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_covValue->allocate(); + + // end of new arrays + + if(param->corrSurfaceOverSamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->corrSurfaceOverSamplingFactor, stream); + } + else { + corrOverSampler= new cuOverSamplerR2R( + r_corrBatchZoomIn->height, r_corrBatchZoomIn->width, + r_corrBatchZoomInOverSampled->height, r_corrBatchZoomInOverSampled->width, + param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + stream); + } + if(param->algorithm == 0) { + cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, + stream); + } + + + corrNormalizerOverSampled = + std::unique_ptr(newCuNormalizer( + param->searchWindowSizeHeight, + param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + )); + + +#ifdef CUAMPCOR_DEBUG + std::cout << "all objects in chunk are created ...\n"; +#endif +} + +void cuAmpcorProcessorOnePass::loadReferenceChunk() +{ + + // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw + // then copy to a batch of windows with 
(nImages, height, width) (leading dimension on the right) + + // get the chunk size to be loaded to gpu + int startD = param->referenceChunkStartPixelDown[idxChunk]; //start pixel down (along height) + int startA = param->referenceChunkStartPixelAcross[idxChunk]; // start pixel across (along width) + int height = param->referenceChunkHeight[idxChunk]; // number of pixels along height + int width = param->referenceChunkWidth[idxChunk]; // number of pixels along width + +#ifdef CUAMPCOR_DEBUG + std::cout << "loading reference chunk ...\n " + << " index: " << idxChunk << " " + << "starting pixel: (" << startD << ", " << startA << ") " + << "size : (" << height << ", " << width << ")" + << "\n"; +#endif + + + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_referenceBatchRaw->setZero(stream); + } + else + { + // use cpu to compute the starting positions for each window + getRelativeOffset(ChunkOffsetDown->hostData, param->referenceStartPixelDown, param->referenceChunkStartPixelDown[idxChunk]); + // copy the positions to gpu + ChunkOffsetDown->copyToDevice(stream); + // same for the across direction + getRelativeOffset(ChunkOffsetAcross->hostData, param->referenceStartPixelAcross, param->referenceChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + +#ifdef CUAMPCOR_DEBUG + std::cout << "loading reference windows from chunk debug ... 
\n"; + auto * startPixelDownToChunk = ChunkOffsetDown->hostData; + auto * startPixelAcrossToChunk = ChunkOffsetAcross->hostData; + + for(int i=0; inumberWindowDownInChunk; ++i) { + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; + for(int j=0; jnumberWindowAcrossInChunk; ++j){ + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; + int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + std::cout << "Window index in chuck: (" << iDown << ", " << iAcross << ") \n"; + std::cout << " Staring pixel location from raw: (" << param->referenceStartPixelDown[idxInAll] << ", " + << param->referenceStartPixelAcross[idxInAll] <<")\n"; + std::cout << " Staring pixel location from chunk: (" << startPixelDownToChunk[idxInChunk] << ", " + << startPixelAcrossToChunk[idxInChunk] <<")\n"; + + } + } +#endif + + // check whether the image is complex (e.g., SLC) or real( e.g. 
TIFF) + if(param->referenceImageDataType==2) + { + // allocate a gpu buffer to load data from cpu/file + // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 + c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + c_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); + + //copy the chunk to a batch format (nImages, height, width) + // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + // deallocate the gpu buffer + delete c_referenceChunkRaw; + } + // if the image is real + else { + r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + r_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); + + // copy the chunk (real) to a batch format (complex) + cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // deallocate the gpu buffer + delete r_referenceChunkRaw; + } // end of if complex + } // end of if all pixels out of range +} + +void cuAmpcorProcessorOnePass::loadSecondaryChunk() +{ + // get the chunk size to be loaded to gpu + int height = 
param->secondaryChunkHeight[idxChunk]; // number of pixels along height + int width = param->secondaryChunkWidth[idxChunk]; // number of pixels along width + + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_secondaryBatchRaw->setZero(stream); + } + else + { + //copy to a batch format (nImages, height, width) + getRelativeOffset(ChunkOffsetDown->hostData, param->secondaryStartPixelDown, param->secondaryChunkStartPixelDown[idxChunk]); + ChunkOffsetDown->copyToDevice(stream); + getRelativeOffset(ChunkOffsetAcross->hostData, param->secondaryStartPixelAcross, param->secondaryChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + if(param->secondaryImageDataType==2) + { + c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + c_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + delete c_secondaryChunkRaw; + } + else { //real image + //allocate the gpu buffer + r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + r_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + 
secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + // convert to the batch format + cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + delete r_secondaryChunkRaw; + } + } +} + +// destructor +cuAmpcorProcessorOnePass::~cuAmpcorProcessorOnePass() +{ + corrNormalizerOverSampled.release(); + + if(param->corrSurfaceOverSamplingMethod) { + delete corrSincOverSampler; + } + else { + delete corrOverSampler; + } + if(param->algorithm == 0) { + delete cuCorrFreqDomain_OverSampled; + } + + delete ChunkOffsetDown ; + delete ChunkOffsetAcross ; + delete c_referenceBatchRaw; + delete c_secondaryBatchRaw; + delete c_referenceBatchOverSampled; + delete c_secondaryBatchOverSampled; + delete r_referenceBatchOverSampled; + delete r_secondaryBatchOverSampled; + delete referenceBatchOverSampler; + delete secondaryBatchOverSampler; + + delete r_corrBatch; + delete r_corrBatchZoomIn; + delete r_corrBatchZoomInOverSampled; + delete offsetInit; + delete offsetZoomIn; + delete offsetFinal; + delete corrMaxValue; + + delete r_corrBatchSum; + delete i_maxloc; + delete r_maxval; + delete r_snrValue; + delete r_covValue; + + // end of deletions + +} + +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h new file mode 100644 index 000000000..fb5f1e8b4 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h @@ -0,0 +1,88 @@ +/* + * @file cuAmpcorProcessorOnePass.h + * @brief Ampcor processor for a batch of windows with OnePass workflow + * + * + */ + +#ifndef __CUAMPCORPROCESSOROnePass_H 
+#define __CUAMPCORPROCESSOROnePass_H + +#include "cuAmpcorProcessor.h" + + +/** + * cuAmpcor processor for a chunk (a batch of windows) + */ +class cuAmpcorProcessorOnePass : public cuAmpcorProcessor { + +private: + + // local variables and workers + // gpu buffer to load images from file + // image_complex_type uses original image type, + // convert to complex_type when copied to c_referenceBatchRaw + cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; + cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; + + // offset data + cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; + + // windows raw (not oversampled) data, complex and real + cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw; + // cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; + + // windows oversampled data + cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; + cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; + cuArrays * r_corrBatch, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled; + + // oversampling processors for complex images + cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; + + // oversampling processor for correlation surface + cuOverSamplerR2R *corrOverSampler; + cuSincOverSamplerR2R *corrSincOverSampler; + + // cross-correlation processor with frequency domain algorithm + cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + + // correlation surface normalizer + std::unique_ptr corrNormalizerRaw; + std::unique_ptr corrNormalizerOverSampled; + + // save offset results in different stages + cuArrays *offsetInit; + cuArrays *offsetZoomIn; + cuArrays *offsetFinal; + int2 maxLocShift; // record the maxloc from the extract center + cuArrays *corrMaxValue; + cuArrays *i_maxloc; + cuArrays *r_maxval; + + // SNR estimation + cuArrays *r_corrBatchRawZoomIn; + cuArrays *r_corrBatchSum; + cuArrays *r_snrValue; + + // Variance estimation + cuArrays *r_covValue; + +public: + // constructor + 
cuAmpcorProcessorOnePass(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + // destructor + ~cuAmpcorProcessorOnePass() override; + + // run the given chunk + void run(int, int) override; + + void loadReferenceChunk(); + void loadSecondaryChunk(); +}; + +#endif //__CUAMPCORPROCESSOROnePass_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp similarity index 60% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp index 8f35fa075..b43721ca4 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp @@ -1,12 +1,15 @@ -#include "cuAmpcorChunk.h" +#include "cuAmpcorProcessorTwoPass.h" + #include "cuAmpcorUtil.h" +#include +#include /** * Run ampcor process for a batch of images (a chunk) * @param[in] idxDown_ index of the chunk along Down/Azimuth direction * @param[in] idxAcross_ index of the chunk along Across/Range direction */ -void cuAmpcorChunk::run(int idxDown_, int idxAcross_) +void cuAmpcorProcessorTwoPass::run(int idxDown_, int idxAcross_) { // set chunk index setIndex(idxDown_, idxAcross_); @@ -66,7 +69,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream); // estimate variance - cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_referenceBatchRaw->size, r_covValue, stream); + cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_referenceBatchRaw->size, 1, r_covValue, stream); // estimate SNR // step1: extraction of correlation surface around the peak @@ -171,11 +174,11 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) #endif // oversample the correlation 
surface - if(param->oversamplingMethod) { - // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor + if(param->corrSurfaceOverSamplingMethod) { + // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversampling factor // we need the max loc as the center if shifted corrSincOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled, - maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor + maxLocShift, param->corrSurfaceOverSamplingFactor*param->rawDataOversamplingFactor ); } else { @@ -198,8 +201,8 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel) // = (Init-HalfsearchRange) + ZoomIn/(2*ovs) - cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, - param->oversamplingFactor, param->rawDataOversamplingFactor, + cuSubPixelOffset2Pass(offsetInit, offsetZoomIn, offsetFinal, + param->corrSurfaceOverSamplingFactor, param->rawDataOversamplingFactor, param->halfSearchRangeDownRaw, param->halfSearchRangeAcrossRaw, stream); @@ -209,54 +212,13 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); // Variance. cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); - // Cross-correlation peak - cuArraysCopyInsert(corrMaxValue, corrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // peak value. 
+ cuArraysCopyInsert(r_maxval, peakValueImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); // all done } -/// set chunk index -void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) -{ - idxChunkDown = idxDown_; - idxChunkAcross = idxAcross_; - idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; - - if(idxChunkDown == param->numberChunkDown -1) { - nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); - } - else { - nWindowsDown = param->numberWindowDownInChunk; - } - - if(idxChunkAcross == param->numberChunkAcross -1) { - nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); - } - else { - nWindowsAcross = param->numberWindowAcrossInChunk; - } -} - -/// obtain the starting pixels for each chip -/// @param[in] oStartPixel start pixel locations for all chips -/// @param[out] rstartPixel start pixel locations for chips within the chunk -void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) -{ - for(int i=0; inumberWindowDownInChunk; ++i) { - int iDown = i; - if(i>=nWindowsDown) iDown = nWindowsDown-1; - for(int j=0; jnumberWindowAcrossInChunk; ++j){ - int iAcross = j; - if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; - int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; - int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross - + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; - rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; - } - } -} - -void cuAmpcorChunk::loadReferenceChunk() +void cuAmpcorProcessorTwoPass::loadReferenceChunk() { // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw @@ -268,114 +230,139 @@ void cuAmpcorChunk::loadReferenceChunk() int height = param->referenceChunkHeight[idxChunk]; // number of pixels along height int width = 
param->referenceChunkWidth[idxChunk]; // number of pixels along width - //use cpu to compute the starting positions for each window - getRelativeOffset(ChunkOffsetDown->hostData, ¶m->referenceStartPixelDown[0], param->referenceChunkStartPixelDown[idxChunk]); - // copy the positions to gpu - ChunkOffsetDown->copyToDevice(stream); - // same for the across direction - getRelativeOffset(ChunkOffsetAcross->hostData, ¶m->referenceStartPixelAcross[0], param->referenceChunkStartPixelAcross[idxChunk]); - ChunkOffsetAcross->copyToDevice(stream); - - // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) - if(referenceImage->isComplex()) + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_referenceBatchRaw->setZero(stream); + } + else { - // allocate a gpu buffer to load data from cpu/file - // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 - c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); - c_referenceChunkRaw->allocate(); - - // load the data from cpu - referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); - - //copy the chunk to a batch format (nImages, height, width) - // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data - if(param->derampMethod == 0) { - cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // use cpu to compute the starting positions for each window + getRelativeOffset(ChunkOffsetDown->hostData, param->referenceStartPixelDown, param->referenceChunkStartPixelDown[idxChunk]); + // copy the positions to gpu + ChunkOffsetDown->copyToDevice(stream); + // same for the across direction + getRelativeOffset(ChunkOffsetAcross->hostData, param->referenceStartPixelAcross, 
param->referenceChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) + if(param->referenceImageDataType==2) + { + // allocate a gpu buffer to load data from cpu/file + // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 + + c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + c_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); + + //copy the chunk to a batch format (nImages, height, width) + // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + // deallocate the gpu buffer + delete c_referenceChunkRaw; } + // if the image is real else { - cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - } - // deallocate the gpu buffer - delete c_referenceChunkRaw; - } - // if the image is real - else { - r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); - r_referenceChunkRaw->allocate(); - - // load the data from cpu - referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); - - // copy the chunk (real) to a batch format (complex) - 
cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - // deallocate the gpu buffer - delete r_referenceChunkRaw; - } - - + r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + r_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); + + // copy the chunk (real) to a batch format (complex) + cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // deallocate the gpu buffer + delete r_referenceChunkRaw; + } // end of if complex + } // end of if all pixels out of range } -void cuAmpcorChunk::loadSecondaryChunk() +void cuAmpcorProcessorTwoPass::loadSecondaryChunk() { + // get the chunk size to be loaded to gpu + int height = param->secondaryChunkHeight[idxChunk]; // number of pixels along height + int width = param->secondaryChunkWidth[idxChunk]; // number of pixels along width - //copy to a batch format (nImages, height, width) - getRelativeOffset(ChunkOffsetDown->hostData, ¶m->secondaryStartPixelDown[0], param->secondaryChunkStartPixelDown[idxChunk]); - ChunkOffsetDown->copyToDevice(stream); - getRelativeOffset(ChunkOffsetAcross->hostData, ¶m->secondaryStartPixelAcross[0], param->secondaryChunkStartPixelAcross[idxChunk]); - ChunkOffsetAcross->copyToDevice(stream); - - if(secondaryImage->isComplex()) + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) { - c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); - c_secondaryChunkRaw->allocate(); - - //load a chunk from mmap to gpu - 
secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, - param->secondaryChunkStartPixelDown[idxChunk], - param->secondaryChunkStartPixelAcross[idxChunk], - param->secondaryChunkHeight[idxChunk], - param->secondaryChunkWidth[idxChunk], - stream); - - if(param->derampMethod == 0) { - cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], - c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // yes, simply set the image to 0 + c_secondaryBatchRaw->setZero(stream); + } + else + { + //copy to a batch format (nImages, height, width) + getRelativeOffset(ChunkOffsetDown->hostData, param->secondaryStartPixelDown, param->secondaryChunkStartPixelDown[idxChunk]); + ChunkOffsetDown->copyToDevice(stream); + getRelativeOffset(ChunkOffsetAcross->hostData, param->secondaryStartPixelAcross, param->secondaryChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + if(param->secondaryImageDataType==2) + { + c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + c_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + delete c_secondaryChunkRaw; } - else { - 
cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], + else { //real image + //allocate the gpu buffer + r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + r_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + // convert to the batch format + cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + delete r_secondaryChunkRaw; } - delete c_secondaryChunkRaw; - } - else { //real image - //allocate the gpu buffer - r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); - r_secondaryChunkRaw->allocate(); - - //load a chunk from mmap to gpu - secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, - param->secondaryChunkStartPixelDown[idxChunk], - param->secondaryChunkStartPixelAcross[idxChunk], - param->secondaryChunkHeight[idxChunk], - param->secondaryChunkWidth[idxChunk], - stream); - - // convert to the batch format - cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], - c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - delete r_secondaryChunkRaw; } } /// constructor -cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, - cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, - cuArrays *corrImage_, +cuAmpcorProcessorTwoPass::cuAmpcorProcessorTwoPass(cuAmpcorParameter *param_, SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays 
*covImage_, cuArrays *peakValueImage_, cudaStream_t stream_) - + : cuAmpcorProcessor(param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_) { param = param_; referenceImage = reference_; @@ -383,7 +370,6 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G offsetImage = offsetImage_; snrImage = snrImage_; covImage = covImage_; - corrImage = corrImage_; stream = stream_; @@ -394,47 +380,47 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G ChunkOffsetAcross->allocate(); ChunkOffsetAcross->allocateHost(); - c_referenceBatchRaw = new cuArrays ( + c_referenceBatchRaw = new cuArrays ( param->windowSizeHeightRaw, param->windowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_referenceBatchRaw->allocate(); - c_secondaryBatchRaw = new cuArrays ( + c_secondaryBatchRaw = new cuArrays ( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_secondaryBatchRaw->allocate(); - r_referenceBatchRaw = new cuArrays ( + r_referenceBatchRaw = new cuArrays ( param->windowSizeHeightRaw, param->windowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_referenceBatchRaw->allocate(); - r_secondaryBatchRaw = new cuArrays ( + r_secondaryBatchRaw = new cuArrays ( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_secondaryBatchRaw->allocate(); - c_secondaryBatchZoomIn = new cuArrays ( + c_secondaryBatchZoomIn = new cuArrays ( param->searchWindowSizeHeightRawZoomIn, param->searchWindowSizeWidthRawZoomIn, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_secondaryBatchZoomIn->allocate(); - c_referenceBatchOverSampled = new cuArrays ( + c_referenceBatchOverSampled = new cuArrays ( param->windowSizeHeight, param->windowSizeWidth, 
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_referenceBatchOverSampled->allocate(); - c_secondaryBatchOverSampled = new cuArrays ( + c_secondaryBatchOverSampled = new cuArrays ( param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_secondaryBatchOverSampled->allocate(); - r_referenceBatchOverSampled = new cuArrays ( + r_referenceBatchOverSampled = new cuArrays ( param->windowSizeHeight, param->windowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_referenceBatchOverSampled->allocate(); - r_secondaryBatchOverSampled = new cuArrays ( + r_secondaryBatchOverSampled = new cuArrays ( param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_secondaryBatchOverSampled->allocate(); @@ -447,21 +433,21 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G secondaryBatchOverSampler = new cuOverSamplerC2C(c_secondaryBatchZoomIn->height, c_secondaryBatchZoomIn->width, c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream); - r_corrBatchRaw = new cuArrays ( + r_corrBatchRaw = new cuArrays ( param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchRaw->allocate(); - r_corrBatchZoomIn = new cuArrays ( + r_corrBatchZoomIn = new cuArrays ( param->searchWindowSizeHeight - param->windowSizeHeight+1, param->searchWindowSizeWidth - param->windowSizeWidth+1, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchZoomIn->allocate(); - r_corrBatchZoomInAdjust = new cuArrays ( + r_corrBatchZoomInAdjust = new cuArrays ( param->searchWindowSizeHeight - param->windowSizeHeight, param->searchWindowSizeWidth - param->windowSizeWidth, 
param->numberWindowDownInChunk, @@ -469,9 +455,9 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G r_corrBatchZoomInAdjust->allocate(); - r_corrBatchZoomInOverSampled = new cuArrays ( - param->zoomWindowSize * param->oversamplingFactor, - param->zoomWindowSize * param->oversamplingFactor, + r_corrBatchZoomInOverSampled = new cuArrays ( + param->zoomWindowSize * param->corrSurfaceOverSamplingFactor, + param->zoomWindowSize * param->corrSurfaceOverSamplingFactor, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchZoomInOverSampled->allocate(); @@ -482,18 +468,18 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); offsetZoomIn->allocate(); - offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); offsetFinal->allocate(); maxLocShift = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); maxLocShift->allocate(); - corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); corrMaxValue->allocate(); // new arrays due to snr estimation - r_corrBatchRawZoomIn = new cuArrays ( + r_corrBatchRawZoomIn = new cuArrays ( param->corrRawZoomInHeight, param->corrRawZoomInWidth, param->numberWindowDownInChunk, @@ -508,7 +494,7 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G i_corrBatchZoomInValid->allocate(); - r_corrBatchSum = new cuArrays ( + r_corrBatchSum = new cuArrays ( param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchSum->allocate(); @@ -522,27 +508,27 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, 
G i_maxloc->allocate(); - r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_maxval->allocate(); - r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_snrValue->allocate(); - r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_covValue->allocate(); // end of new arrays - if(param->oversamplingMethod) { - corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); + if(param->corrSurfaceOverSamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->corrSurfaceOverSamplingFactor, stream); } else { corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize, - (param->zoomWindowSize)*param->oversamplingFactor, - (param->zoomWindowSize)*param->oversamplingFactor, + (param->zoomWindowSize)*param->corrSurfaceOverSamplingFactor, + (param->zoomWindowSize)*param->corrSurfaceOverSamplingFactor, param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, stream); } @@ -557,17 +543,18 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G stream); } - corrNormalizerRaw = new cuNormalizer( + corrNormalizerRaw = std::unique_ptr(newCuNormalizer( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - ); + )); - corrNormalizerOverSampled = new cuNormalizer( + corrNormalizerOverSampled = + std::unique_ptr(newCuNormalizer( param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - ); + )); #ifdef CUAMPCOR_DEBUG @@ -576,8 +563,57 @@ 
cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G } // destructor -cuAmpcorChunk::~cuAmpcorChunk() +cuAmpcorProcessorTwoPass::~cuAmpcorProcessorTwoPass() { + corrNormalizerOverSampled.release(); + corrNormalizerRaw.release(); + + if(param->corrSurfaceOverSamplingMethod) { + delete corrSincOverSampler; + } + else { + delete corrOverSampler; + } + if(param->algorithm == 0) { + delete cuCorrFreqDomain; + delete cuCorrFreqDomain_OverSampled; + } + + delete ChunkOffsetDown ; + delete ChunkOffsetAcross ; + delete c_referenceBatchRaw; + delete c_secondaryBatchRaw; + delete r_referenceBatchRaw; + delete r_secondaryBatchRaw; + delete c_secondaryBatchZoomIn; + delete c_referenceBatchOverSampled; + delete c_secondaryBatchOverSampled; + delete r_referenceBatchOverSampled; + delete r_secondaryBatchOverSampled; + delete referenceBatchOverSampler; + delete secondaryBatchOverSampler; + + delete r_corrBatchRaw; + delete r_corrBatchZoomIn; + delete r_corrBatchZoomInAdjust; + delete r_corrBatchZoomInOverSampled; + delete offsetInit; + delete offsetZoomIn; + delete offsetFinal; + delete maxLocShift; + delete corrMaxValue; + + delete r_corrBatchRawZoomIn; + delete i_corrBatchZoomInValid; + delete r_corrBatchSum; + delete i_corrBatchValidCount; + delete i_maxloc; + delete r_maxval; + delete r_snrValue; + delete r_covValue; + + // end of deletions + } // end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h new file mode 100644 index 000000000..de2a29d80 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h @@ -0,0 +1,88 @@ +/* + * @file cuAmpcorProcessorTwoPass.h + * @brief Ampcor processor for a batch of windows with TwoPass workflow + * + * + */ + +#ifndef __CUAMPCORPROCESSORTwoPass_H +#define __CUAMPCORPROCESSORTwoPass_H + +#include "cuAmpcorProcessor.h" + + +/** + * cuAmpcor processor for a chunk (a batch of 
windows) + */ +class cuAmpcorProcessorTwoPass : public cuAmpcorProcessor{ +private: + + // local variables and workers + // gpu buffer to load images from file + cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; + cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; + + // windows raw (not oversampled) data, complex and real + cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn; + cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; + + // windows oversampled data + cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; + cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; + cuArrays * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust; + + // offset data + cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; + + // oversampling processors for complex images + cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; + + // oversampling processor for correlation surface + cuOverSamplerR2R *corrOverSampler; + cuSincOverSamplerR2R *corrSincOverSampler; + + // cross-correlation processor with frequency domain algorithm + cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + + // correlation surface normalizer + std::unique_ptr corrNormalizerRaw; + std::unique_ptr corrNormalizerOverSampled; + + // save offset results in different stages + cuArrays *offsetInit; + cuArrays *offsetZoomIn; + cuArrays *offsetFinal; + cuArrays *maxLocShift; // record the maxloc from the extract center + cuArrays *corrMaxValue; + cuArrays *i_maxloc; + cuArrays *r_maxval; + + // SNR estimation + cuArrays *r_corrBatchRawZoomIn; + cuArrays *r_corrBatchSum; + cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; + cuArrays *r_snrValue; + + // Variance estimation + cuArrays *r_covValue; + +public: + // constructor + cuAmpcorProcessorTwoPass(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, + cuArrays *offsetImage_, cuArrays 
*snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + // destructor + ~cuAmpcorProcessorTwoPass() override; + + // local methods + void loadReferenceChunk(); + void loadSecondaryChunk(); + // run the given chunk + void run(int, int) override; +}; + + + +#endif //__CUAMPCORPROCESSORTwoPass_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h index d56a01c67..111ab79f2 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h @@ -9,6 +9,7 @@ #ifndef __CUAMPCORUTIL_H #define __CUAMPCORUTIL_H +#include "data_types.h" #include "cuArrays.h" #include "cuAmpcorParameter.h" #include "cudaError.h" @@ -18,14 +19,14 @@ //in cuArraysCopy.cu: various utilities for copy images file in gpu memory -void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); -void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); +void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); +void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); +void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); 
+void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); +void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); // same routine name overloaded for different data type // extract data from a large image @@ -33,67 +34,76 @@ template void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offset, cudaStream_t); template void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t); +void cuArraysCopyExtractAbs(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); template void cuArraysCopyInsert(cuArrays *in, cuArrays *out, int offsetX, int offsetY, cudaStream_t); template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); -void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream); +void cuArraysSetConstant(cuArrays *imageIn, real_type value, cudaStream_t stream); -void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream); // cuDeramp.cu: deramping phase -void cuDeramp(int method, cuArrays *images, cudaStream_t stream); -void cuDerampMethod1(cuArrays *images, cudaStream_t stream); +void cuDeramp(int method, cuArrays *images, cudaStream_t stream); +void cuDerampMethod1(cuArrays *images, cudaStream_t stream); // cuArraysPadding.cu: various utilities for oversampling padding -void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysPaddingMany(cuArrays *image1, 
cuArrays *image2, cudaStream_t stream); +void cuArraysFFTPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); //in cuCorrNormalization.cu: utilities to normalize the cross correlation function -void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); -void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); +void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); + +// in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); template -void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); // in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table -void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays 
*secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); //in cuOffset.cu: utitilies for determining the max location of cross correlations or the offset -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream); -void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, - int OverSampleRatioZoomin, int OverSampleRatioRaw, - int xHalfRangeInit, int yHalfRangeInit, - cudaStream_t stream); - -void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, - int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); +void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); +void cuArraysMaxloc2D(cuArrays *images, const int2 start, const int2 range, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); +void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetRoomIn, cuArrays *offsetFinal, const int2 initOrigin, const int initFactor, const int2 zoomInOrigin, const int zoomInFactor, cudaStream_t stream); +void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream); +void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); //in cuCorrTimeDomain.cu: cross correlation in time domain -void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); //in cuCorrFrequency.cu: cross correlation in freq domain, also include fft correlatior class -void cuArraysElementMultiply(cuArrays *image1, cuArrays 
*image2, cudaStream_t stream); -void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, float coef, cudaStream_t stream); +void cuArraysElementMultiply(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, real_type coef, cudaStream_t stream); // For SNR estimation on Correlation surface (Minyan Zhong) // implemented in cuArraysCopy.cu -void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream); +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *maxloc, cudaStream_t stream); +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream); // implemented in cuCorrNormalization.cu -void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream); +void cuArraysSumSquare(cuArrays *images, cuArrays *imagesSum, cudaStream_t stream); +void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream); + // implemented in cuEstimateStats.cu -void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream); +void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cuArrays *snrValue, const int size, cudaStream_t stream); +void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream); // implemented in cuEstimateStats.cu -void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream); +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, const int templateSize, const int distance, cuArrays *covValue, cudaStream_t stream); #endif diff --git 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp similarity index 60% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp index f7cc88837..bb08c3d56 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp @@ -7,6 +7,11 @@ // dependencies #include "cuArrays.h" #include "cudaError.h" +#include "float2.h" +#include "data_types.h" +#include +#include +#include // allocate arrays in device memory template @@ -62,6 +67,34 @@ void cuArrays::setZero(cudaStream_t stream) checkCudaErrors(cudaMemsetAsync(devData, 0, getByteSize(), stream)); } + +// Overloaded << operator for composite type +std::ostream& operator<<(std::ostream& os, const float2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + +std::ostream& operator<<(std::ostream& os, const float3& p) { + + return os << "(" << p.x << ", " << p.y << ", " << p.z << ")"; +} + +std::ostream& operator<<(std::ostream& os, const double2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + +std::ostream& operator<<(std::ostream& os, const double3& p) { + + return os << "(" << p.x << ", " << p.y << ", " << p.z << ")"; +} + +std::ostream& operator<<(std::ostream& os, const int2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + + // output (partial) data when debugging template void cuArrays::debuginfo(cudaStream_t stream) { @@ -87,63 +120,6 @@ void cuArrays::debuginfo(cudaStream_t stream) { } } -// need specializations for x,y components -template<> -void cuArrays::debuginfo(cudaStream_t stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i -void cuArrays::debuginfo(cudaStream_t 
stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i -void cuArrays::debuginfo(cudaStream_t stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i @@ -163,12 +139,16 @@ void cuArrays::outputHostToFile(std::string fn) file.open(fn.c_str(), std::ios_base::binary); file.write((char *)hostData, getByteSize()); file.close(); + std::cout << fn << "size: height " << height << " width "<< width << " count " << count << "\n"; } // instantiations, required by python extensions template class cuArrays; template class cuArrays; template class cuArrays; +template class cuArrays; +template class cuArrays; +template class cuArrays; template class cuArrays; template class cuArrays; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h index b823a6b9d..5fb7878dc 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h @@ -12,14 +12,9 @@ #define __CUARRAYS_H // cuda dependencies -#include #include - #include -#include -#include -#include - +#include template class cuArrays{ @@ -109,5 +104,11 @@ class cuArrays{ }; +std::ostream& operator<<(std::ostream& os, const float2& p); +std::ostream& operator<<(std::ostream& os, const float3& p); +std::ostream& operator<<(std::ostream& os, const double2& p); +std::ostream& operator<<(std::ostream& os, const float3& p); +std::ostream& operator<<(std::ostream& os, const int2& p); + #endif //__CUARRAYS_H //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu index d9e36461b..cb86fd3c6 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu @@ -25,10 +25,11 @@ #include "cudaUtil.h" #include "cudaError.h" #include "float2.h" +#include "data_types.h" // cuda kernel for cuArraysCopyToBatch -__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, - float2 *imageOut, const int outNX, const int outNY, +__global__ void cuArraysCopyToBatch_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImagesX, const int nImagesY, const int strideX, const int strideY) { @@ -40,7 +41,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX int idxImageX = idxImage/nImagesY; int idxImageY = idxImage%nImagesY; int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy; - imageOut[idxOut] = imageIn[idxIn]; + imageOut[idxOut] = make_complex_type(imageIn[idxIn].x, imageIn[idxIn].y); } /** @@ -52,7 +53,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX * @param strideW stride along width to extract chips * @param stream cudaStream */ -void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, +void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream) { const int nthreads = NTHREADS2D; @@ -68,54 +69,113 @@ void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, // kernel for cuArraysCopyToBatchWithOffset template -__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNY, +__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNX, const int inNY, T_out *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { + // get image index int idxImage = 
blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = T_out{imageIn[idxIn]}; + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny +__global__ void cuArraysCopyToBatchWithOffset_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImages, + const int *offsetX, const int *offsetY) +{ + // get image index + int idxImage = blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const 
int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); cuArraysCopyToBatchWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); } // same as above, but from complex to real(take amplitudes) -__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, +__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { + + // get image index int idxImage = blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0); + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, 
cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); cuArraysCopyToBatchAbsWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); @@ -151,24 +211,25 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, * @param strideW (varying) offsets along width to extract chips * @param stream cudaStream */ -void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + cuArraysCopyToBatchWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel"); } //copy a chunk into a series of chips, from complex to real -__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, +__global__ void cuArraysCopyC2R_kernel(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImagesX, const int nImagesY, - const int strideX, const int strideY, const float factor) + const int strideX, const int strideY, const real_type factor) { int idxImage = blockIdx.z; int outx = threadIdx.x + blockDim.x*blockIdx.x; @@ -190,13 +251,13 @@ __global__ void 
cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, co * @param strideW offsets along width to extract chips * @param stream cudaStream */ -void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, +void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - float factor = 1.0f/image1->size; //the FFT factor + real_type factor = 1.0f/image1->size; //the FFT factor cuArraysCopyC2R_kernel<<>> ( image1->devData, image1->height, image1->width, image2->devData, image2->height, image2->width, @@ -242,12 +303,59 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays } // instantiate the above template for the data types we need -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); // correlation surface extraction (Minyan Zhong) -__global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, +__global__ void cuArraysCopyExtractVaryingOffsetCorr(const real_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, + const int2 *maxloc) +{ + + // get the image index + int idxImage = blockIdx.z; + + // One thread per out point. Find the coordinates within the current image. 
+ int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + + // check whether thread is within output image range + if (outx < outNX && outy < outNY) + { + // Find the corresponding input. + int inx = outx + maxloc[idxImage].x - outNX/2; + int iny = outy + maxloc[idxImage].y - outNY/2; + + // Find the location in flattened array. + int idxOut = (idxImage * outNX + outx) * outNY + outy; + int idxIn = (idxImage * inNX + inx) * inNY + iny; + imageOut[idxOut] = imageIn[idxIn]; + } +} + +/** + * copy a tile of images to another image, with starting pixels offsets accouting for boundary + * @param[in] imageIn inut images + * @param[out] imageOut output images of dimension nImages*outNX*outNY + */ +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *maxloc, cudaStream_t stream) +{ + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = 16; + + dim3 threadsperblock(nthreads, nthreads,1); + + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + + cuArraysCopyExtractVaryingOffsetCorr<<>>( + imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, + imagesOut->count, maxloc->devData); + getLastCudaError("cuArraysCopyExtract error"); +} + +__global__ void cuArraysCopyExtractVaryingOffsetCorr(const real_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, const int2 *maxloc) { @@ -289,7 +397,7 @@ __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const * @param[in] imageIn input images * @param[out] imageOut output images of dimension nImages*outNX*outNY */ -void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream) +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, 
cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream) { //assert(imagesIn->height >= imagesOut && inNY >= outNY); const int nthreads = 16; @@ -303,6 +411,7 @@ void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesO getLastCudaError("cuArraysCopyExtract error"); } + // end of correlation surface extraction (Minyan Zhong) @@ -323,8 +432,8 @@ __global__ void cuArraysCopyExtractFixedOffset(const T *imageIn, const int inNX, } } -__global__ void cuArraysCopyExtractFixedOffset(const float2 *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, const int nImages, +__global__ void cuArraysCopyExtractFixedOffset(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { int outx = threadIdx.x + blockDim.x*blockIdx.x; @@ -356,10 +465,43 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, i } // instantiate the above template for the data types we need -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); + + +__global__ void cuArraysCopyExtractAbsFixedOffset(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, + const int offsetX, 
const int offsetY) +{ + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + + if(outx < outNX && outy < outNY) + { + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + imageOut[idxOut] = complexAbs(imageIn[idxIn]); + } +} +/** + * copy/extract images from a large size to + * a smaller size from the location (offsetX, offsetY), take amplitude + */ +void cuArraysCopyExtractAbs(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) +{ + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = NTHREADS2D; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + cuArraysCopyExtractAbsFixedOffset<<>> + (imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); + getLastCudaError("cuArraysCopyExtractAbs error"); +} + + template __global__ void cuArraysCopyInsert_kernel(const T* imageIn, const int inNX, const int inNY, @@ -389,9 +531,9 @@ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX } // instantiate the above template for the data types we need -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int 
offX, int offY, cudaStream_t); template @@ -430,12 +572,12 @@ void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cuda } // instantiate the above template for the data types we need -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); // cuda kernel for setting a constant value -__global__ void cuArraysSetConstant_kernel(float *image, int size, float value) +__global__ void cuArraysSetConstant_kernel(real_type *image, int size, real_type value) { int idx = threadIdx.x + blockDim.x*blockIdx.x; @@ -449,7 +591,7 @@ __global__ void cuArraysSetConstant_kernel(float *image, int size, float value) * Set real images to a constant value * @note use setZero if value=0 because cudaMemset is faster */ -void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream) +void cuArraysSetConstant(cuArrays *imageIn, real_type value, cudaStream_t stream) { const int nthreads = 256; int size = imageIn->getSize(); @@ -460,14 +602,14 @@ void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t str } -// convert float to float2(complex) -__global__ void cuArraysR2C_kernel(float *image1, float2 *image2, int size) +// convert real_type to complex_type(complex) +__global__ void cuArraysR2C_kernel(real_type *image1, complex_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) { image2[idx].x = image1[idx]; - image2[idx].y = 0.0f; + image2[idx].y = 0.0; } } @@ -476,7 +618,7 @@ __global__ void cuArraysR2C_kernel(float *image1, float2 *image2, int size) * 
@param[in] image1 input images * @param[out] image2 output images */ -void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysR2C_kernel<<>>(image1->devData, image2->devData, size); @@ -484,8 +626,8 @@ void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t } -// take real part of float2 to float -__global__ void cuArraysC2R_kernel(float2 *image1, float *image2, int size) +// take real part of complex_type to real_type +__global__ void cuArraysC2R_kernel(complex_type *image1, real_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) @@ -499,7 +641,7 @@ __global__ void cuArraysC2R_kernel(float2 *image1, float *image2, int size) * @param[in] image1 input images * @param[out] image2 output images */ -void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysC2R_kernel<<>>(image1->devData, image2->devData, size); @@ -507,7 +649,7 @@ void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t } // cuda kernel for cuArraysAbs -__global__ void cuArraysAbs_kernel(float2 *image1, float *image2, int size) +__global__ void cuArraysAbs_kernel(complex_type *image1, real_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) @@ -521,7 +663,7 @@ __global__ void cuArraysAbs_kernel(float2 *image1, float *image2, int size) * @param[in] image1 input images * @param[out] image2 output images */ -void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysAbs_kernel<<>>(image1->devData, image2->devData, size); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu 
b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu index af3927056..c3feacf30 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu @@ -1,83 +1,121 @@ /* * @file cuArraysPadding.cu - * @brief Utilities for padding zeros to cuArrays + * @brief Utilities for padding zeros to cuArrays for FFT (zero padding in the middle) */ #include "cuAmpcorUtil.h" #include "float2.h" -// cuda kernel for cuArraysPadding -__global__ void cuArraysPadding_kernel( - const float2 *image1, const int height1, const int width1, - float2 *image2, const int height2, const int width2) -{ - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ty = threadIdx.y + blockDim.y*blockIdx.y; - if(tx < height1/2 && ty < width1/2) - { - int tx1 = height1 - 1 - tx; - int ty1 = width1 -1 -ty; - int tx2 = height2 -1 -tx; - int ty2 = width2 -1 -ty; - - image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)]; - image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)]; - image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)]; - image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)]; - - } -} - -/** - * Padding zeros in the middle, move quads to corners - * @param[in] image1 input images - * @param[out] image2 output images - * @note This routine is for a single image, no longer used - */ -void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream) -{ - int ThreadsPerBlock = NTHREADS2D; - int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); - int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); - dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock); - dim3 dimGrid(BlockPerGridx, BlockPerGridy); - // set output image to 0 - checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); - // copy the quads of input images to four corners of the output images - cuArraysPadding_kernel<<>>( - image1->devData, image1->height, image1->width, 
- image2->devData, image2->height, image2->width); - getLastCudaError("cuArraysPadding_kernel"); -} -inline __device__ float2 cmplxMul(float2 c, float a) +// cuda kernel for zero padding for FFT oversampling, +// for both even and odd sequences +// @param[in] image1 input images +// @param[inout] image2 output images - memset to 0 in prior +// @note siding Nyquist frequency for even length with negative frequency +// for even N - positive f[0, ..., N/2-1], +// zeros 0...0, +// negative f[N/2], f[N/2+1, ..., N-1] +// for odd N - positive f[0, ..., (N-1)/2], +// zeros 0...0, +// negative f [(N+1)/2, ..., N-1] +__global__ void cuArraysPaddingMany_kernel( + const complex_type *image1, const int height1, const int width1, const int size1, + complex_type *image2, const int height2, const int width2, const int size2, const real_type factor ) { - return make_float2(c.x*a, c.y*a); + // thread indices are for input image1 + int x1 = threadIdx.x + blockDim.x*blockIdx.x; + int y1 = threadIdx.y + blockDim.y*blockIdx.y; + int imageIdx = blockIdx.z; + + // ensure threads are in the range of image1 + if (x1 >= height1 || y1 >= width1) + return; + + // determine the quadrants + // divup the length to be consistent with both even and odd lengths + int x2 = (x1 < (height1+1)/2) ? x1 : height2 - height1 + x1; + int y2 = (y1 < (width1+1)/2) ? 
y1 : width2 - width1 + y1; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] + = image1[IDX2R(x1, y1, width1)+imageIdx*size1]*factor; + return; } -// cuda kernel for -__global__ void cuArraysPaddingMany_kernel( - const float2 *image1, const int height1, const int width1, const int size1, - float2 *image2, const int height2, const int width2, const int size2, const float factor ) +// cuda kernel for zero padding for FFT oversampling, +// for both even and odd sequences +// @param[in] image1 input images +// @param[inout] image2 output images - memset to 0 in prior +// @note for even length (height, width or both), +// the Nyquist frequency (N/2) component is split between positive and negative frequencies. +__global__ void cuArraysPaddingMany_split_Nyquist_kernel( + const complex_type *image1, const int height1, const int width1, const int size1, + complex_type *image2, const int height2, const int width2, const int size2, const real_type factor ) { - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ty = threadIdx.y + blockDim.y*blockIdx.y; - if(tx < height1/2 && ty < width1/2) + // thread indices are for input image1 + int x1 = threadIdx.x + blockDim.x*blockIdx.x; + int y1 = threadIdx.y + blockDim.y*blockIdx.y; + int imageIdx = blockIdx.z; + + // ensure threads are in the range of image1 + if (x1 >= height1 || y1 >= width1) + return; + + int xcenter1 = (height1+1)/2; // divup + int ycenter1 = (width1+1)/2; + + // for even N - positive f[0, ..., N/2-1], f[N/2]/2, + // zeros 0...0, + // negative f[N/2]/2, f[N/2+1, ..., N-1] + // f[N/2] is split to N/2 and M-N/2 + // for odd N - positive f[0, ..., (N-1)/2], + // zeros 0...0, + // negative f [(N+1)/2, ..., N-1] + + // split spectrum for even height1 at the center + if (height1 % 2 ==0 && x1 == xcenter1) { - - int tx1 = height1 - 1 - tx; - int ty1 = width1 -1 -ty; - int tx2 = height2 -1 -tx; - int ty2 = width2 -1 -ty; - - int stride1 = blockIdx.z*size1; - int stride2 = blockIdx.z*size2; - - image2[IDX2R(tx, ty, 
width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor; - image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor); - image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor); - image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor); + // if also for the width + if (width1 % 2 ==0 && y1 == ycenter1) + { + // split into 4 + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.25f*factor; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + int x2 = x1 + height2 - height1; + int y2 = y1 + width2 - width1; + image2[IDX2R(x2, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x1, y2, width2)+imageIdx*size2] = input; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] = input; + // all done for this pixel, so return + return; + } + // odd width or not at the center + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.5f*factor; + int x2 = x1 + height2 - height1; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x2, y1, width2)+imageIdx*size2] = input; + return; } + // split spectrum for even width1 + if (width1 % 2 ==0 && y1 == ycenter1) + { + // even height even and at the center has been considered earlier + // so just odd height or not at the center + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.5f*factor; + int y2 = y1 + width2 - width1; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x1, y2, width2)+imageIdx*size2] = input; + return; + } + + // all other cases, simply copy + // determine the quadrants + int x2 = (x1 < xcenter1) ? x1 : height2 - height1 + x1; + int y2 = (y1 < ycenter1) ? 
y1 : width2 - width1 + y1; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] + = image1[IDX2R(x1, y1, width1)+imageIdx*size1]*factor; + return; } /** @@ -86,16 +124,17 @@ __global__ void cuArraysPaddingMany_kernel( * @param[out] image2 output images * @note To keep the band center at (0,0), move quads to corners and pad zeros in the middle */ -void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysFFTPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int ThreadsPerBlock = NTHREADS2D; - int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); - int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); + // up to IDIVUP(dim, 2) for odd-length sequences + int BlockPerGridx = IDIVUP (image1->height, ThreadsPerBlock); + int BlockPerGridy = IDIVUP (image1->width, ThreadsPerBlock); dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1); dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count); checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); - float factor = 1.0f/image1->size; + real_type factor = 1.0f/image1->size; cuArraysPaddingMany_kernel<<>>( image1->devData, image1->height, image1->width, image1->size, image2->devData, image2->height, image2->width, image2->size, factor); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu index 79c11fa2a..215caa723 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu @@ -21,6 +21,16 @@ cuFreqCorrelator::cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaSt int n[NRANK] ={imageNX, imageNY}; // set up fft plans +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, + NULL, 1, imageSize, + NULL, 1, fImageSize, + CUFFT_D2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, n, + NULL, 1, fImageSize, + NULL, 1, imageSize, + CUFFT_Z2D, 
nImages)); +#else cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, @@ -29,16 +39,17 @@ cuFreqCorrelator::cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaSt NULL, 1, fImageSize, NULL, 1, imageSize, CUFFT_C2R, nImages)); +#endif stream = stream_; cufftSetStream(forwardPlan, stream); cufftSetStream(backwardPlan, stream); // set up work arrays - workFM = new cuArrays(imageNX, (imageNY/2+1), nImages); + workFM = new cuArrays(imageNX, (imageNY/2+1), nImages); workFM->allocate(); - workFS = new cuArrays(imageNX, (imageNY/2+1), nImages); + workFS = new cuArrays(imageNX, (imageNY/2+1), nImages); workFS->allocate(); - workT = new cuArrays (imageNX, imageNY, nImages); + workT = new cuArrays (imageNX, imageNY, nImages); workT->allocate(); } @@ -60,38 +71,46 @@ cuFreqCorrelator::~cuFreqCorrelator() * @param[out] results the correlation surfaces */ -void cuFreqCorrelator::execute(cuArrays *templates, cuArrays *images, cuArrays *results) +void cuFreqCorrelator::execute(cuArrays *templates, cuArrays *images, cuArrays *results) { // pad the reference windows to the the size of search windows cuArraysCopyPadded(templates, workT, stream); // forward fft to frequency domain +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecD2Z(forwardPlan, workT->devData, workFM->devData)); + cufft_Error(cufftExecD2Z(forwardPlan, images->devData, workFS->devData)); +#else cufft_Error(cufftExecR2C(forwardPlan, workT->devData, workFM->devData)); cufft_Error(cufftExecR2C(forwardPlan, images->devData, workFS->devData)); +#endif + // cufft doesn't normalize, so manually get the image size for normalization - float coef = 1.0/(images->size); + real_type coef = 1.0/(images->size); // multiply reference with secondary windows in frequency domain cuArraysElementMultiplyConjugate(workFM, workFS, coef, stream); // backward fft to get correlation surface in time domain +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2D(backwardPlan, workFM->devData, 
workT->devData)); +#else cufft_Error(cufftExecC2R(backwardPlan, workFM->devData, workT->devData)); +#endif // extract to get proper size of correlation surface cuArraysCopyExtract(workT, results, make_int2(0, 0), stream); // all done } // a = a^* * b -inline __device__ float2 cuMulConj(float2 a, float2 b) +inline __device__ complex_type cuMulConj(complex_type a, complex_type b, real_type f) { - return make_float2(a.x*b.x + a.y*b.y, -a.y*b.x + a.x*b.y); + return make_complex_type(f*(a.x*b.x + a.y*b.y), f*(-a.y*b.x + a.x*b.y)); } // cuda kernel for cuArraysElementMultiplyConjugate -__global__ void cudaKernel_elementMulConjugate(float2 *ainout, float2 *bin, int size, float coef) +__global__ void cudaKernel_elementMulConjugate(complex_type *ainout, complex_type *bin, int size, real_type coef) { int idx = threadIdx.x + blockIdx.x*blockDim.x; if(idx < size) { - cuComplex prod; - prod = cuMulConj(ainout[idx], bin[idx]); - ainout [idx] = prod*coef; + ainout [idx] = cuMulConj(ainout[idx], bin[idx], coef); } } @@ -101,7 +120,7 @@ __global__ void cudaKernel_elementMulConjugate(float2 *ainout, float2 *bin, int * @param[in] image2, the secondary image * @param[in] coef, usually the normalization factor */ -void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, float coef, cudaStream_t stream) +void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, real_type coef, cudaStream_t stream) { int size = image1->getSize(); int threadsperblock = NTHREADS; @@ -109,4 +128,4 @@ void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays cudaKernel_elementMulConjugate<<>>(image1->devData, image2->devData, size, coef ); getLastCudaError("cuArraysElementMultiply error\n"); } -//end of file \ No newline at end of file +//end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h index ef101e686..d05a58860 100644 --- 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h @@ -8,8 +8,9 @@ #define __CUCORRFREQUENCY_H // dependencies -#include "cudaUtil.h" #include "cuArrays.h" +#include "data_types.h" +#include class cuFreqCorrelator { @@ -18,9 +19,9 @@ class cuFreqCorrelator cufftHandle forwardPlan; cufftHandle backwardPlan; // work data - cuArrays *workFM; - cuArrays *workFS; - cuArrays *workT; + cuArrays *workFM; + cuArrays *workFS; + cuArrays *workT; // cuda stream cudaStream_t stream; @@ -30,8 +31,8 @@ class cuFreqCorrelator // destructor ~cuFreqCorrelator(); // executor - void execute(cuArrays *templates, cuArrays *images, cuArrays *results); + void execute(cuArrays *templates, cuArrays *images, cuArrays *results); }; #endif //__CUCORRFREQUENCY_H -// end of file \ No newline at end of file +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu index b0ba6b731..d5135a23b 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu @@ -20,14 +20,13 @@ */ #include "cuAmpcorUtil.h" -#include #include // sum reduction within a block // the following implementation is compatible for sm_20 and above // newer architectures may support faster implementations, such as warp shuffle, cooperative groups template -__device__ float sumReduceBlock(float sum, volatile float *shmem) +__device__ real_type sumReduceBlock(real_type sum, volatile real_type *shmem) { const int tid = threadIdx.x; shmem[tid] = sum; @@ -53,9 +52,9 @@ __device__ float sumReduceBlock(float sum, volatile float *shmem) // cuda kernel to subtract mean value from the images template -__global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSize, float invSize, int nImages) +__global__ void cuArraysMean_kernel(real_type *images, real_type *image_sum, 
int imageSize, real_type invSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -64,9 +63,9 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi const int imageIdx = bid; const int imageOffset = imageIdx * imageSize; - float *imageD = images + imageOffset; + real_type *imageD = images + imageOffset; - float sum = 0.0f; + real_type sum = 0.0; // perform the reduction beyond one block // save the results for each thread in block for (int i = tid; i < imageSize; i += Nthreads) @@ -74,7 +73,7 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi // reduction within the block sum = sumReduceBlock(sum, shmem); - const float mean = sum * invSize; + const real_type mean = sum * invSize; if(tid ==0) image_sum[bid] = mean; } @@ -84,11 +83,11 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi * @param[out] mean Output mean values * @param[in] stream cudaStream */ -void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStream_t stream) +void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; cuArraysMean_kernel <<>>(images->devData, mean->devData, imageSize, invSize, images->count); getLastCudaError("cuArraysMeanValue kernel error\n"); @@ -96,9 +95,9 @@ void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStrea // cuda kernel to compute and subtracts mean value from the images template -__global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float invSize, int nImages) +__global__ void cuArraysSubtractMean_kernel(real_type *images, int imageSize, real_type invSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; 
const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -107,16 +106,16 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float const int imageIdx = bid; const int imageOffset = imageIdx * imageSize; - float *imageD = images + imageOffset; + real_type *imageD = images + imageOffset; // compute the sum - float sum = 0.0f; + real_type sum = 0.0; for (int i = tid; i < imageSize; i += Nthreads) sum += imageD[i]; sum = sumReduceBlock(sum, shmem); // compute the mean - const float mean = sum * invSize; + const real_type mean = sum * invSize; // subtract the mean from each pixel for (int i = tid; i < imageSize; i += Nthreads) imageD[i] -= mean; @@ -128,11 +127,11 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float * @param[out] mean Output mean values * @param[in] stream cudaStream */ -void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) +void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; cuArraysSubtractMean_kernel <<>>(images->devData, imageSize, invSize, images->count); getLastCudaError("cuArraysSubtractMean kernel error\n"); @@ -141,9 +140,9 @@ void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) // cuda kernel to compute summation on extracted correlation surface (Minyan) template -__global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *imagesSum, int *imagesValidCount, int imageSize, int nImages) +__global__ void cuArraysSum_kernel(real_type *images, real_type *imagesSum, int imageSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -152,11 +151,57 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i const int imageIdx = bid; const int 
imageOffset = imageIdx * imageSize; - float* imageD = images + imageOffset; + real_type* imageD = images + imageOffset; + + real_type sum = 0.0; + + for (int i = tid; i < imageSize; i += Nthreads) { + sum += imageD[i] * imageD[i]; + } + + sum = sumReduceBlock(sum, shmem); + + if(tid ==0) { + imagesSum[bid] = sum; + } +} + +/** + * Compute the variance of images (for SNR) + * @param[in] images Input images + * @param[in] imagesValid validity flags for each pixel + * @param[out] imagesSum variance + * @param[out] imagesValidCount count of total valid pixels + * @param[in] stream cudaStream + */ +void cuArraysSumSquare(cuArrays *images, cuArrays *imagesSum, cudaStream_t stream) +{ + const dim3 grid(images->count, 1, 1); + const int imageSize = images->width*images->height; + + cuArraysSum_kernel <<>>(images->devData, + imagesSum->devData, imageSize, images->count); + getLastCudaError("cuArraysSumValue kernel error\n"); +} + +// cuda kernel to compute summation on extracted correlation surface (Minyan) +template +__global__ void cuArraysSumCorr_kernel(real_type *images, int *imagesValid, real_type *imagesSum, int *imagesValidCount, int imageSize, int nImages) +{ + __shared__ real_type shmem[Nthreads]; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + + if (bid >= nImages) return; + + const int imageIdx = bid; + const int imageOffset = imageIdx * imageSize; + real_type* imageD = images + imageOffset; int* imageValidD = imagesValid + imageOffset; - float sum = 0.0f; - int count = 0; + real_type sum = 0.0f; + real_type count = 0; for (int i = tid; i < imageSize; i += Nthreads) { sum += imageD[i] * imageD[i]; @@ -168,7 +213,7 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i if(tid ==0) { imagesSum[bid] = sum; - imagesValidCount[bid] = count; + imagesValidCount[bid] = (int)count; } } @@ -180,7 +225,7 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i * @param[out] imagesValidCount count of 
total valid pixels * @param[in] stream cudaStream */ -void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, +void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); @@ -191,9 +236,10 @@ void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArra getLastCudaError("cuArraysSumValueCorr kernel error\n"); } + // intra-block inclusive prefix sum template -__device__ void inclusive_prefix_sum(float sum, volatile float *shmem) +__device__ void inclusive_prefix_sum(real_type sum, volatile real_type *shmem) { const int tid = threadIdx.x; shmem[tid] = sum; @@ -212,31 +258,31 @@ __device__ void inclusive_prefix_sum(float sum, volatile float *shmem) // prefix sum of pixel value and pixel value^2 template -__device__ float2 partialSums(const float v, volatile float* shmem, const int stride) +__device__ real2_type partialSums(const real_type v, volatile real_type* shmem, const int stride) { const int tid = threadIdx.x; - volatile float *shMem = shmem + 1; - volatile float *shMem2 = shMem + 1 + (1 << Nthreads2); + volatile real_type *shMem = shmem + 1; + volatile real_type *shMem2 = shMem + 1 + (1 << Nthreads2); inclusive_prefix_sum(v, shMem); inclusive_prefix_sum(v*v, shMem2); - const float Sum = shMem [tid-1 + stride] - shMem [tid-1]; - const float Sum2 = shMem2[tid-1 + stride] - shMem2[tid-1]; - return make_float2(Sum, Sum2); + const real_type Sum = shMem [tid-1 + stride] - shMem [tid-1]; + const real_type Sum2 = shMem2[tid-1 + stride] - shMem2[tid-1]; + return make_real2(Sum, Sum2); } // cuda kernel for cuCorrNormalize template __global__ void cuCorrNormalize_kernel( int nImages, - const float *templateIn, int templateNX, int templateNY, int templateSize, - const float *imageIn, int imageNX, int imageNY, int imageSize, - float *resultOut, int resultNX, int resultNY, int resultSize, - float templateCoeff) + const real_type 
*templateIn, int templateNX, int templateNY, int templateSize, + const real_type *imageIn, int imageNX, int imageNY, int imageSize, + real_type *resultOut, int resultNX, int resultNY, int resultSize, + real_type templateCoeff) { const int Nthreads = 1<(templateSum2, shmem); __syncthreads(); // reset shared memory value - shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0f; + shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0; __syncthreads(); // perform the prefix sum and sum^2 for secondary window // see notes above - float imageSum = 0.0f; - float imageSum2 = 0.0f; + real_type imageSum = 0.0; + real_type imageSum2 = 0.0; int iaddr = 0; const int windowSize = templateNX*imageNY; // iterative till reaching the templateNX row of the secondary window @@ -275,7 +321,7 @@ __global__ void cuCorrNormalize_kernel( while (iaddr < windowSize) { // cum sum for each row with a width=templateNY - const float2 res = partialSums(imageD[iaddr + tid], shmem, templateNY); + const real2_type res = partialSums(imageD[iaddr + tid], shmem, templateNY); // add to the total, which keeps track of the sum of area for each window imageSum += res.x; imageSum2 += res.y; @@ -287,17 +333,17 @@ __global__ void cuCorrNormalize_kernel( if (tid < resultNY) { // normalizing factor - const float norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; + const real_type norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; // normalize the correlation surface - resultD[tid] *= rsqrtf(norm2 + FLT_EPSILON); + resultD[tid] *= rsqrt(norm2 + EPSILON); } // iterative over the rest rows while (iaddr < imageSize) { // the prefix sum of the row removed is recomputed, to be subtracted - const float2 res1 = partialSums(imageD[iaddr-windowSize + tid], shmem, templateNY); + const real2_type res1 = partialSums(imageD[iaddr-windowSize + tid], shmem, templateNY); // the prefix sum of the new row, to be added - const float2 res2 = partialSums(imageD[iaddr + 
tid], shmem, templateNY); + const real2_type res2 = partialSums(imageD[iaddr + tid], shmem, templateNY); imageSum += res2.x - res1.x; imageSum2 += res2.y - res1.y; // move to next row @@ -307,8 +353,8 @@ __global__ void cuCorrNormalize_kernel( { const int ix = iaddr/imageNY; // get row index const int addr = (ix-templateNX)*resultNY; // get the correlation surface row index - const float norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; - resultD[addr + tid] *= rsqrtf(norm2 + FLT_EPSILON); + const real_type norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; + resultD[addr + tid] *= rsqrt(norm2 + EPSILON); } } } @@ -322,12 +368,12 @@ __global__ void cuCorrNormalize_kernel( * @warning The current implementation uses one thread for one column, therefore, * the secondary window width is limited to <=1024, the max threads in a block. */ -void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) +void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) { const int nImages = images->count; const int imageNY = images->width; const dim3 grid(1, 1, nImages); - const float invTemplateSize = 1.0f/templates->size; + const real_type invTemplateSize = 1.0f/templates->size; if (imageNY <= 64) { cuCorrNormalize_kernel< 6><<>>(nImages, @@ -385,11 +431,11 @@ template<> struct Log2<512> { static const int value = 9; }; template<> struct Log2<1024> { static const int value = 10; }; template -void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { const int nImages = correlation->count; const dim3 grid(1, 1, nImages); - const float invReferenceSize = 1.0f/reference->size; + const real_type invReferenceSize = 1.0f/reference->size; cuCorrNormalize_kernel::value><<>>(nImages, reference->devData, 
reference->height, reference->width, reference->size, secondary->devData, secondary->height, secondary->width, secondary->size, @@ -398,20 +444,20 @@ void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *referen getLastCudaError("cuCorrNormalize kernel error"); } -template void cuCorrNormalizeFixed<64>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<64>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<128>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<128>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<256>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<256>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<512>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<512>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<1024>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<1024>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); // end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu index 97b4e064b..656e5ebcb 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu @@ -5,11 +5,14 @@ */ #include + +#if __CUDACC_VER_MAJOR__ >= 11 #include +#endif + // my declarations #include "cuAmpcorUtil.h" -// for FLT_EPSILON -#include + // alias for cuda cooperative groups 
namespace cg = cooperative_groups; @@ -25,16 +28,19 @@ namespace cg = cooperative_groups; * @note use one thread block for each image, blockIdx.x is image index **/ -__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) + +#if __CUDACC_VER_MAJOR__ >= 11 +// use cg::reduce for NVCC 11 and above +__global__ void sum_square_kernel(real_type *sum2, const real_type *images, int n, int batch) { // get block id for each image int imageid = blockIdx.x; - const float *image = images + imageid*n; + const real_type *image = images + imageid*n; // get the thread block cg::thread_block cta = cg::this_thread_block(); // get the shared memory - extern float __shared__ sdata[]; + extern real_type __shared__ sdata[]; // get the current thread int tid = cta.thread_rank(); @@ -53,12 +59,12 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); // reduce in each tile with warp - sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); + sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); cg::sync(cta); // reduce all tiles with thread 0 if(tid == 0) { - float sum = 0.0; + real_type sum = 0.0; for (int i = 0; i < cta.size(); i += tile32.size()) sum += sdata[i]; // assign the value to results @@ -66,6 +72,69 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b } } +#else +// use warp-shuffle reduction for NVCC 9 & 10 +__global__ void sum_square_kernel(real_type *sum2, const real_type *images, int n, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + const real_type *image = images + imageid*n; + + // get the thread block + cg::thread_block cta = cg::this_thread_block(); + // get the shared memory + extern real_type __shared__ sdata[]; + + // get the current thread + unsigned int tid = cta.thread_rank(); + unsigned int blockSize = cta.size(); + + // stride over grid and add the values to the shared 
memory + real_type sum = 0; + + for(int i = tid; i < n; i += blockSize ) { + auto value = image[i]; + sum += value*value; + } + sdata[tid] = sum; + cg::sync(cta); + + // do reduction in shared memory in log2 steps + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = sum = sum + sdata[tid + 256]; + } + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = sum = sum + sdata[tid + 128]; + } + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = sum = sum + sdata[tid + 64]; + } + cg::sync(cta); + + // partition thread block into tiles in size 32 (warp) + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // reduce within warp + if(tid < 32) { + if(blockSize >=64) sum += sdata[tid + 32]; + for (int offset = tile32.size()/2; offset >0; offset /=2) { + sum += tile32.shfl_down(sum, offset); + } + } + + // return results with thread 0 + if(tid == 0) { + // assign the value to results + sum2[imageid] = sum; + } +} +#endif // __CUDACC_VER_MAJOR check + + /** * cuda kernel for 2d sum area table * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images. 
@@ -77,7 +146,7 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b * @param[in] batch number of images **/ -__global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx, int ny, int batch) +__global__ void sat2d_kernel(real_type *sat, real_type * sat2, const real_type *data, int nx, int ny, int batch) { // get block id for each image int imageid = blockIdx.x; @@ -89,13 +158,13 @@ __global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx // the number of rows may be bigger than the number of threads, iterate for (int row = tid; row < nx; row += blockDim.x) { // running sum for value and value^2 - float sum = 0.0f; - float sum2 = 0.0f; + real_type sum = 0.0; + real_type sum2 = 0.0; // starting position for this row int index = (imageid*nx+row)*ny; // iterative over column for (int i=0; i 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; - float topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; - float bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; - float bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + real_type topleft = (tx > 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; + real_type topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + real_type bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + real_type bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; // get the sum - float secondarySum = bottomright + topleft - topright - bottomleft; + real_type secondarySum = bottomright + topleft - topright - bottomleft; // sum of value^2 - const float *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; + const real_type *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; // get sat2 values for four corners topleft = (tx > 0 && ty > 0) ? sat2[(tx-1)*secondaryNY+(ty-1)] : 0.0; topright = (tx > 0 ) ? 
sat2[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; bottomleft = (ty > 0) ? sat2[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; bottomright = sat2[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; - float secondarySum2 = bottomright + topleft - topright - bottomleft; + real_type secondarySum2 = bottomright + topleft - topright - bottomleft; // compute the normalization - float norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; + real_type norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; // normalize the correlation surface - correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrtf(norm2 + FLT_EPSILON); + correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrt(norm2 + EPSILON); } } -void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) { // compute the std of reference image // note that the mean is already subtracted int nthreads = 256; - int sMemSize = nthreads*sizeof(float); + int sMemSize = nthreads*sizeof(real_type); int nblocks = reference->count; sum_square_kernel<<>>(referenceSum2->devData, reference->devData, reference->width * reference->height, reference->count); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp similarity index 51% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp index 783080d90..5c157f16a 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp @@ -7,56 +7,41 @@ #include "cuCorrNormalizer.h" #include 
"cuAmpcorUtil.h" -cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count) +cuNormalizeProcessor* +newCuNormalizer(int secondaryNX, int secondaryNY, int count) { // depending on NY, choose different processor if(secondaryNY <= 64) { - processor = new cuNormalizeFixed<64>(); + return new cuNormalizeFixed<64>(); } else if (secondaryNY <= 128) { - processor = new cuNormalizeFixed<128>(); + return new cuNormalizeFixed<128>(); } else if (secondaryNY <= 256) { - processor = new cuNormalizeFixed<256>(); + return new cuNormalizeFixed<256>(); } else if (secondaryNY <= 512) { - processor = new cuNormalizeFixed<512>(); + return new cuNormalizeFixed<512>(); } else if (secondaryNY <= 1024) { - processor = new cuNormalizeFixed<1024>(); + return new cuNormalizeFixed<1024>(); } else { - processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count); + return new cuNormalizeSAT(secondaryNX, secondaryNY, count); } } -cuNormalizer::~cuNormalizer() -{ - delete processor; -} - -void cuNormalizer::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - processor->execute(correlation, reference, secondary, stream); -} - -/** - * - * - **/ - cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count) { // allocate the work array // reference sum square - referenceSum2 = new cuArrays(1, 1, count); + referenceSum2 = new cuArrays(1, 1, count); referenceSum2->allocate(); // secondary sum and sum square - secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); secondarySAT->allocate(); - secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); secondarySAT2->allocate(); }; @@ -67,16 +52,16 @@ cuNormalizeSAT::~cuNormalizeSAT() delete secondarySAT2; } -void cuNormalizeSAT::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void 
cuNormalizeSAT::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { cuCorrNormalizeSAT(correlation, reference, secondary, referenceSum2, secondarySAT, secondarySAT2, stream); } template -void cuNormalizeFixed::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void cuNormalizeFixed::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { cuCorrNormalizeFixed(correlation, reference, secondary, stream); } diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h index 043accd3a..1b6a33b68 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h @@ -13,7 +13,7 @@ #define __CUNORMALIZER_H #include "cuArrays.h" -#include "cudaUtil.h" +#include "data_types.h" /** * Abstract class interface for correlation surface normalization processor @@ -22,46 +22,34 @@ class cuNormalizeProcessor { public: // default constructor and destructor - cuNormalizeProcessor() {} - ~cuNormalizeProcessor() {} + cuNormalizeProcessor() = default; + virtual ~cuNormalizeProcessor() = default; // execute interface - virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; + virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; }; -class cuNormalizer { -private: - cuNormalizeProcessor *processor; -public: - // disable the default constructor - cuNormalizer() = delete; - // constructor with the secondary dimension - cuNormalizer(int secondaryNX, int secondaryNY, int count); - // destructor - ~cuNormalizer(); - // execute correlation surface normalization - void execute(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cudaStream_t stream); -}; +// factory with the secondary dimension 
+cuNormalizeProcessor* newCuNormalizer(int NX, int NY, int count); template class cuNormalizeFixed : public cuNormalizeProcessor { public: - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; }; class cuNormalizeSAT : public cuNormalizeProcessor { private: - cuArrays *referenceSum2; - cuArrays *secondarySAT; - cuArrays *secondarySAT2; + cuArrays *referenceSum2; + cuArrays *secondarySAT; + cuArrays *secondarySAT2; public: cuNormalizeSAT(int secondaryNX, int secondaryNY, int count); ~cuNormalizeSAT(); - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; }; #endif diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu index 116712986..ae166849d 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu @@ -12,11 +12,11 @@ template __global__ void cuArraysCorrTime_kernel( const int nImages, - const float *templateIn, const int templateNX, const int templateNY, const int templateSize, - const float *imageIn, const int imageNX, const int imageNY, const int imageSize, - float *resultOut, const int resultNX, const int resultNY, const int resultSize) + const real_type *templateIn, const int templateNX, const int templateNY, const int templateSize, + const real_type *imageIn, const int imageNX, const int imageNY, const int imageSize, + real_type *resultOut, const int resultNX, const int resultNY, const int resultSize) { - __shared__ float shmem[nthreads*(1+NPT)]; + __shared__ real_type shmem[nthreads*(1+NPT)]; const int tid = threadIdx.x; const int bid = blockIdx.x; const int yc = 
blockIdx.y*NPT; @@ -26,9 +26,9 @@ __global__ void cuArraysCorrTime_kernel( const int templateOffset = imageIdx * templateSize; const int resultOffset = imageIdx * resultSize; - const float * imageD = imageIn + imageOffset + tid; - const float *templateD = templateIn + templateOffset + tid; - float * resultD = resultOut + resultOffset; + const real_type * imageD = imageIn + imageOffset + tid; + const real_type *templateD = templateIn + templateOffset + tid; + real_type * resultD = resultOut + resultOffset; const int q = min(nthreads/resultNY, 4); const int nt = nthreads/q; @@ -39,18 +39,18 @@ __global__ void cuArraysCorrTime_kernel( const int jbeg = templateNYq * ty; const int jend = ty+1 >= q ? templateNY : templateNYq + jbeg; - float *shTemplate = shmem; - float *shImage = shmem + nthreads; - float *shImage1 = shImage + tx; + real_type *shTemplate = shmem; + real_type *shImage = shmem + nthreads; + real_type *shImage1 = shImage + tx; - float corrCoeff[NPT]; + real_type corrCoeff[NPT]; for (int k = 0; k < NPT; k++) - corrCoeff[k] = 0.0f; + corrCoeff[k] = 0.0; int iaddr = yc*imageNY; - float img[NPT]; + real_type img[NPT]; for (int k = 0; k < NPT-1; k++, iaddr += imageNY) img[k] = imageD[iaddr]; for (int taddr = 0; taddr < templateSize; taddr += templateNY, iaddr += imageNY) @@ -98,15 +98,21 @@ __global__ void cuArraysCorrTime_kernel( * @param[out] results Output correlation surface * @param[in] stream cudaStream */ -void cuCorrTimeDomain(cuArrays *templates, - cuArrays *images, - cuArrays *results, +void cuCorrTimeDomain(cuArrays *templates, + cuArrays *images, + cuArrays *results, cudaStream_t stream) { /* compute correlation matrix */ const int nImages = images->count; const int imageNY = images->width; + +#if defined(CUAMPCOR_DOUBLE) && __CUDA_ARCH__ < 800 + // For GPUs older than A100, static shared memory limit is 48K + const int NPT = 4; +#else const int NPT = 8; +#endif const dim3 grid(nImages, (results->width-1)/NPT+1, 1); diff --git 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu index cd46e7893..5de6e65bb 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu @@ -13,8 +13,8 @@ */ #include "cuArrays.h" -#include "float2.h" -#include +#include "float2.h" +#include "data_types.h" #include "cudaError.h" #include "cudaUtil.h" #include "cuAmpcorUtil.h" @@ -24,23 +24,23 @@ #include -// cuda does not have a good support on volatile vector struct, e.g. float2 -// have to use regular float type for shared memory (volatile) data -// the following methods are defined to operate float2/complex objects through float -inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block) +// cuda does not have a good support on volatile vector struct, e.g. real2_type +// have to use regular real_type type for shared memory (volatile) data +// the following methods are defined to operate real2_type/complex objects through real_type +inline static __device__ void copyToShared(volatile real_type *s, const int i, const real2_type x, const int block) { s[i] = x.x; s[i+block] = x.y; } -inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block) +inline static __device__ void copyFromShared(real2_type &x, volatile real_type *s, const int i, const int block) { x.x = s[i]; x.y = s[i+block]; } -inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block) +inline static __device__ void addInShared(volatile real_type *s, const int i, const int j, const int block) { s[i] += s[i+j]; s[i+block] += s[i+j+block];} -// kernel to do sum reduction for float2 within a block +// kernel to do sum reduction for real2_type within a block template -__device__ void complexSumReduceBlock(float2& sum, volatile float *shmem) +__device__ void complexSumReduceBlock(real2_type& sum, volatile 
real_type *shmem) { const int tid = threadIdx.x; copyToShared(shmem, tid, sum, nthreads); @@ -65,50 +65,56 @@ __device__ void complexSumReduceBlock(float2& sum, volatile float *shmem) // cuda kernel for cuDerampMethod1 template -__global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY, - const int imageSize, const int nImages, const float normCoef) +__global__ void cuDerampMethod1_kernel(real2_type *images, const int imageNX, int const imageNY, + const int imageSize, const int nImages, const real_type normCoef) { - __shared__ float shmem[2*nthreads]; + __shared__ real_type shmem[2*nthreads]; int pixelIdx, pixelIdxX, pixelIdxY; const int bid = blockIdx.x; if(bid >= nImages) return; - float2 *image = images+ bid*imageSize; + real2_type *image = images+ bid*imageSize; const int tid = threadIdx.x; - float2 phaseDiffY = make_float2(0.0f, 0.0f); + real2_type phaseDiffY = make_real2(0.0, 0.0); for (int i = tid; i < imageSize; i += nthreads) { pixelIdxY = i % imageNY; if(pixelIdxY < imageNY -1) { pixelIdx = i; - float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]); + real2_type cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]); phaseDiffY += cprod; } } complexSumReduceBlock(phaseDiffY, shmem); //phaseDiffY *= normCoef; - float phaseY=atan2f(phaseDiffY.y, phaseDiffY.x); + real_type phaseY=atan2(phaseDiffY.y, phaseDiffY.x); - float2 phaseDiffX = make_float2(0.0f, 0.0f); + real2_type phaseDiffX = make_real2(0.0, 0.0); for (int i = tid; i < imageSize; i += nthreads) { pixelIdxX = i / imageNY; if(pixelIdxX < imageNX -1) { pixelIdx = i; - float2 cprod = complexMulConj(image[i], image[i+imageNY]); + real2_type cprod = complexMulConj(image[i], image[i+imageNY]); phaseDiffX += cprod; } } - complexSumReduceBlock(phaseDiffX, shmem); - //phaseDiffX *= normCoef; - float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON - + real_type phaseX = atan2(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON + +#ifdef CUAMPCOR_DEBUG + 
// output the phase ramp in debug mode + if (threadIdx.x==0) + { + printf("debug deramp az: %g rg: %g\n", phaseX, phaseY); + } +#endif + for (int i = tid; i < imageSize; i += nthreads) { pixelIdxX = i%imageNY; pixelIdxY = i/imageNY; - float phase = pixelIdxX*phaseX + pixelIdxY*phaseY; - float2 phase_factor = make_float2(cosf(phase), sinf(phase)); + real_type phase = pixelIdxX*phaseX + pixelIdxY*phaseY; + real2_type phase_factor = make_real2(cosf(phase), sinf(phase)); image[i] *= phase_factor; } } @@ -120,12 +126,12 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co * @param[in,out] images input/output complex signals * @param[in] stream cuda stream */ -void cuDerampMethod1(cuArrays *images, cudaStream_t stream) +void cuDerampMethod1(cuArrays *images, cudaStream_t stream) { const dim3 grid(images->count); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; if(imageSize <=64) { cuDerampMethod1_kernel<64> <<>> @@ -147,7 +153,7 @@ void cuDerampMethod1(cuArrays *images, cudaStream_t stream) } -void cuDeramp(int method, cuArrays *images, cudaStream_t stream) +void cuDeramp(int method, cuArrays *images, cudaStream_t stream) { switch(method) { case 1: diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu index 18412bfe1..e8a92070d 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu @@ -7,7 +7,8 @@ #include "cuArrays.h" #include "float2.h" -#include +#include "float.h" +#include "data_types.h" #include "cudaUtil.h" #include "cudaError.h" #include "cuAmpcorUtil.h" @@ -17,14 +18,45 @@ #include // cuda kernel for cuEstimateSnr -__global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrValidCount, const float* maxval, float* snrValue, const int size) +__global__ void 
cudaKernel_estimateSnr(const real_type* corrSum, const real_type* maxval, real_type* snrValue, const int size, const int nImages) +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + + if (idx >= nImages) return; + + real_type peak = maxval[idx]; + peak *= peak; + real_type mean = (corrSum[idx] - peak) / (size - 1); + snrValue[idx] = peak / mean; +#ifdef CUAMPCOR_DEBUG + if(threadIdx.x==0) + printf("debug snr %g %g %g\n", peak, mean, snrValue[idx]); +#endif +} + +/** + * Estimate the signal to noise ratio (SNR) of the correlation surface + * @param[in] corrSum the sum of the correlation surface + * @param[in] corrValidCount the number of valid pixels contributing to sum + * @param[out] snrValue return snr value + * @param[in] stream cuda stream + */ +void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cuArrays *snrValue, const int size, cudaStream_t stream) +{ + int nImages = corrSum->getSize(); + cudaKernel_estimateSnr<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>> + (corrSum->devData, maxval->devData, snrValue->devData, size, nImages); + getLastCudaError("cuda kernel estimate stats error\n"); +} + +__global__ void cudaKernel_estimateSnr(const real_type* corrSum, const int* corrValidCount, const real_type* maxval, real_type* snrValue, const int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; - float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); + real_type mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); snrValue[idx] = maxval[idx] * maxval[idx] / mean; } @@ -36,7 +68,7 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali * @param[out] snrValue return snr value * @param[in] stream cuda stream */ -void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream) +void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t 
stream) { int size = corrSum->getSize(); @@ -46,8 +78,8 @@ void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuAr } // cuda kernel for cuEstimateVariance -__global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY, const int2* maxloc, - const float* maxval, const int templateSize, float3* covValue, const int size) +__global__ void cudaKernel_estimateVar(const real_type* corrBatchRaw, const int NX, const int NY, const int2* maxloc, + const real_type* maxval, const int templateSize, const int distance, real3_type* covValue, const int size) { // Find image id. @@ -58,53 +90,53 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, // Preparation. int px = maxloc[idxImage].x; int py = maxloc[idxImage].y; - float peak = maxval[idxImage]; + real_type peak = maxval[idxImage]; // Check if maxval is on the margin. - if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY) { + if (px-distance < 0 || py-distance <0 || px + distance >=NX || py+distance >=NY) { - covValue[idxImage] = make_float3(99.0, 99.0, 0.0); + covValue[idxImage] = make_real3(99.0, 99.0, 0.0); } else { int offset = NX * NY * idxImage; - int idx00 = offset + (px - 1) * NY + py - 1; - int idx01 = offset + (px - 1) * NY + py ; - int idx02 = offset + (px - 1) * NY + py + 1; - int idx10 = offset + (px ) * NY + py - 1; + int idx00 = offset + (px - distance) * NY + py - distance; + int idx01 = offset + (px - distance) * NY + py ; + int idx02 = offset + (px - distance) * NY + py + distance; + int idx10 = offset + (px ) * NY + py - distance; int idx11 = offset + (px ) * NY + py ; - int idx12 = offset + (px ) * NY + py + 1; - int idx20 = offset + (px + 1) * NY + py - 1; - int idx21 = offset + (px + 1) * NY + py ; - int idx22 = offset + (px + 1) * NY + py + 1; + int idx12 = offset + (px ) * NY + py + distance; + int idx20 = offset + (px + distance) * NY + py - distance; + int idx21 = offset + (px + distance) * NY + py ; + int idx22 = offset + (px 
+ distance) * NY + py + distance; // second-order derivatives - float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2.0*corrBatchRaw[idx11] ); - float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2.0*corrBatchRaw[idx11] ) ; - float dxy = ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25; + real_type dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2.0*corrBatchRaw[idx11] ); + real_type dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2.0*corrBatchRaw[idx11] ) ; + real_type dxy = ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25; - float n2 = fmaxf(1.0 - peak, 0.0); + real_type n2 = max(1.0 - peak, 0.0); dxx = dxx * templateSize; dyy = dyy * templateSize; dxy = dxy * templateSize; - float n4 = n2*n2; + real_type n4 = n2*n2; n2 = n2 * 2; n4 = n4 * 0.5 * templateSize; - float u = dxy * dxy - dxx * dyy; - float u2 = u*u; + real_type u = dxy * dxy - dxx * dyy; + real_type u2 = u*u; // if the Gaussian curvature is too small if (fabsf(u) < 1e-2) { - covValue[idxImage] = make_float3(99.0, 99.0, 0.0); + covValue[idxImage] = make_real3(99.0, 99.0, 0.0); } else { - float cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2; - float cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2; - float cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2; - covValue[idxImage] = make_float3(cov_xx, cov_yy, cov_xy); + real_type cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2; + real_type cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2; + real_type cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2; + covValue[idxImage] = make_real3(cov_xx, cov_yy, cov_xy); } } } @@ -112,18 +144,19 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, /** * Estimate the variance of the correlation surface * @param[in] templateSize size of reference chip + * @param[in] distance distance between a pixel * 
@param[in] corrBatchRaw correlation surface * @param[in] maxloc maximum location * @param[in] maxval maximum value * @param[out] covValue variance value * @param[in] stream cuda stream */ -void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream) +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, const int templateSize, const int distance, cuArrays *covValue, cudaStream_t stream) { int size = corrBatchRaw->count; // One dimensional launching parameters to loop over every correlation surface. cudaKernel_estimateVar<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>> - (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, templateSize, covValue->devData, size); + (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, templateSize, distance, covValue->devData, size); getLastCudaError("cudaKernel_estimateVar error\n"); } //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu index ac77f131f..f6bca24da 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu @@ -6,11 +6,9 @@ // my module dependencies #include "cuAmpcorUtil.h" -// for FLT_MAX -#include // find the max between two elements -inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc, +inline static __device__ void maxPairReduce(volatile real_type* maxval, volatile int* maxloc, size_t gid, size_t strideid) { if(maxval[gid] < maxval[strideid]) { @@ -19,32 +17,38 @@ inline static __device__ void maxPairReduce(volatile float* maxval, volatile int } } -// max reduction kernel +// max reduction kernel for a 2d image +// start from (start.x, start.y) in a rectangle range (range.x, range.y) +// start + range <= imageSize template 
-__device__ void max_reduction(const float* const images, - const size_t imageSize, - const size_t nImages, - volatile float* shval, +__device__ void max_reduction_2d(const real_type* const image, + const int2 imageSize, + const int2 start, const int2 range, + volatile real_type* shval, volatile int* shloc) { int tid = threadIdx.x; - shval[tid] = -FLT_MAX; - int imageStart = blockIdx.x*imageSize; - int imagePixel; + shval[tid] = -REAL_MAX; // reduction for intra-block elements // i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ... - for(int gid = tid; gid < imageSize; gid+=blockDim.x) + for(int gid = tid; gid < range.x*range.y; gid+=blockDim.x) { - imagePixel = imageStart+gid; - if(shval[tid] < images[imagePixel]) { - shval[tid] = images[imagePixel]; - shloc[tid] = gid; + // gid is the flattened pixel id in the search range + // get the pixel 2d coordinate in whole image + int idx = start.x + gid / range.y; + int idy = start.y + gid % range.y; + // get the flattened 1d coordinate + int pixelId = IDX2R(idx, idy, imageSize.y); + real_type pixelValue = image[pixelId]; + if(shval[tid] < pixelValue) { + shval[tid] = pixelValue; + shloc[tid] = pixelId; } } __syncthreads(); - // reduction within a block + // reduction within a block with shared memory if (BLOCKSIZE >=1024){ if (tid < 512) { maxPairReduce(shval, shloc, tid, tid + 512); } __syncthreads(); } if (BLOCKSIZE >=512) { if (tid < 256) { maxPairReduce(shval, shloc, tid, tid + 256); } __syncthreads(); } if (BLOCKSIZE >=256) { if (tid < 128) { maxPairReduce(shval, shloc, tid, tid + 128); } __syncthreads(); } @@ -65,21 +69,24 @@ __device__ void max_reduction(const float* const images, // kernel for 2D array(image), find max location only template -__global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, float* maxval, - const size_t imageNX, const size_t imageNY, const size_t nImages) +__global__ void cudaKernel_maxloc2D(const real_type* const images, int2* maxloc, real_type* maxval, + const 
int2 imageSize, const size_t nImages, const int2 start, const int2 range) { - __shared__ float shval[BLOCKSIZE]; + __shared__ real_type shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; - int bid = blockIdx.x; - if(bid >= nImages) return; + int imageIdx = blockIdx.x; + if(imageIdx >= nImages) return; - const int imageSize = imageNX * imageNY; - max_reduction(images, imageSize, nImages, shval, shloc); + // get the starting pointer for this image + const real_type *image = images + imageIdx*imageSize.x*imageSize.y; + max_reduction_2d(image, imageSize, start, range, shval, shloc); + + // thread 0 contains the final result, convert it to 2d coordinate if (threadIdx.x == 0) { - maxloc[bid] = make_int2(shloc[0]/imageNY, shloc[0]%imageNY); - maxval[bid] = shval[0]; + maxloc[imageIdx] = make_int2(shloc[0]/imageSize.y, shloc[0]%imageSize.y); + maxval[imageIdx] = shval[0]; } } @@ -88,56 +95,92 @@ __global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, fl * @param[in] images input batch of images * @param[out] maxval arrays to hold the max values * @param[out] maxloc arrays to hold the max locations + * @param[in] start starting search pixel + * @param[in] range search range * @param[in] stream cudaStream * @note This routine is overloaded with the routine without maxval */ -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, - cuArrays *maxval, cudaStream_t stream) +void cuArraysMaxloc2D(cuArrays *images, + const int2 start, const int2 range, + cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream) { cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> - (images->devData, maxloc->devData, maxval->devData, images->height, images->width, images->count); + (images->devData, + maxloc->devData, maxval->devData, + make_int2(images->height, images->width), images->count, + start, range + ); getLastCudaError("cudaKernel find max location 2D error\n"); } -//kernel and function for 2D array(image), find max location only, use overload -template -__global__ 
void cudaKernel_maxloc2D(const float* const images, int2* maxloc, const size_t imageNX, const size_t imageNY, const size_t nImages) -{ - __shared__ float shval[BLOCKSIZE]; - __shared__ int shloc[BLOCKSIZE]; - - int bid = blockIdx.x; - if(bid >= nImages) return; - - const int imageSize = imageNX * imageNY; - max_reduction(images, imageSize, nImages, shval, shloc); - - if (threadIdx.x == 0) { - int xloc = shloc[0]/imageNY; - int yloc = shloc[0]%imageNY; - maxloc[bid] = make_int2(xloc, yloc); - } -} - /** - * Find (only) the maximum location for a batch of 2D images + * Find both the maximum value and the location for a batch of 2D images * @param[in] images input batch of images + * @param[out] maxval arrays to hold the max values * @param[out] maxloc arrays to hold the max locations * @param[in] stream cudaStream - * @note This routine is overloaded with the routine with maxval */ -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream) +void cuArraysMaxloc2D(cuArrays *images, + cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream) { + // if no start and range are provided, use the whole image + int2 start = make_int2(0, 0); + int2 imageSize = make_int2(images->height, images->width); cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> - (images->devData, maxloc->devData, images->height, images->width, images->count); + (images->devData, + maxloc->devData, maxval->devData, + imageSize, images->count, + start, imageSize + ); getLastCudaError("cudaKernel find max location 2D error\n"); } // cuda kernel for cuSubPixelOffset __global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, - float2 *offsetFinal, - const float OSratio, - const float xoffset, const float yoffset, const int size) + real2_type *offsetFinal, const int size, + const int2 initOrigin, const real_type initRatio, + const int2 zoomInOrigin, const real_type zoomInRatio) +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if (idx >= size) return; + 
offsetFinal[idx].x = initRatio*(offsetInit[idx].x-initOrigin.x) + zoomInRatio*(offsetZoomIn[idx].x - zoomInOrigin.x); + offsetFinal[idx].y = initRatio*(offsetInit[idx].y-initOrigin.y) + zoomInRatio*(offsetZoomIn[idx].y - zoomInOrigin.y); +} + + +/** + * Determine the final offset value + * @param[in] offsetInit max location (adjusted to the starting location for extraction) determined from + * the cross-correlation before oversampling, in dimensions of pixel + * @param[in] offsetZoomIn max location from the oversampled cross-correlation surface + * @param[out] offsetFinal the combined offset value + */ +void cuSubPixelOffset(cuArrays *offsetInit, + cuArrays *offsetZoomIn, + cuArrays *offsetFinal, + const int2 initOrigin, const int initFactor, + const int2 zoomInOrigin, const int zoomInFactor, + cudaStream_t stream) +{ + int size = offsetInit->getSize(); + + // GPU performs multiplication faster + real_type initRatio = 1.0f/(real_type)(initFactor); + real_type zoomInRatio = 1.0f/(real_type)(zoomInFactor); + + + cuSubPixelOffset_kernel<<>> + (offsetInit->devData, offsetZoomIn->devData, + offsetFinal->devData, size, initOrigin, initRatio, zoomInOrigin, zoomInRatio); + getLastCudaError("cuSubPixelOffset_kernel"); + +} + +// cuda kernel for cuSubPixelOffset +__global__ void cuSubPixelOffset2Pass_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, + real2_type *offsetFinal, + const real_type OSratio, + const real_type xoffset, const real_type yoffset, const int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; @@ -168,8 +211,8 @@ __global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offs * = subpixel offset * Final offset = pixel size offset + subpixel offset */ -void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, - cuArrays *offsetFinal, +void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, + cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int 
xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream) @@ -179,13 +222,14 @@ void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, float xoffset = xHalfRangeInit ; float yoffset = yHalfRangeInit ; - cuSubPixelOffset_kernel<<>> + cuSubPixelOffset2Pass_kernel<<>> (offsetInit->devData, offsetZoomIn->devData, offsetFinal->devData, OSratio, xoffset, yoffset, size); - getLastCudaError("cuSubPixelOffset_kernel"); + getLastCudaError("cuSubPixelOffset2Pass_kernel"); } + // cuda device function to compute the shift of center static inline __device__ int2 dev_adjustOffset( const int oldRange, const int newRange, const int maxloc) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp similarity index 68% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp index 1b6ab6267..9da46c58e 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp @@ -36,9 +36,9 @@ cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int */ // set up work arrays - workIn = new cuArrays(inNXp2, inNYp2, nImages); + workIn = new cuArrays(inNXp2, inNYp2, nImages); workIn->allocate(); - workOut = new cuArrays(outNXp2, outNYp2, nImages); + workOut = new cuArrays(outNXp2, outNYp2, nImages); workOut->allocate(); // set up fft plans @@ -47,8 +47,13 @@ cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int int fImageSize = inNXp2*inNYp2; int nOverSample[NRANK] = {outNXp2, outNYp2}; int fImageOverSampleSize = outNXp2*outNYp2; +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_Z2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nOverSample, NULL, 1, fImageOverSampleSize, NULL, 1, fImageOverSampleSize, CUFFT_Z2Z, nImages)); +#else 
cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_C2C, nImages)); cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nOverSample, NULL, 1, fImageOverSampleSize, NULL, 1, fImageOverSampleSize, CUFFT_C2C, nImages)); +#endif // set cuda stream setStream(stream_); } @@ -69,12 +74,23 @@ void cuOverSamplerC2C::setStream(cudaStream_t stream_) * @param[out] imagesOut output batch of images * @param[in] method phase deramping method */ -void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut, int method) +void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut, int method) { - cuDeramp(method, imagesIn, stream); - cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_INVERSE )); - cuArraysPaddingMany(workIn, workOut, stream); - cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_FORWARD)); + cuDeramp(method, imagesIn, stream); + + #ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_FORWARD)); + #else + cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_FORWARD)); + #endif + + cuArraysFFTPaddingMany(workIn, workOut, stream); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_INVERSE)); +#else + cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_INVERSE)); +#endif } /// destructor @@ -117,12 +133,17 @@ cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int int fImageSize = inNXp2*inNYp2; int nUpSample[NRANK] = {outNXp2, outNYp2}; int fImageUpSampleSize = outNXp2*outNYp2; - workSizeIn = new cuArrays(inNXp2, inNYp2, nImages); + workSizeIn = new cuArrays(inNXp2, inNYp2, nImages); workSizeIn->allocate(); - workSizeOut = new cuArrays(outNXp2, outNYp2, nImages); + workSizeOut = new cuArrays(outNXp2, outNYp2, nImages); workSizeOut->allocate(); +#ifdef 
CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_Z2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nUpSample, NULL, 1, fImageUpSampleSize, NULL, 1, outNX*outNY, CUFFT_Z2Z, nImages)); +#else cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_C2C, nImages)); cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nUpSample, NULL, 1, fImageUpSampleSize, NULL, 1, outNX*outNY, CUFFT_C2C, nImages)); +#endif setStream(stream); } @@ -138,12 +159,25 @@ void cuOverSamplerR2R::setStream(cudaStream_t stream_) * @param[in] imagesIn input batch of images * @param[out] imagesOut output batch of images */ -void cuOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut) +void cuOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut) { + cuArraysCopyPadded(imagesIn, workSizeIn, stream); - cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_INVERSE)); - cuArraysPaddingMany(workSizeIn, workSizeOut, stream); - cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData,CUFFT_FORWARD )); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_FORWARD)); +#else + cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_FORWARD)); +#endif + + cuArraysFFTPaddingMany(workSizeIn, workSizeOut, stream); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(backwardPlan, workSizeOut->devData, workSizeOut->devData, CUFFT_INVERSE)); +#else + cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData, CUFFT_INVERSE)); +#endif + cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream); } @@ -156,9 +190,4 @@ cuOverSamplerR2R::~cuOverSamplerR2R() workSizeOut->deallocate(); } -// end of file - - - - - +// end of file \ No newline at end of file diff --git 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h index 9ddce96b2..cfc202047 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h @@ -12,7 +12,8 @@ #define __CUOVERSAMPLER_H #include "cuArrays.h" -#include "cudaUtil.h" +#include "data_types.h" +#include // FFT Oversampler for complex images class cuOverSamplerC2C @@ -21,8 +22,8 @@ class cuOverSamplerC2C cufftHandle forwardPlan; // forward fft handle cufftHandle backwardPlan; // backward fft handle cudaStream_t stream; // cuda stream - cuArrays *workIn; // work array to hold forward fft data - cuArrays *workOut; // work array to hold padded data + cuArrays *workIn; // work array to hold forward fft data + cuArrays *workOut; // work array to hold padded data public: // disable the default constructor cuOverSamplerC2C() = delete; @@ -31,7 +32,7 @@ class cuOverSamplerC2C // set cuda stream void setStream(cudaStream_t stream_); // execute oversampling - void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); // destructor ~cuOverSamplerC2C(); }; @@ -43,14 +44,14 @@ class cuOverSamplerR2R cufftHandle forwardPlan; cufftHandle backwardPlan; cudaStream_t stream; - cuArrays *workSizeIn; - cuArrays *workSizeOut; + cuArrays *workSizeIn; + cuArrays *workSizeOut; public: cuOverSamplerR2R() = delete; cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_); void setStream(cudaStream_t stream_); - void execute(cuArrays *imagesIn, cuArrays *imagesOut); + void execute(cuArrays *imagesIn, cuArrays *imagesOut); ~cuOverSamplerR2R(); }; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu index e851eefdb..cc32aedb2 100644 --- 
a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu @@ -24,7 +24,7 @@ cuSincOverSamplerR2R::cuSincOverSamplerR2R(const int i_covs_, cudaStream_t strea stream = stream_; i_intplength = int(r_relfiltlen/r_beta+0.5f); i_filtercoef = i_intplength*i_decfactor; - checkCudaErrors(cudaMalloc((void **)&r_filter, (i_filtercoef+1)*sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&r_filter, (i_filtercoef+1)*sizeof(real_type))); cuSetupSincKernel(); } @@ -35,17 +35,17 @@ cuSincOverSamplerR2R::~cuSincOverSamplerR2R() } // cuda kernel for cuSetupSincKernel -__global__ void cuSetupSincKernel_kernel(float *r_filter_, const int i_filtercoef_, - const float r_soff_, const float r_wgthgt_, const int i_weight_, - const float r_soff_inverse_, const float r_beta_, const float r_decfactor_inverse_) +__global__ void cuSetupSincKernel_kernel(real_type *r_filter_, const int i_filtercoef_, + const real_type r_soff_, const real_type r_wgthgt_, const int i_weight_, + const real_type r_soff_inverse_, const real_type r_beta_, const real_type r_decfactor_inverse_) { int i = threadIdx.x + blockDim.x*blockIdx.x; if(i > i_filtercoef_) return; - float r_wa = i - r_soff_; - float r_wgt = (1.0f - r_wgthgt_) + r_wgthgt_*cos(PI*r_wa*r_soff_inverse_); - float r_s = r_wa*r_beta_*r_decfactor_inverse_*PI; - float r_fct; - if(r_s != 0.0f) { + real_type r_wa = i - r_soff_; + real_type r_wgt = (1.0f - r_wgthgt_) + r_wgthgt_*cos(PI*r_wa*r_soff_inverse_); + real_type r_s = r_wa*r_beta_*r_decfactor_inverse_*PI; + real_type r_fct; + if(r_s != 0.0) { r_fct = sin(r_s)/r_s; } else { @@ -68,10 +68,10 @@ void cuSincOverSamplerR2R::cuSetupSincKernel() const int nblocks = IDIVUP(i_filtercoef+1, nthreads); // compute some commonly used constants at first - float r_wgthgt = (1.0f - r_pedestal)/2.0f; - float r_soff = (i_filtercoef-1.0f)/2.0f; - float r_soff_inverse = 1.0f/r_soff; - float r_decfactor_inverse = 1.0f/i_decfactor; + real_type 
r_wgthgt = (1.0f - r_pedestal)/2.0f; + real_type r_soff = (i_filtercoef-1.0f)/2.0f; + real_type r_soff_inverse = 1.0f/r_soff; + real_type r_decfactor_inverse = 1.0f/i_decfactor; cuSetupSincKernel_kernel<<>> ( r_filter, i_filtercoef, r_soff, r_wgthgt, i_weight, @@ -81,11 +81,12 @@ void cuSincOverSamplerR2R::cuSetupSincKernel() // cuda kernel for cuSincOverSamplerR2R::execute -__global__ void cuSincInterpolation_kernel(const int nImages, - const float * imagesIn, const int inNX, const int inNY, - float * imagesOut, const int outNX, const int outNY, - int2 *centerShift, int factor, - const float * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, +// define a device function to overload the cases with constant or varying center shifts +__device__ void cuSincInterpolation_kernel_common(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, const int i_startX, const int i_startY, const int i_int_size) { // get image index @@ -95,35 +96,33 @@ __global__ void cuSincInterpolation_kernel(const int nImages, int idxY = threadIdx.y + blockDim.y*blockIdx.y; // cuda: to make sure extra allocated threads doing nothing if(idxImage >=nImages || idxX >= i_int_size || idxY >= i_int_size) return; - // decide the center shift - int2 shift = centerShift[idxImage]; // determine the output pixel indices - int outx = idxX + i_startX + shift.x*factor; + int outx = idxX + i_startX + centerShift.x*factor; if (outx >= outNX) outx-=outNX; - int outy = idxY + i_startY + shift.y*factor; + int outy = idxY + i_startY + centerShift.y*factor; if (outy >= outNY) outy-=outNY; // flattened to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; // index in input grids - float r_xout = (float)outx/i_covs_; + real_type r_xout = (real_type)outx/i_covs_; // 
integer part int i_xout = int(r_xout); // factional part - float r_xfrac = r_xout - i_xout; + real_type r_xfrac = r_xout - i_xout; // fractional part in terms of the interpolation kernel grids int i_xfrac = int(r_xfrac*i_decfactor_); // same procedure for y - float r_yout = (float)outy/i_covs_; + real_type r_yout = (real_type)outy/i_covs_; int i_yout = int(r_yout); - float r_yfrac = r_yout - i_yout; + real_type r_yfrac = r_yout - i_yout; int i_yfrac = int(r_yfrac*i_decfactor_); // temp variables - float intpData = 0.0f; // interpolated value - float r_sincwgt = 0.0f; // total filter weight - float r_sinc_coef; // filter weight + real_type intpData = 0.0; // interpolated value + real_type r_sincwgt = 0.0; // total filter weight + real_type r_sinc_coef; // filter weight // iterate over lines of input image // i=0 -> -i_intplength/2 @@ -135,7 +134,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, if(inx < 0) inx+= inNX; if(inx >= inNX) inx-= inNY; - float r_xsinc_coef = r_filter_[i*i_decfactor_+i_xfrac]; + real_type r_xsinc_coef = r_filter_[i*i_decfactor_+i_xfrac]; for(int j=0; j< i_intplength_; j++) { // find the corresponding pixel in input(unsampled) image @@ -143,7 +142,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, if(iny < 0) iny += inNY; if(iny >= inNY) iny -= inNY; - float r_ysinc_coef = r_filter_[j*i_decfactor_+i_yfrac]; + real_type r_ysinc_coef = r_filter_[j*i_decfactor_+i_yfrac]; // multiply the factors from xy r_sinc_coef = r_xsinc_coef*r_ysinc_coef; // add to total sinc weight @@ -156,6 +155,36 @@ __global__ void cuSincInterpolation_kernel(const int nImages, imagesOut[idxOut] = intpData/r_sincwgt; } +//constant shift +__global__ void cuSincInterpolation_kernel(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int 
i_intplength_, + const int i_startX, const int i_startY, const int i_int_size) + { + // pass through + cuSincInterpolation_kernel_common(nImages, imagesIn, inNX, inNY, + imagesOut, outNX, outNY, centerShift, factor, + r_filter_, i_covs_, i_decfactor_, i_intplength_, + i_startX, i_startY, i_int_size); + } + +__global__ void cuSincInterpolation_kernel(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 *centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, + const int i_startX, const int i_startY, const int i_int_size) + { + // get image index + int idxImage = blockIdx.z; + cuSincInterpolation_kernel_common(nImages, imagesIn, inNX, inNY, + imagesOut, outNX, outNY, centerShift[idxImage], factor, + r_filter_, i_covs_, i_decfactor_, i_intplength_, + i_startX, i_startY, i_int_size); + } + /** * Execute sinc interpolation * @param[in] imagesIn input images @@ -165,7 +194,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, * @note rawOversamplingFactor is for the centerShift, not the signal oversampling factor */ -void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, +void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *centerShift, int rawOversamplingFactor) { const int nImages = imagesIn->count; @@ -194,4 +223,33 @@ void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *i getLastCudaError("cuSincInterpolation_kernel"); } +void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, + int2 centerShift, int rawOversamplingFactor) +{ + const int nImages = imagesIn->count; + const int inNX = imagesIn->height; + const int inNY = imagesIn->width; + const int outNX = imagesOut->height; + const int outNY = imagesOut->width; + + // only compute the overampled signals within a window + const int i_int_range = i_sincwindow * 
i_covs; + // set the start pixel, will be shifted by centerShift*oversamplingFactor (from raw image) + const int i_int_startX = outNX/2 - i_int_range; + const int i_int_startY = outNY/2 - i_int_range; + const int i_int_size = 2*i_int_range + 1; + // preset all pixels in out image to 0 + imagesOut->setZero(stream); + + static const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads, 1); + dim3 blockspergrid (IDIVUP(i_int_size, nthreads), IDIVUP(i_int_size, nthreads), nImages); + cuSincInterpolation_kernel<<>>(nImages, + imagesIn->devData, inNX, inNY, + imagesOut->devData, outNX, outNY, + centerShift, rawOversamplingFactor, + r_filter, i_covs, i_decfactor, i_intplength, i_int_startX, i_int_startY, i_int_size); + getLastCudaError("cuSincInterpolation_kernel"); +} + // end of file \ No newline at end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h index 3755c2375..741fb90d8 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h @@ -23,6 +23,7 @@ // dependencites #include "cuArrays.h" #include "cudaUtil.h" +#include "data_types.h" #ifndef PI #define PI 3.14159265359f @@ -34,9 +35,9 @@ class cuSincOverSamplerR2R static const int i_sincwindow = 2; ///< the oversampling is only performed within \pm i_sincwindow*i_covs around the peak static const int i_weight = 1; ///< weight for cos() pedestal - const float r_pedestal = 0.0f; ///< height of pedestal - const float r_beta = 0.75f; ///< a low-band pass - const float r_relfiltlen = 6.0f; ///< relative filter length + const real_type r_pedestal = 0.0; ///< height of pedestal + const real_type r_beta = 0.75f; ///< a low-band pass + const real_type r_relfiltlen = 6.0f; ///< relative filter length static const int i_decfactor = 4096; ///< max decimals between original grid to set up the sinc kernel @@ -44,7 +45,7 @@ class cuSincOverSamplerR2R 
int i_intplength; ///< actual filter length = r_relfiltlen/r_beta int i_filtercoef; //< length of the sinc kernel i_intplength*i_decfactor+1 - float * r_filter; // sinc kernel with size i_filtercoef + real_type * r_filter; // sinc kernel with size i_filtercoef cudaStream_t stream; @@ -54,7 +55,9 @@ class cuSincOverSamplerR2R // set up sinc interpolation coefficients void cuSetupSincKernel(); // execute interface - void execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *center, int oversamplingFactor); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *center, int oversamplingFactor); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, int2 center, int oversamplingFactor); + // destructor ~cuSincOverSamplerR2R(); }; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp new file mode 100644 index 000000000..183fa4cc0 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp @@ -0,0 +1,44 @@ +#include "cudaError.h" + +#include +#include +#include +#include + +#ifdef __DRIVER_TYPES_H__ +#ifndef DEVICE_RESET +#define DEVICE_RESET cudaDeviceReset(); +#endif +#else +#ifndef DEVICE_RESET +#define DEVICE_RESET +#endif +#endif + +template +void check(T result, char const *const func, const char *const file, int const line) +{ + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", + file, line, static_cast(result), func); + DEVICE_RESET + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +template void check(cudaError_t, char const *const, const char *const, int const); +template void check(cufftResult_t, char const *const, const char *const, int const); + +void __getLastCudaError(const char *errorMessage, const char *file, const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + { + fprintf(stderr, "%s(%i) : CUDA error : %s : (%d) %s.\n", + file, line, errorMessage, (int)err, 
cudaGetErrorString(err)); + DEVICE_RESET + exit(EXIT_FAILURE); + } +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h index 5e857cba0..f4f9524be 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h @@ -10,51 +10,13 @@ #pragma once -#include -#include -#include -#include #include "debug.h" -#include - - -#ifdef __DRIVER_TYPES_H__ -#ifndef DEVICE_RESET -#define DEVICE_RESET cudaDeviceReset(); -#endif -#else -#ifndef DEVICE_RESET -#define DEVICE_RESET -#endif -#endif template -void check(T result, char const *const func, const char *const file, int const line) -{ - if (result) - { - - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", - file, line, static_cast(result), func); - DEVICE_RESET - // Make sure we call CUDA Device Reset before exiting - exit(EXIT_FAILURE); - } -} +void check(T result, char const *const func, const char *const file, int const line); // This will output the proper error string when calling cudaGetLastError -inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) -{ - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) - { - fprintf(stderr, "%s(%i) : CUDA error : %s : (%d) %s.\n", - file, line, errorMessage, (int)err, cudaGetErrorString(err)); - DEVICE_RESET - exit(EXIT_FAILURE); - } -} +void __getLastCudaError(const char *errorMessage, const char *file, const int line); // This will output the proper CUDA error strings in the event that a CUDA host call returns an error #ifdef CUDA_ERROR_CHECK diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp new file mode 100644 index 000000000..c96a12947 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp @@ -0,0 +1,58 @@ +#include "cudaUtil.h" + +#include +#include +#include +#include "cudaError.h" + +int gpuDeviceInit(int 
devID) +{ + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0 || devID > device_count - 1) { + fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("Using CUDA Device %d ...\n", devID); + + return devID; +} + +void gpuDeviceList() +{ + int device_count = 0; + int current_device = 0; + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + fprintf(stderr, "Detecting all CUDA devices ...\n"); + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + while (current_device < device_count) { + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + if (deviceProp.computeMode == cudaComputeModeProhibited) { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: " + "device is running in \n", + current_device, deviceProp.name); + } else if (deviceProp.major < 1) { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: " + "device does not support CUDA \n", + current_device, deviceProp.name); + } else { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", + current_device, deviceProp.name); + } + current_device++; + } +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h index cc174e009..7516bc9ca 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h @@ -10,9 +10,6 @@ #ifndef __CUDAUTIL_H #define __CUDAUTIL_H -#include -#include "cudaError.h" - // for 2D FFT #define NRANK 2 @@ -47,12 +44,6 @@ #define MIN(a,b) (a > b ? b: a) #endif -// Float To Int conversion -inline int ftoi(float value) -{ - return (value >= 0 ? 
(int)(value + 0.5) : (int)(value - 0.5)); -} - // compute the next integer in power of 2 inline int nextpower2(int value) { @@ -61,63 +52,11 @@ inline int nextpower2(int value) return r; } - // General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) -{ - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) - { - fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0 || devID > device_count-1) - { - fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("Using CUDA Device %d ...\n", devID); - - return devID; -} +int gpuDeviceInit(int devID); // This function lists all available GPUs -inline void gpuDeviceList() -{ - int device_count = 0; - int current_device = 0; - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - fprintf(stderr, "Detecting all CUDA devices ...\n"); - if (device_count == 0) - { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - while (current_device < device_count) - { - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); - if (deviceProp.computeMode == cudaComputeModeProhibited) - { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device is running in \n", current_device, deviceProp.name); - } - else if (deviceProp.major < 1) - { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device does not support CUDA \n", current_device, deviceProp.name); - } - else { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", current_device, deviceProp.name); - } - current_device++; - } -} +void gpuDeviceList(); #endif //__CUDAUTIL_H //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h new file mode 100644 index 
000000000..134deafdc --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h @@ -0,0 +1,46 @@ +/** + * @file data_types.h + * @brief Define the floating-point data types used by pycuampcor + * + * real_type/complex_type and the make_* helper macros resolve to single- or + * double-precision CUDA types, selected at compile time by CUAMPCOR_DOUBLE. + * image_real_type/image_complex_type stay single precision for image data + * + */ + +// code guard +#ifndef __CUAMPCOR_DATA_TYPES_H +#define __CUAMPCOR_DATA_TYPES_H + +#include <cfloat> + +// disable this for single precision version +// #define CUAMPCOR_DOUBLE + +#ifdef CUAMPCOR_DOUBLE + #define real_type double + #define complex_type double2 + #define real2_type double2 + #define real3_type double3 + #define make_real2 make_double2 + #define make_real3 make_double3 + #define make_complex_type make_double2 + #define EPSILON DBL_EPSILON + #define REAL_MAX DBL_MAX +#else + #define real_type float + #define complex_type float2 + #define real2_type float2 + #define real3_type float3 + #define make_real2 make_float2 + #define make_real3 make_float3 + #define make_complex_type make_float2 + #define EPSILON FLT_EPSILON + #define REAL_MAX FLT_MAX +#endif + +#define image_complex_type float2 +#define image_real_type float + +#endif //__CUAMPCOR_DATA_TYPES_H +//end of file \ No newline at end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h index e43e8e44a..1879bb91e 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h @@ -7,9 +7,10 @@ #ifndef __FLOAT2_H #define __FLOAT2_H -#include +#include +#include -inline __host__ __device__ void zero(float2 &a) { a.x = 0.0f; a.y = 0.0f; } +inline __host__ __device__ void zero(float2 &a) { a.x = 0.0; a.y = 0.0; } // negative inline __host__ __device__ float2 operator-(float2 &a) @@ -125,5 +126,124 @@ inline __host__ __device__ float2 complexExp(float arg) return make_float2(cosf(arg), sinf(arg)); } + +// 
double precision +inline __host__ __device__ void zero(double2 &a) { a.x = 0.0; a.y = 0.0; } + +// negative +inline __host__ __device__ double2 operator-(double2 &a) +{ + return make_double2(-a.x, -a.y); +} + +// complex conjugate +inline __host__ __device__ double2 conjugate(double2 a) +{ + return make_double2(a.x, -a.y); +} + +// addition +inline __host__ __device__ double2 operator+(double2 a, double2 b) +{ + return make_double2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(double2 &a, double2 b) +{ + a.x += b.x; + a.y += b.y; +} + +inline __host__ __device__ double2 operator+(double2 a, double b) +{ + return make_double2(a.x + b, a.y); +} +inline __host__ __device__ void operator+=(double2 &a, double b) +{ + a.x += b; +} + +// subtraction +inline __host__ __device__ double2 operator-(double2 a, double2 b) +{ + return make_double2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(double2 &a, double2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ double2 operator-(double2 a, double b) +{ + return make_double2(a.x - b, a.y); +} +inline __host__ __device__ void operator-=(double2 &a, double b) +{ + a.x -= b; +} + +// multiplication +inline __host__ __device__ double2 operator*(double2 a, double2 b) +{ + return make_double2(a.x*b.x - a.y*b.y, a.y*b.x + a.x*b.y); +} +inline __host__ __device__ void operator*=(double2 &a, double2 b) +{ + const double2 t = make_double2(a.x*b.x - a.y*b.y, a.y*b.x + a.x*b.y); + a = t; +} +inline __host__ __device__ double2 operator*(double2 a, double b) +{ + return make_double2(a.x * b, a.y * b); +} +inline __host__ __device__ void operator*=(double2 &a, double b) +{ + a.x *= b; + a.y *= b; +} +inline __host__ __device__ double2 operator*(double2 a, int b) +{ + return make_double2(a.x * b, a.y * b); +} +inline __host__ __device__ void operator*=(double2 &a, int b) +{ + a.x *= b; + a.y *= b; +} +inline __host__ __device__ double2 complexMul(double2 a, double2 b) +{ + return a*b; +} +inline __host__ 
__device__ double2 complexMulConj(double2 a, double2 b) +{ + return make_double2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y); +} + +inline __host__ __device__ double2 operator/(double2 a, double b) +{ + return make_double2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(double2 &a, double b) +{ + a.x /= b; + a.y /= b; +} + +// abs, arg +inline __host__ __device__ double complexAbs(double2 a) +{ + return sqrt(a.x*a.x+a.y*a.y); +} +inline __host__ __device__ double complexArg(double2 a) +{ + return atan2(a.y, a.x); +} + +// make a complex number from phase +inline __host__ __device__ double2 complexExp(double arg) +{ + return make_double2(cos(arg), sin(arg)); +} + + #endif //__FLOAT2_H -// end of file \ No newline at end of file +// end of file diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp index 7e884ba45..983b78e14 100644 --- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp @@ -32,13 +32,16 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(int, deviceID) .DEF_PARAM(int, nStreams) .DEF_PARAM(int, derampMethod) + .DEF_PARAM(int, workflow) .DEF_PARAM(str, referenceImageName) .DEF_PARAM(int, referenceImageHeight) .DEF_PARAM(int, referenceImageWidth) + .DEF_PARAM(int, referenceImageDataType) .DEF_PARAM(str, secondaryImageName) .DEF_PARAM(int, secondaryImageHeight) .DEF_PARAM(int, secondaryImageWidth) + .DEF_PARAM(int, secondaryImageDataType) .DEF_PARAM(int, numberWindowDown) .DEF_PARAM(int, numberWindowAcross) @@ -51,7 +54,7 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(int, mergeGrossOffset) .DEF_PARAM(str, snrImageName) .DEF_PARAM(str, covImageName) - .DEF_PARAM(str, corrImageName) + .DEF_PARAM(str, peakValueImageName) .DEF_PARAM(int, rawDataOversamplingFactor) .DEF_PARAM(int, corrStatWindowSize) @@ -67,8 +70,8 @@ void 
addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM_RENAME(int, referenceStartPixelAcrossStatic, referenceStartPixelAcross0) .DEF_PARAM_RENAME(int, referenceStartPixelDownStatic, referenceStartPixelDown0) - .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingMethod, oversamplingMethod) - .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingFactor, oversamplingFactor) + .DEF_PARAM(int, corrSurfaceOverSamplingMethod) + .DEF_PARAM(int, corrSurfaceOverSamplingFactor) .DEF_PARAM_RENAME(int, mmapSize, mmapSizeInGB) @@ -78,6 +81,8 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_METHOD(runAmpcor) + .DEF_METHOD(isDoublePrecision) + .def("checkPixelInImageRange", [](const cls& self) { self.param->checkPixelInImageRange(); }) diff --git a/python/packages/nisar/workflows/dense_offsets.py b/python/packages/nisar/workflows/dense_offsets.py index e897b19cd..33d583b2a 100644 --- a/python/packages/nisar/workflows/dense_offsets.py +++ b/python/packages/nisar/workflows/dense_offsets.py @@ -101,7 +101,7 @@ def run(cfg: dict): ampcor.grossOffsetImageName = str(out_dir / 'gross_offset') ampcor.snrImageName = str(out_dir / 'snr') ampcor.covImageName = str(out_dir / 'covariance') - ampcor.corrImageName = str(out_dir / 'correlation_peak') + ampcor.peakValueImageName = str(out_dir / 'correlation_peak') # Create empty ENVI datasets. PyCuAmpcor will overwrite the # binary files. 
Note, use gdal to pass interleave option diff --git a/python/packages/nisar/workflows/offsets_product.py b/python/packages/nisar/workflows/offsets_product.py index ed5ef8358..44d523f2f 100644 --- a/python/packages/nisar/workflows/offsets_product.py +++ b/python/packages/nisar/workflows/offsets_product.py @@ -158,7 +158,7 @@ def run(cfg: dict, output_hdf5: str = None): layer_scratch_path / 'gross_offset') ampcor.snrImageName = str(layer_scratch_path / 'snr') ampcor.covImageName = str(layer_scratch_path / 'covariance') - ampcor.corrImageName = str(layer_scratch_path/ 'correlation_peak') + ampcor.peakValueImageName = str(layer_scratch_path/ 'correlation_peak') create_empty_dataset(str(layer_scratch_path / 'dense_offsets'), ampcor.numberWindowAcross,