Separable 2d convolution with nxn filter

Separable 2d convolution with nxn filter

Does anybody has a simple example of how to perform a 2d separable convolution with an nxn filter.
I do not understand welll how IPP works.

Is it better to add the border to the image and then to use FilterRow and FilterColumnone after the other with the entire image as ROI, or calculate the border on fly using ippiFilterRowBorderPipeline? In the last case how can I deal with the top and bottom border since ippiFilterColumnPipeline does not include the on fly calculation of the borders.

I thinck I miss something, I have studied the example at the end of the section in the documentation, but that one is limitated by using a 3x3 kernel.

Thanks

Emilio

17 posts / 0 new
Last post
For more complete information about compiler optimizations, see our Optimization Notice.

Hi,

You can use ippiFilterRowBorderPipeline and ippiFilterColumnPipeline. For ippiFilterColumnPipeline you have to prepare borders manually

We prepared simple example for you, please take a look on attached source file. (border - ippBorderRepl, kernelSize - 5, anchor - 2)

Regards,
Michael

Hi,

You can use ippiFilterRowBorderPipeline and ippiFilterColumnPipeline. For ippiFilterColumnPipeline you have to prepare borders manually

We prepared simple example for you, please take a look on attached source file. (border - ippBorderRepl, kernelSize - 5, anchor - 2)

Regards,
Michael

Hallo Michael,

Thanks for posting a general example regarding the convolution with separable filters. I am trying to modify your code so that it can be called from within Matlab (in a MEX file). There are a few things that still are not clear to me. I hope you can help me.

I do not clearly understand what this section of the code does:

/* organize dst buffer */
pTmp = (Ipp16s*)(ppDst+size.height);
for(i=0;i ppDst[i] = pTmp;
ppSrc[i+2]=pTmp;
}
/* organize replicate border for ippiFilterColumnPipeline_16s_C1R */
/* top */
ppSrc[0]=ppSrc[2];
ppSrc[1]=ppSrc[2];
/* bottom */
ppSrc[i-1+4]=ppDst[size.height-1];
ppSrc[i-2+4]=ppDst[size.height-1];

I understand that you are arranging the pointers to the image borders for the replication but the whole process is not very clear. I would appreciate some more specific comments i nthe codeso that I can generalize this to any kernel size (I suspect that some coefficients like the 2 and the 4 in ppSrc[2] and ppSrc[i-2+4] are somehow related to the semisize of the filter that you using in this example but it is not clear why and how).

I also include my version of the code so that you may spot major mistakes and bugs:

#ifdef MEX_CONV2_SEP_SINGLE_USE_IPP

inline void CheckIPPStatus(IppStatus status)

{

if(status != ippStsNoErr)

{

mexPrintf("
status = %s ", ippGetStatusString(status));

mexErrMsgTxt("IPP error");

}

}

// Adapted from:

// http://softwarecommunity.intel.com/isn/Community/en-US/forums/5482632/Po...

//

// INTEL CORPORATION PROPRIETARY INFORMATION

// This software is supplied under the terms of a license agreement or

// nondisclosure agreement with Intel Corporation and may not be copied

// or disclosed except in accordance with the terms of that agreement.

// Copyright (c) 2005 Intel Corporation. All Rights Reserved.

//

// Separable 2D convolution example

//

void SepConv(const Ipp32f *pSrcBuffer, Ipp32f *pDstBuffer, IppiSize *size, Ipp32f *hc, int Nc, Ipp32f *hr, int Nr)

{

int sizerow, sizecol, i;

int maxKernelSize = (Nc > Nr) ? Nc : Nr;

Ipp32f **ppDst, **ppSrc, *pTmp;

Ipp8u *pBufferCol, *pBufferRow;

IppStatus status;

//int xAnchor = (Nr >> 1) + 1;

int yAnchor = (Nc >> 1) + 1;

// allocate temporary dst buffer

ppDst = (Ipp32f**)ippsMalloc_8u(size->width*size->height*sizeof(Ipp32f) + (size->height)*sizeof(Ipp32f*));

ppSrc = (Ipp32f**)ippsMalloc_8u((size->height+maxKernelSize)*sizeof(Ipp32f*));

// size of temporary buffers

status = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(*size, maxKernelSize, &sizerow);

CheckIPPStatus(status);

status = ippiFilterColumnPipelineGetBufferSize_32f_C1R(*size, maxKernelSize, &sizecol);

CheckIPPStatus(status);

//mexPrintf("
size = {%d,%d}", size->height, size->width);

// allocate temporary buffers

pBufferCol = ippsMalloc_8u(sizecol);

if (pBufferCol == NULL) mexErrMsgTxt("IPP error: failed to allocate column buffer");

pBufferRow = ippsMalloc_8u(sizerow);

if (pBufferRow == NULL) mexErrMsgTxt("IPP error: failed to allocate row buffer");

// organize dst buffer

pTmp = (Ipp32f*)(ppDst+size->height);

for(i = 0; iheight; i++, pTmp += size->width)

{

ppDst[i] = pTmp;

ppSrc[i+2] = pTmp;

}

// organize replicate border for ippiFilterColumnPipeline_32f_C1R

// top

ppSrc[0] = ppSrc[2];

ppSrc[1] = ppSrc[2];

// bottom

ppSrc[i-1+4] = ppDst[size->height-1];

ppSrc[i-2+4] = ppDst[size->height-1];

// perform the convolution

/*

mexPrintf("
pSrcBuffer = {%f, %f, %f, %f}", pSrcBuffer[0], pSrcBuffer[1], pSrcBuffer[2], pSrcBuffer[3]);

mexPrintf("
hc = {%f, %f, %f, %f}", hc[0], hc[1], hc[2], hc[3]);

mexPrintf("
size = {%d, %d}", size->height, size->width);

mexPrintf("
sizerow = %d, sizecol = %d", sizerow, sizecol);

*/

status = ippiFilterRowBorderPipeline_32f_C1R((const Ipp32f*) pSrcBuffer, size->width*sizeof(Ipp32f), ppDst,

*size, hc, Nc, yAnchor, ippBorderRepl, 0, pBufferRow);

CheckIPPStatus(status);

status = ippiFilterColumnPipeline_32f_C1R((const Ipp32f **)ppSrc, pDstBuffer, size->width*sizeof(Ipp32f), *size,

hr, Nr, pBufferCol);

CheckIPPStatus(status);

ippsF
ree(ppSrc);

ippsFree(ppDst);

ippsFree(pBufferCol);

ippsFree(pBufferRow);

}

#endif

Thanks in advance for your help,

Marco

This is the version of the code that I would expect to generalize the one originally posted by Michael. In particular I modified the section for the organization of the buffers as follows:

// organize dst buffer

// (the offset takes care of the double pointer structure)

pTmp = (Ipp32f*)(ppDst+size->height);

for(i = 0; iheight; i++, pTmp += size->width)

{

ppDst[i] = pTmp;

ppSrc[i+yAnchor] = pTmp;

}

for(i = 0; i < yAnchor; i++)

{

// top

ppSrc[i] = ppSrc[yAnchor];

// bottom

ppSrc[i + size->height] = ppSrc[size->height-1];

}

Unfortunately performing the second convolution (within the MEX file, ippiFilterColumnPipeline_32f_C1R) the status returned by IPP says Null pointer error

I am reposting the original code for your convenience. Note that hxc and hr are the convolution kernels whose lenght is respectively Nc and Nr.

#ifdef MEX_CONV2_SEP_SINGLE_USE_IPP

inline void CheckIPPStatus(IppStatus status)

{

if(status != ippStsNoErr)

{

mexPrintf(" status = %s ", ippGetStatusString(status));

mexErrMsgTxt("IPP error");

}

}

// Adapted from:

// http://softwarecommunity.intel.com/isn/Community/en-US/forums/5482632/PostAttachment.aspx

//

// INTEL CORPORATION PROPRIETARY INFORMATION

// This software is supplied under the terms of a license agreement or

// nondisclosure agreement with Intel Corporation and may not be copied

// or disclosed except in accordance with the terms of that agreement.

// Copyright (c) 2005 Intel Corporation. All Rights Reserved.

//

// Separable 2D convolution example

//

void SepConv(const Ipp32f *pSrcBuffer, Ipp32f *pDstBuffer, IppiSize *size, Ipp32f *hc, int Nc, Ipp32f *hr, int Nr)

{

int sizerow, sizecol, i;

int maxKernelSize = (Nc > Nr) ? Nc : Nr;

Ipp32f **ppDst, **ppSrc, *pTmp;

Ipp8u *pBufferCol, *pBufferRow;

IppStatus status;

//int xAnchor = (Nr >> 1) + 1;

int yAnchor = (Nc >> 1) + 1;

// allocate temporary dst buffer

ppDst = (Ipp32f**)ippsMalloc_8u(size->width*size->height*sizeof(Ipp32f) + (size->height)*sizeof(Ipp32f*));

ppSrc = (Ipp32f**)ippsMalloc_8u((size->height+maxKernelSize)*sizeof(Ipp32f*));

// size of temporary buffers

status = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(*size, maxKernelSize, &sizerow);

CheckIPPStatus(status);

status = ippiFilterColumnPipelineGetBufferSize_32f_C1R(*size, maxKernelSize, &sizecol);

CheckIPPStatus(status);

// allocate temporary buffers

pBufferCol = ippsMalloc_8u(sizecol);

if (pBufferCol == NULL) mexErrMsgTxt("IPP error: failed to allocate column buffer");

pBufferRow = ippsMalloc_8u(sizerow);

if (pBufferRow == NULL) mexErrMsgTxt("IPP error: failed to allocate row buffer");

// organize dst buffer (the offset takes care of the double pointer structure)

pTmp = (Ipp32f*)(ppDst+size->height);

for(i = 0; iheight; i++, pTmp += size->width)

{

ppDst[i] = pTmp;

ppSrc[i+yAnchor] = pTmp;

}

for(i = 0; i < yAnchor; i++)

{

// top

ppSrc[i] = ppSrc[yAnchor];

// bottom

ppSrc[i + size->height] = ppSrc[size->height-1];

}

// perform the convolutions

status = ippiFilterRowBorderPipeline_32f_C1R((const Ipp32f*) pSrcBuffer, size->width*sizeof(Ipp32f), ppDst, *size, hc, Nc, yAnchor, ippBorderRepl, 0, pBufferRow);

CheckIPPStatus(status);

status = ippiFilterColumnPipeline_32f_C1R((const Ipp32f **)ppSrc, pDstBuffer, size->width*sizeof(Ipp32f), *size, hr, Nr, pBufferCol);

CheckIPPStatus(status);

ippsFree(ppSrc);

ippsFree(ppDst);

ippsFree(pBufferCol);

ippsFree(pBufferRow);

}

#endif

The above function is called as:

// in version for the different byte ordering between Matlab and IPP

IppiSize size;

size.height = width;

size.width = height;

SepConv((Ipp32f *)f, (Ipp32f *)g, &size, (Ipp32f *)hc, Nc, (Ipp32f *)hr, Nr);

Again thanks in advance for any useful insight,

Marco

Hi,

Example 9-4 (p 9-55 of IPP manual vol2, Jan 2007) described how to use ippiFilterRow/ColumnBorderPipeline functions to calculate the separable convolution without the intermediate buffer for the whole image.

The ring buffer (the double pointer) for convolved rows is used there. You should befine the border type for row convolution because there is no data outside the image. But border rows for column convolution are formed manually, so you need not extra argument for them.

Eg for the 3x3 convolution with replicate border you need to replicate the pointer to the first convolved row.

Thanks,

Alexander

Dear Alexander,

thanks for your prompt response. I must say that I found the Example 9-4 extremely criptic (30 lines of dense code without a single comment...). However I came up with a routine that seems to achive the task. Note that there are a bunch of tricks to handle kernel sizes that can be even or odd. The flipping of the kernel is meant to adapt my routine with the conv2 routine of Matlab (and for the same reasonone might notice "inversion" between row and columns, since Matlabuses column-major ordering of the data, as in Fortran).

I hope that this can be of some help. I would appreciate any help from the community as far asbugs/improvments are concerned.

#ifdef MEX_CONV2_SEP_SINGLE_USE_IPP

inline void CheckIPPStatus(IppStatus status)

{

if(status != ippStsNoErr)

{

mexPrintf(" status = %s ", ippGetStatusString(status));

mexErrMsgTxt("IPP error");

}

}

// Adapted by Marco Zuliani (zuliani@mayachitra.com) from:

//

// http://softwarecommunity.intel.com/isn/Community/en-US/forums/5482632/PostAttachment.aspx

//

// INTEL CORPORATION PROPRIETARY INFORMATION

// This software is supplied under the terms of a license agreement or

// nondisclosure agreement with Intel Corporation and may not be copied

// or disclosed except in aNcssordance with the terms of that agreement.

// Copyright (c) 2005 Intel Corporation. All Rights Reserved.

//

// Separable 2D convolution example

//

void SepConv(const Ipp32f *pSrcBuffer, Ipp32f *pDstBuffer, IppiSize *size, Ipp32f *hc, int Nc, Ipp32f *hr, int Nr)

{

int sizerow, sizecol, i, j;

Ipp32f **ppDst, **ppSrc, *pTmp;

Ipp8u *pBufferCol, *pBufferRow;

// flip the kernels and align the memory to please IPP

Ipp32f *hc_flipped = (Ipp32f *)ippsMalloc_32f(Nc * sizeof(float));

Ipp32f *hr_flipped = (Ipp32f *)ippsMalloc_32f(Nr * sizeof(float));

for(i = 0; i < Nc; i++) hc_flipped[i] = hc[Nc-i-1];

for(j = 0; j < Nr; j++) hr_flipped[j] = hr[Nr- j-1];

// compute the kernel semisizes

int Ncss = Nc >> 1;

int Nrss = Nr >> 1;

// compute the kernel offsets (0 -> odd, 1 -> even)

int co = 1-(Nc%2);

int ro = 1-(Nr%2);

// allocate temporary dst buffer

ppDst = (Ipp32f**)ippsMalloc_8u(size->width*size->height*sizeof(Ipp32f)+(size->height)*sizeof(Ipp32f*));

ppSrc = (Ipp32f**)ippsMalloc_8u((size->height+Nc)*sizeof(Ipp32f*));

// size of temporary buffers

ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(*size, Nc, &sizerow);

ippiFilterColumnPipelineGetBufferSize_32f_C1R(*size, Nr, &sizecol);

// allocate temporary buffers

pBufferCol = ippsMalloc_8u(sizecol);

pBufferRow = ippsMalloc_8u(sizerow);

// organize dst buffer

pTmp = (Ipp32f*)(ppDst + size->height);

for(i = 0; i < size->height; i++, pTmp += size->width)

{

ppDst[i] = pTmp;

ppSrc[i+Nrss-ro] = pTmp;

}

// organize replicate border for ippiFilterColumnPipeline_32f_C1R

for(j = 0; j < Nrss; j++)

{

// top

ppSrc[j] = ppSrc[Nrss-ro];

// bottom

ppSrc[i-j+Nr-2] = ppDst[size->height-1];

}

// perform the actual convolutions

ippiFilterRowBorderPipeline_32f_C1R((const Ipp32f*) pSrcBuffer, size->width*sizeof(Ipp32f), ppDst,

*size, hc_flipped, Nc, Ncss-co, ippBorderConst, 0, pBufferRow);

ippiFilterColumnPipeline_32f_C1R((const Ipp32f**)ppSrc, pDstBuffer, size->width*sizeof(Ipp32f), *size,

hr_flipped, Nr, pBufferCol);

// release some memory

ippsFree(hc_flipped);

ippsFree(hr_flipped);

ippsFree(ppSrc);

ippsFree(ppDst);

ippsFree(pBufferCol);

ippsFree(pBufferRow);

}

#endif

/*********************************************************************************

Copyright(C) 2004-2009, Riverain Medical Group LLC. All Rights Reserved.

This is UNPUBLISHED PROPRIETARY SOURCE CODE of Riverain Medical Group, LLC.
The contents of this file may not be disclosed to third parties, copied or
duplicated in any form, in whole or in part, for use or transmittal, without
the prior written permission of Riverain Medical Group LLC

**********************************************************************************/

/*!
* file IppSepFilter.cpp
* brief Implementation of Separable Filter with IPP
* author J. Schamus, jschamus@riverainmedical.com
*/

# pragma once
# include "Image.h"

static inline IppStatus ippSepFilter(
const Ipp32f* pSrc, // Sourse Image
const int srcStep, // Source step
Ipp32f* pDst, // Destination Image
const int dstStep, // Destination step
const IppiSize roiSize, // Source/Destination size
const Ipp32f* hc, // Column filter
const int Nc, // Column filter size
const Ipp32f* hr, // Row filter
const int Nr, // Row filter size
const PadType& padType = CONSTANT, // Padding type
const Ipp32f val = 0.0f ) // Value to use with CONSTANT padding
{
IppStatus sts;

int sizerow, sizecol;
Ipp32f **ppDst, **ppSrc, *pTmp = NULL;
Ipp8u *pBufferCol = NULL, *pBufferRow = NULL;

// compute the kernel semisizes
int Ncss = Nc >> 1;
int Nrss = Nr >> 1;

// compute the kernel offsets (0 -> odd, 1 -> even)
int co = 1 - ( Nc % 2 );
int ro = 1 - ( Nr % 2 );

// allocate temporary dst buffer
int tmpStep;
pTmp = ippiMalloc_32f_C1( roiSize.width, roiSize.height + (Ncss * 2), &tmpStep );
if( !pTmp ) return ippStsMemAllocErr;

int tmpw = tmpStep / sizeof(Ipp32f);
IppiSize tmpSize;
tmpSize.height = roiSize.height + (Ncss * 2) - co; tmpSize.width = roiSize.width;
ippiSet_32f_C1R( 0.0f, pTmp, tmpStep, tmpSize );

ppDst = new Ipp32f*[roiSize.height];
ppSrc = new Ipp32f*[roiSize.height + (Nrss * 2) - co];

// size of temporary buffers
if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizerow) )
return sts;
if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizecol) )
return sts;

// allocate temporary buffers
pBufferCol = ippsMalloc_8u( sizecol );
if( !pBufferCol ) return ippStsMemAllocErr;
pBufferRow = ippsMalloc_8u( sizerow );
if( !pBufferRow ) return ippStsMemAllocErr;

Nrss -= ro;
Ncss -= co;
// organize dst buffer
for( int ii=0,jj=Ncss;ii {
ppDst[ii] = pTmp + jj * tmpw;
ppSrc[jj] = pTmp + jj * tmpw;
}

IppiBorderType borderType;

switch( padType )
{
case CONSTANT:
for( int ii=0,jj=roiSize.height+Ncss;ii {
ppSrc[ii] = pTmp + ii * tmpw;
ppSrc[jj] = pTmp + jj * tmpw;
if( val )
{
ippsSet_32f( val, ppSrc[ii], roiSize.width );
ippsSet_32f( val, ppSrc[jj], roiSize.width );
}
else
{
ippsZero_32f( ppSrc[ii], roiSize.width );
ippsZero_32f( ppSrc[jj], roiSize.width );
}
}
if( co )
{
ppSrc[roiSize.height+(Ncss*2)] = pTmp + (roiSize.height+(Ncss*2)) * tmpw;
ippsSet_32f( val, ppSrc[roiSize.height+(Ncss*2)], roiSize.width );
}
borderType = ippBorderConst;
break;

case REPLICATE:
for( int ii=0,jj=roiSize.height+Ncss;ii {
ppSrc[ii] = ppSrc[Ncss];
ppSrc[jj] = ppSrc[roiSize.height+Ncss-1];
}
if( co )
{
ppSrc[roiSize.height+(Ncss*2)] = ppSrc[roiSize.height+Ncss-1];
}
borderType = ippBorderRepl;
break;

case SYMMETRIC:
for( int ii=0,jj=roiSize.height+Ncss;ii {
ppSrc[ii] = ppSrc[(Ncss*2)-ii-1];
ppSrc[jj] = ppSrc[roiSize.height+Ncss-ii-1];
}
if( co )
{
ppSrc[roiSize.height+(Ncss*2)] = ppSrc[roiSize.height-1];
}
borderType = ippBorderMirrorR;
break;

case CIRCULAR:
for( int ii=0,jj=roiSize.height+Ncss;ii {
ppSrc[ii] = ppSrc[roiSize.height+ii];
ppSrc[jj] = ppSrc[ii+Ncss];
}
if( co )
{
ppSrc[roiSize.height+(Ncss*2)] = ppSrc[Ncss*2];
}
borderType = ippBorderWrap;
break;

default:
return ippStsPaddingSchemeErr; // ippStsBorderErr missing from ippdefs.h
}

// perform the actual convolutions
if( sts = ippiFilterRowBorderPipeline_32f_C1R((const Ipp32f*) pSrc, srcStep,
ppDst, roiSize, hr, Nr, Nrss, borderType, val, pBufferRow) )
return sts;

if( sts = ippiFilterColumnPipeline_32f_C1R((const Ipp32f**)ppSrc, pDst, dstStep,
roiSize, hc, Nc, pBufferCol) )
return sts;

ippsFree(pTmp);
ippsFree(pBufferCol);
ippsFree(pBufferRow);
delete []ppSrc;
delete []ppDst;

return sts;
}

No Guarntees with this, but it has been tested with both even and odd sized kernals and for all padding types.

Enjoy,
Jay Schamus

Hi Jay,

thanks for sharing of your expertise with IPP developers community. Although the copyright notice of your code may look misleading. Is it possible to disclosure this code?

Regards,
Vladimir

Quoting - Vladimir Dudnik (Intel)
Hi Jay,

thanks for sharing of your expertise with IPP developers community. Although the copyright notice of your code may look misleading. Is it possible to disclosure this code?

Regards,
Vladimir

Sure fine. Just acknowledge the source.

Hi Jay

Thanks alot for sharing your improved version of the convolution code. I was wondering if you can check a couple of things:

1] Should the line

ppSrc = new Ipp32f*[roiSize.height + (Nrss * 2) - co];

read instead as:

ppSrc = new Ipp32f*[roiSize.height + (Ncss * 2) - co];

so that the loop:

for( int ii=0,jj=roiSize.height+Ncss;ii{
ppSrc[ii] = pTmp + ii * tmpw;
ppSrc[jj] = pTmp + jj * tmpw;

(...)

will not exceed the boundaries?

2] Should the lines:

// size of temporary buffers
if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizerow) )
return sts;
if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizecol) )
return sts;

be instead:

// size of temporary buffers
if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizerow) )
return sts;
if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizecol) )
return sts;

i.e. with Nr and Nc swapped?

Thanks,
Marco

Quoting - zuliani@mayachitra.com

Hi Jay

Thanks alot for sharing your improved version of the convolution code. I was wondering if you can check a couple of things:

1] Should the line

ppSrc = new Ipp32f*[roiSize.height + (Nrss * 2) - co];

read instead as:

ppSrc = new Ipp32f*[roiSize.height + (Ncss * 2) - co];

so that the loop:

for( int ii=0,jj=roiSize.height+Ncss;ii{
ppSrc[ii] = pTmp + ii * tmpw;
ppSrc[jj] = pTmp + jj * tmpw;

(...)

will not exceed the boundaries?

2] Should the lines:

// size of temporary buffers
if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizerow) )
return sts;
if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizecol) )
return sts;

be instead:

// size of temporary buffers
if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizerow) )
return sts;
if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizecol) )
return sts;

i.e. with Nr and Nc swapped?

Thanks,
Marco

Marco,
sorry to take so long to get back, but I've been busy. Here is the corrected code for this (can't use new and delete ro it will crash once every ~1000 times you run it ). Also, I included a version that runs the filter in reverse order (CR vs. RC), for those that need to match MatLab. Note the Copyright is there but it means that if you use this code please acknowledge the source. And also, one of our guys who is experimenting with 64-bit under Windows 2008 gets a crash every time at the call to ippiFilterRowBorderPipelineGetBufferSize_32f_C1R when builds this as a 64-bit DLL.

/*********************************************************************************

Copyright(C) 2004-2009, Riverain Medical Group LLC.  All Rights Reserved.

This is UNPUBLISHED PROPRIETARY SOURCE CODE of Riverain Medical Group, LLC.
The contents of this file may not be disclosed to third parties,  copied or
duplicated in any form, in whole or in part, for use or transmittal, without
the prior written permission of Riverain Medical Group LLC

**********************************************************************************/

/*!
* file   IppSepFilter.cpp
* brief  Implementation of Separable Filter with IPP
* author J. Schamus, jschamus@riverainmedical.com
*/

# pragma once
# include 



// Row major version
static inline IppStatus IppSepFilterRC(	
									   Ipp32f*			pDst,					// Destination Image
									   const int&		dstStep,				// Destination step
									   const Ipp32f*	        pSrc,					// Source Image 
									   const int&		srcStep,				// Source step
									   const IppiSize&	roiSize,				// Source/Destination size
									   const Ipp32f*	        hr,					// Row filter
									   const int&		Nr,					// Row filter size
									   const Ipp32f*	        hc,					// Column filter
									   const int&		Nc,					// Column filter size
									   const PadType&	padType = CONSTANT,		// Padding type
									   const Ipp32f&	        val	= 0.0f )			// Value to use with CONSTANT padding
{
	TRY_AUTO
	{
		IppStatus sts;

		int sizerow, sizecol;
		Ipp32f *pTmp = NULL;
		Ipp32f *pTmpLocal = NULL;
		Ipp8u *pBufferCol = NULL, *pBufferRow = NULL;

		//	flip the kernels and align the memory to please IPP 
		Ipp32f *hc_flipped = (Ipp32f *)ippsMalloc_32f( Nc );
		Ipp32f *hr_flipped = (Ipp32f *)ippsMalloc_32f( Nr );

		ippsFlip_32f((const Ipp32f*)hc, hc_flipped, Nc );
		ippsFlip_32f((const Ipp32f*)hr, hr_flipped, Nr );

		// 	compute the kernel semisizes
		int Ncss = Nc >> 1;
		int Nrss = Nr >> 1;

		// 	compute the kernel offsets (0 -> odd, 1 -> even)
		int co = 1 - ( Nc % 2 );
		int ro = 1 - ( Nr % 2 );

		//	allocate temporary dst buffer
		int tmpStep;
		int tmpw;

		// 	The IPP filter functions seem to need 1 more row allocated
		// 	than is obvious or they sometimes crash.
		int tmpHeight = roiSize.height+Nc+1;
		int tmpWidth  = roiSize.width;

		if( !( pTmpLocal = ippiMalloc_32f_C1( roiSize.width, roiSize.height + Nc + 1, &tmpStep ) ) )
			throw exception( "nIppSepFilterRC, mem-alloc error. " );
		pTmp = pTmpLocal;
		tmpw = tmpStep / sizeof(Ipp32f);

		Ipp32f **ppSrc, **ppDst;
		ppSrc = (Ipp32f**) ippsMalloc_32f( roiSize.height + Nc + 1 );
		ppDst = (Ipp32f**) ippsMalloc_32f( roiSize.height );

		if( padType == CONSTANT )
		{
			IppiSize tmpSize;
			tmpSize.height = roiSize.height + Nc + 1; 
			tmpSize.width  = roiSize.width;
			ippiSet_32f_C1R( val, pTmp, tmpStep, tmpSize );
		}

		// 	size of temporary buffers
		if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizerow) )
			throw exception( "nIppSepFilterRC, ipp-row-mem-size error. ");

		if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizecol) )
			throw exception( "nIppSepFilterRC, ipp-col-mem-size error. ");

		//	allocate temporary buffers
		if( !( pBufferCol = ippsMalloc_8u( sizecol ) ) )
			throw exception( "nIppSepFilterRC, ipp-col-temp mem-alloc error. ");

		if( !( pBufferRow = ippsMalloc_8u( sizerow ) ) )
			throw exception( "nIppSepFilterRC, ipp-row-temp mem-alloc error. ");

		Nrss -= ro;
		Ncss -= co;

		// organize dst buffer
		for( int ii=0,jj=Ncss;ii<< "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterRC, ipp-row-filter error. ");
			return sts;
		}

		if( sts = ippiFilterColumnPipeline_32f_C1R( (const Ipp32f**)ppSrc, pDst, dstStep, 
			roiSize, hc_flipped, Nc, pBufferCol) )
		{
			cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterRC, ipp-column-filter error. ");
			return sts;
		}

		if( ppSrc )		 { ippsFree(ppSrc);		 ppSrc 		= NULL; }
		if( ppDst )		 { ippsFree(ppDst);		 ppDst 		= NULL; }
		if( pTmpLocal )	 { ippiFree(pTmpLocal);	 pTmpLocal	= NULL; }
		if( pBufferCol )    { ippsFree(pBufferCol); pBufferCol = NULL; }
		if( pBufferRow )   { ippsFree(pBufferRow); pBufferRow = NULL; }
		if( hr_flipped )     { ippsFree(hr_flipped); hr_flipped = NULL; }
		if( hc_flipped )     { ippsFree(hc_flipped); hc_flipped = NULL; }

		return sts;
	}
	CATCH_AUTO
}



//
//
// Column major version
static inline IppStatus IppSepFilterCR(	
									   Ipp32f*			pDst,					// Destination Image
									   const int&		dstStep,				// Destination step
									   const Ipp32f*	        pSrc,					// Source Image 
									   const int&		srcStep,				// Source step
									   const IppiSize&	roiSize,				// Source/Destination size
									   const Ipp32f*	        hc,					// Column filter
									   const int&		Nc,					// Column filter size
									   const Ipp32f*	        hr,					// Row filter
									   const int&		Nr,					// Row filter size
									   const PadType&	padType = CONSTANT,		// Padding type
									   const Ipp32f&	        val	= 0.0f )			// Value to use with CONSTANT padding
{
	TRY_AUTO
	{
		IppStatus sts;

		int sizerow, sizecol;
		Ipp32f *pTmp = NULL, *pPad = NULL;
		Ipp8u  *pBufferCol = NULL, *pBufferRow = NULL;

		// flip the kernels and align the memory to please IPP 
		Ipp32f *hc_flipped = (Ipp32f *)ippsMalloc_32f( Nc );
		Ipp32f *hr_flipped = (Ipp32f *)ippsMalloc_32f( Nr );

		ippsFlip_32f((const Ipp32f*)hc, hc_flipped, Nc );
		ippsFlip_32f((const Ipp32f*)hr, hr_flipped, Nr );

		// compute the kernel semisizes
		int Ncss = Nc >> 1;
		int Nrss = Nr >> 1;

		// compute the kernel offsets (0 -> odd, 1 -> even)
		int co = 1 - ( Nc % 2 );
		int ro = 1 - ( Nr % 2 );

		// allocate temporary dst buffer
		int tmpStep, padStep;
		// The IPP filter functions seem to need 1 more row allocated
		// than is obvious or they sometimes crash.
		IppiSize tmpSize; 
		tmpSize.width = roiSize.width; tmpSize.height = roiSize.height + Nc + 1;
		if( !( pTmp = ippiMalloc_32f_C1( tmpSize.width, tmpSize.height, &tmpStep ) ) )
			throw exception( "nIppSepFilterCR mem-alloc error." );

		int srcw = srcStep / sizeof(Ipp32f);
		int dstw = dstStep / sizeof(Ipp32f);
		int tmpw = tmpStep / sizeof(Ipp32f);
		ippiSet_32f_C1R( 0.0f, pTmp, tmpStep, tmpSize );

		int padw;
		IppiSize padSize;

		//	Only need pad space for CONSTANT
		if( padType == CONSTANT )
		{
			if( !( pPad = ippiMalloc_32f_C1( roiSize.width, (Ncss*2) - co, &padStep ) ) )
				throw exception( "nIppSepFilterCR mem-alloc error." );

			padw = padStep / sizeof(Ipp32f);
			padSize.height = (Ncss*2) - co; padSize.width = roiSize.width;
			ippiSet_32f_C1R( val, pPad, padStep, padSize );
		}

		Ipp32f **ppSrc, **ppDst;
		ppSrc = (Ipp32f**) ippsMalloc_32f( roiSize.height + Nc + 1 );
		ppDst = (Ipp32f**) ippsMalloc_32f( roiSize.height );

		// size of temporary buffers
		if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizerow) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-size error. ");

		if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizecol) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-size error. ");

		// allocate temporary buffers
		if( !( pBufferCol = ippsMalloc_8u( sizecol ) ) )
			throw exception( "nIppSepFilterCR, ipp-col-mem-alloc error. ");

		if( !( pBufferRow = ippsMalloc_8u( sizerow ) ) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-alloc error. ");

		Nrss -= ro;
		Ncss -= co;

		//	organize dst buffer
		for( int ii = 0, jj = Ncss; ii < roiSize.height; ++ii, ++jj )
		{
			ppDst[ii] = pDst + ii * dstw;
			ppSrc[jj] = (Ipp32f *)pSrc + ii * srcw;
		}

		IppiBorderType borderType;

		switch( padType )
		{
		case CONSTANT:
			for( int ii=0,jj=roiSize.height+Ncss;ii<< "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterCR col-filter error." );
			return sts;
		}

		if( sts = ippiFilterRowBorderPipeline_32f_C1R( (const Ipp32f*)pTmp, tmpStep, 
			ppDst, roiSize, hr_flipped, Nr, Nrss, borderType, val, pBufferRow) )
		{
			cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterCR row-filter error." );
			return sts;
		}

		if( ppSrc )		 { ippsFree(ppSrc);		 ppSrc 		= NULL; }
		if( ppDst )		 { ippsFree(ppDst);		 ppDst 		= NULL; }
		if( pTmp )	 	 { ippiFree(pTmp);	 	 pTmp 		= NULL; }
		if( pPad )		 { ippsFree(pPad);		 pPad       = NULL; };
		if( pBufferCol )    { ippsFree(pBufferCol); pBufferCol = NULL; };
		if( pBufferRow )   { ippsFree(pBufferRow); pBufferRow = NULL; };
		if( hr_flipped )     { ippsFree(hr_flipped); hr_flipped = NULL; };
		if( hc_flipped )     { ippsFree(hc_flipped); hc_flipped = NULL; };

		return sts;

	}
	CATCH_AUTO
}

Let me repost the Column major version. I see the code insertion thnigy still has problems. :-(

//
//
// Column major version
static inline IppStatus IppSepFilterCR(	
									   Ipp32f*			pDst,					// Destination Image
									   const int&		dstStep,				// Destination step
									   const Ipp32f*	pSrc,					// Source Image 
									   const int&		srcStep,				// Source step
									   const IppiSize&	roiSize,				// Source/Destination size
									   const Ipp32f*	hc,						// Column filter
									   const int&		Nc,						// Column filter size
									   const Ipp32f*	hr,						// Row filter
									   const int&		Nr,						// Row filter size
									   const PadType&	padType = CONSTANT,		// Padding type
									   const Ipp32f&	val	= 0.0f )			// Value to use with CONSTANT padding
{
	TRY_AUTO
	{
		IppStatus sts;

		int sizerow, sizecol;
		Ipp32f *pTmp = NULL, *pPad = NULL;
		Ipp8u  *pBufferCol = NULL, *pBufferRow = NULL;

		// flip the kernels and align the memory to please IPP 
		Ipp32f *hc_flipped = (Ipp32f *)ippsMalloc_32f( Nc );
		Ipp32f *hr_flipped = (Ipp32f *)ippsMalloc_32f( Nr );

		ippsFlip_32f((const Ipp32f*)hc, hc_flipped, Nc );
		ippsFlip_32f((const Ipp32f*)hr, hr_flipped, Nr );

		// compute the kernel semisizes
		int Ncss = Nc >> 1;
		int Nrss = Nr >> 1;

		// compute the kernel offsets (0 -> odd, 1 -> even)
		int co = 1 - ( Nc % 2 );
		int ro = 1 - ( Nr % 2 );

		// allocate temporary dst buffer
		int tmpStep, padStep;
		// The IPP filter functions seem to need 1 more row allocated
		// than is obvious or they sometimes crash.
		IppiSize tmpSize; 
		tmpSize.width = roiSize.width; tmpSize.height = roiSize.height + Nc + 1;
		if( !( pTmp = ippiMalloc_32f_C1( tmpSize.width, tmpSize.height, &tmpStep ) ) )
			throw exception( "nIppSepFilterCR mem-alloc error." );

		int srcw = srcStep / sizeof(Ipp32f);
		int dstw = dstStep / sizeof(Ipp32f);
		int tmpw = tmpStep / sizeof(Ipp32f);
		ippiSet_32f_C1R( 0.0f, pTmp, tmpStep, tmpSize );

		int padw;
		IppiSize padSize;

		//	Only need pad space for CONSTANT
		if( padType == CONSTANT )
		{
			if( !( pPad = ippiMalloc_32f_C1( roiSize.width, (Ncss*2) - co, &padStep ) ) )
				throw exception( "nIppSepFilterCR mem-alloc error." );

			padw = padStep / sizeof(Ipp32f);
			padSize.height = (Ncss*2) - co; padSize.width = roiSize.width;
			ippiSet_32f_C1R( val, pPad, padStep, padSize );
		}

		Ipp32f **ppSrc, **ppDst;
		ppSrc = (Ipp32f**) ippsMalloc_32f( roiSize.height + Nc + 1 );
		ppDst = (Ipp32f**) ippsMalloc_32f( roiSize.height );

		// size of temporary buffers
		if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizerow) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-size error. ");

		if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizecol) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-size error. ");

		// allocate temporary buffers
		if( !( pBufferCol = ippsMalloc_8u( sizecol ) ) )
			throw exception( "nIppSepFilterCR, ipp-col-mem-alloc error. ");

		if( !( pBufferRow = ippsMalloc_8u( sizerow ) ) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-alloc error. ");

		Nrss -= ro;
		Ncss -= co;

		//	organize dst buffer
		for( int ii = 0, jj = Ncss; ii < roiSize.height; ++ii, ++jj )
		{
			ppDst[ii] = pDst + ii * dstw;
			ppSrc[jj] = (Ipp32f *)pSrc + ii * srcw;
		}

		IppiBorderType borderType;

		switch( padType )
		{
		case CONSTANT:
			for( int ii=0,jj=roiSize.height+Ncss;ii<< "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterCR col-filter error." );
			return sts;
		}

		if( sts = ippiFilterRowBorderPipeline_32f_C1R( (const Ipp32f*)pTmp, tmpStep, 
			ppDst, roiSize, hr_flipped, Nr, Nrss, borderType, val, pBufferRow) )
		{
			cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterCR row-filter error." );
			return sts;
		}

		if( ppSrc )		 { ippsFree(ppSrc);		 ppSrc 		= NULL; }
		if( ppDst )		 { ippsFree(ppDst);		 ppDst 		= NULL; }
		if( pTmp )	 	 { ippiFree(pTmp);	 	 pTmp 		= NULL; }
		if( pPad )		 { ippsFree(pPad);		 pPad       = NULL; };
		if( pBufferCol ) { ippsFree(pBufferCol); pBufferCol = NULL; };
		if( pBufferRow ) { ippsFree(pBufferRow); pBufferRow = NULL; };
		if( hr_flipped ) { ippsFree(hr_flipped); hr_flipped = NULL; };
		if( hc_flipped ) { ippsFree(hc_flipped); hc_flipped = NULL; };

		return sts;

	}
	CATCH_AUTO
}

Holy Crap! The code looks okay in the editor window, then it gets cut up when it gets posted. Trying again!

//
//
// Column major version
static inline IppStatus IppSepFilterCR(	
									   Ipp32f*			pDst,					// Destination Image
									   const int&		dstStep,				// Destination step
									   const Ipp32f*	pSrc,					// Source Image 
									   const int&		srcStep,				// Source step
									   const IppiSize&	roiSize,				// Source/Destination size
									   const Ipp32f*	hc,						// Column filter
									   const int&		Nc,						// Column filter size
									   const Ipp32f*	hr,						// Row filter
									   const int&		Nr,						// Row filter size
									   const PadType&	padType = CONSTANT,		// Padding type
									   const Ipp32f&	val	= 0.0f )			// Value to use with CONSTANT padding
{
	TRY_AUTO
	{
		IppStatus sts;

		int sizerow, sizecol;
		Ipp32f *pTmp = NULL, *pPad = NULL;
		Ipp8u  *pBufferCol = NULL, *pBufferRow = NULL;

		// flip the kernels and align the memory to please IPP 
		Ipp32f *hc_flipped = (Ipp32f *)ippsMalloc_32f( Nc );
		Ipp32f *hr_flipped = (Ipp32f *)ippsMalloc_32f( Nr );

		ippsFlip_32f((const Ipp32f*)hc, hc_flipped, Nc );
		ippsFlip_32f((const Ipp32f*)hr, hr_flipped, Nr );

		// compute the kernel semisizes
		int Ncss = Nc >> 1;
		int Nrss = Nr >> 1;

		// compute the kernel offsets (0 -> odd, 1 -> even)
		int co = 1 - ( Nc % 2 );
		int ro = 1 - ( Nr % 2 );

		// allocate temporary dst buffer
		int tmpStep, padStep;
		// The IPP filter functions seem to need 1 more row allocated
		// than is obvious or they sometimes crash.
		IppiSize tmpSize; 
		tmpSize.width = roiSize.width; tmpSize.height = roiSize.height + Nc + 1;
		if( !( pTmp = ippiMalloc_32f_C1( tmpSize.width, tmpSize.height, &tmpStep ) ) )
			throw exception( "nIppSepFilterCR mem-alloc error." );

		int srcw = srcStep / sizeof(Ipp32f);
		int dstw = dstStep / sizeof(Ipp32f);
		int tmpw = tmpStep / sizeof(Ipp32f);
		ippiSet_32f_C1R( 0.0f, pTmp, tmpStep, tmpSize );

		int padw;
		IppiSize padSize;

		//	Only need pad space for CONSTANT
		if( padType == CONSTANT )
		{
			if( !( pPad = ippiMalloc_32f_C1( roiSize.width, (Ncss*2) - co, &padStep ) ) )
				throw exception( "nIppSepFilterCR mem-alloc error." );

			padw = padStep / sizeof(Ipp32f);
			padSize.height = (Ncss*2) - co; padSize.width = roiSize.width;
			ippiSet_32f_C1R( val, pPad, padStep, padSize );
		}

		Ipp32f **ppSrc, **ppDst;
		ppSrc = (Ipp32f**) ippsMalloc_32f( roiSize.height + Nc + 1 );
		ppDst = (Ipp32f**) ippsMalloc_32f( roiSize.height );

		// size of temporary buffers
		if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizerow) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-size error. ");

		if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizecol) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-size error. ");

		// allocate temporary buffers
		if( !( pBufferCol = ippsMalloc_8u( sizecol ) ) )
			throw exception( "nIppSepFilterCR, ipp-col-mem-alloc error. ");

		if( !( pBufferRow = ippsMalloc_8u( sizerow ) ) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-alloc error. ");

		Nrss -= ro;
		Ncss -= co;

		//	organize dst buffer
		for( int ii = 0, jj = Ncss; ii < roiSize.height; ++ii, ++jj )
		{
			ppDst[ii] = pDst + ii * dstw;
			ppSrc[jj] = (Ipp32f *)pSrc + ii * srcw;
		}

		IppiBorderType borderType;

		switch( padType )
		{
		case CONSTANT:
			for( int ii=0,jj=roiSize.height+Ncss;ii,++jj )
			{
				ppSrc[ii] = pPad + ii * padw;
				ppSrc[jj] = pPad + (ii + Ncss) * padw;
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = pPad + (Ncss*2) * srcw;
			}
			borderType = ippBorderConst;
			break;

		case REPLICATE:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[Ncss];
				ppSrc[jj] = ppSrc[roiSize.height+Ncss-1];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[roiSize.height+Ncss-1];
			}
			borderType = ippBorderRepl;
			break;

		case SYMMETRIC:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[(Ncss*2)-ii-1];
				ppSrc[jj] = ppSrc[roiSize.height+Ncss-ii-1];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[roiSize.height-1];
			}
			borderType = ippBorderMirrorR;
			break;

		case CIRCULAR:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[roiSize.height+ii];
				ppSrc[jj] = ppSrc[ii+Ncss];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[Ncss*2];
			}
			borderType = ippBorderWrap;
			break;

		default:
			// ippStsBorderErr missing from ippdefs.h
			return ippStsPaddingSchemeErr;  
		}


		// perform the actual convolutions
		if( sts = ippiFilterColumnPipeline_32f_C1R( (const Ipp32f**)ppSrc, pTmp, tmpStep, 
			roiSize, hc_flipped, Nc, pBufferCol) )
		{
			//cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterCR col-filter error." );
			return sts;
		}

		if( sts = ippiFilterRowBorderPipeline_32f_C1R( (const Ipp32f*)pTmp, tmpStep, 
			ppDst, roiSize, hr_flipped, Nr, Nrss, borderType, val, pBufferRow) )
		{
			//cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterCR row-filter error." );
			return sts;
		}

		if( ppSrc )		 { ippsFree(ppSrc);		 ppSrc 		= NULL; }
		if( ppDst )		 { ippsFree(ppDst);		 ppDst 		= NULL; }
		if( pTmp )	 	 { ippiFree(pTmp);	 	 pTmp 		= NULL; }
		if( pPad )		 { ippsFree(pPad);		 pPad       = NULL; };
		if( pBufferCol ) { ippsFree(pBufferCol); pBufferCol = NULL; };
		if( pBufferRow ) { ippsFree(pBufferRow); pBufferRow = NULL; };
		if( hr_flipped ) { ippsFree(hr_flipped); hr_flipped = NULL; };
		if( hc_flipped ) { ippsFree(hc_flipped); hc_flipped = NULL; };

		return sts;

	}
	CATCH_AUTO
}

Quoting - jschamus
Holy Crap! The code looks okay in the editor window, then it gets cut up when it gets posted. Trying again!

//
//
// Column major version
static inline IppStatus IppSepFilterCR(	
									   Ipp32f*			pDst,					// Destination Image
									   const int&		dstStep,				// Destination step
									   const Ipp32f*	pSrc,					// Source Image 
									   const int&		srcStep,				// Source step
									   const IppiSize&	roiSize,				// Source/Destination size
									   const Ipp32f*	hc,						// Column filter
									   const int&		Nc,						// Column filter size
									   const Ipp32f*	hr,						// Row filter
									   const int&		Nr,						// Row filter size
									   const PadType&	padType = CONSTANT,		// Padding type
									   const Ipp32f&	val	= 0.0f )			// Value to use with CONSTANT padding
{
	TRY_AUTO
	{
		IppStatus sts;

		int sizerow, sizecol;
		Ipp32f *pTmp = NULL, *pPad = NULL;
		Ipp8u  *pBufferCol = NULL, *pBufferRow = NULL;

		// flip the kernels and align the memory to please IPP 
		Ipp32f *hc_flipped = (Ipp32f *)ippsMalloc_32f( Nc );
		Ipp32f *hr_flipped = (Ipp32f *)ippsMalloc_32f( Nr );

		ippsFlip_32f((const Ipp32f*)hc, hc_flipped, Nc );
		ippsFlip_32f((const Ipp32f*)hr, hr_flipped, Nr );

		// compute the kernel semisizes
		int Ncss = Nc >> 1;
		int Nrss = Nr >> 1;

		// compute the kernel offsets (0 -> odd, 1 -> even)
		int co = 1 - ( Nc % 2 );
		int ro = 1 - ( Nr % 2 );

		// allocate temporary dst buffer
		int tmpStep, padStep;
		// The IPP filter functions seem to need 1 more row allocated
		// than is obvious or they sometimes crash.
		IppiSize tmpSize; 
		tmpSize.width = roiSize.width; tmpSize.height = roiSize.height + Nc + 1;
		if( !( pTmp = ippiMalloc_32f_C1( tmpSize.width, tmpSize.height, &tmpStep ) ) )
			throw exception( "nIppSepFilterCR mem-alloc error." );

		int srcw = srcStep / sizeof(Ipp32f);
		int dstw = dstStep / sizeof(Ipp32f);
		int tmpw = tmpStep / sizeof(Ipp32f);
		ippiSet_32f_C1R( 0.0f, pTmp, tmpStep, tmpSize );

		int padw;
		IppiSize padSize;

		//	Only need pad space for CONSTANT
		if( padType == CONSTANT )
		{
			if( !( pPad = ippiMalloc_32f_C1( roiSize.width, (Ncss*2) - co, &padStep ) ) )
				throw exception( "nIppSepFilterCR mem-alloc error." );

			padw = padStep / sizeof(Ipp32f);
			padSize.height = (Ncss*2) - co; padSize.width = roiSize.width;
			ippiSet_32f_C1R( val, pPad, padStep, padSize );
		}

		Ipp32f **ppSrc, **ppDst;
		ppSrc = (Ipp32f**) ippsMalloc_32f( roiSize.height + Nc + 1 );
		ppDst = (Ipp32f**) ippsMalloc_32f( roiSize.height );

		// size of temporary buffers
		if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizerow) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-size error. ");

		if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizecol) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-size error. ");

		// allocate temporary buffers
		if( !( pBufferCol = ippsMalloc_8u( sizecol ) ) )
			throw exception( "nIppSepFilterCR, ipp-col-mem-alloc error. ");

		if( !( pBufferRow = ippsMalloc_8u( sizerow ) ) )
			throw exception( "nIppSepFilterCR, ipp-row-mem-alloc error. ");

		Nrss -= ro;
		Ncss -= co;

		//	organize dst buffer
		for( int ii = 0, jj = Ncss; ii < roiSize.height; ++ii, ++jj )
		{
			ppDst[ii] = pDst + ii * dstw;
			ppSrc[jj] = (Ipp32f *)pSrc + ii * srcw;
		}

		IppiBorderType borderType;

		switch( padType )
		{
		case CONSTANT:
			for( int ii=0,jj=roiSize.height+Ncss;ii,++jj )
			{
				ppSrc[ii] = pPad + ii * padw;
				ppSrc[jj] = pPad + (ii + Ncss) * padw;
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = pPad + (Ncss*2) * srcw;
			}
			borderType = ippBorderConst;
			break;

		case REPLICATE:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[Ncss];
				ppSrc[jj] = ppSrc[roiSize.height+Ncss-1];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[roiSize.height+Ncss-1];
			}
			borderType = ippBorderRepl;
			break;

		case SYMMETRIC:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[(Ncss*2)-ii-1];
				ppSrc[jj] = ppSrc[roiSize.height+Ncss-ii-1];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[roiSize.height-1];
			}
			borderType = ippBorderMirrorR;
			break;

		case CIRCULAR:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[roiSize.height+ii];
				ppSrc[jj] = ppSrc[ii+Ncss];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[Ncss*2];
			}
			borderType = ippBorderWrap;
			break;

		default:
			// ippStsBorderErr missing from ippdefs.h
			return ippStsPaddingSchemeErr;  
		}


		// perform the actual convolutions
		if( sts = ippiFilterColumnPipeline_32f_C1R( (const Ipp32f**)ppSrc, pTmp, tmpStep, 
			roiSize, hc_flipped, Nc, pBufferCol) )
		{
			//cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterCR col-filter error." );
			return sts;
		}

		if( sts = ippiFilterRowBorderPipeline_32f_C1R( (const Ipp32f*)pTmp, tmpStep, 
			ppDst, roiSize, hr_flipped, Nr, Nrss, borderType, val, pBufferRow) )
		{
			//cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterCR row-filter error." );
			return sts;
		}

		if( ppSrc )		 { ippsFree(ppSrc);		 ppSrc 		= NULL; }
		if( ppDst )		 { ippsFree(ppDst);		 ppDst 		= NULL; }
		if( pTmp )	 	 { ippiFree(pTmp);	 	 pTmp 		= NULL; }
		if( pPad )		 { ippsFree(pPad);		 pPad       = NULL; };
		if( pBufferCol ) { ippsFree(pBufferCol); pBufferCol = NULL; };
		if( pBufferRow ) { ippsFree(pBufferRow); pBufferRow = NULL; };
		if( hr_flipped ) { ippsFree(hr_flipped); hr_flipped = NULL; };
		if( hc_flipped ) { ippsFree(hc_flipped); hc_flipped = NULL; };

		return sts;

	}
	CATCH_AUTO
}

A small correction to the correction to correction. The section of the code where it messed up should read:
switch( padType )
{
case CONSTANT:
for( int ii=0,jj=roiSize.height+Ncss;ii {

On closer examination I see that the RC function got chopped up too. Sigh!

// Row major version
static inline IppStatus IppSepFilterRC(	
									   Ipp32f*			pDst,					// Destination Image
									   const int&		dstStep,				// Destination step
									   const Ipp32f*	pSrc,					// Source Image 
									   const int&		srcStep,				// Source step
									   const IppiSize&	roiSize,				// Source/Destination size
									   const Ipp32f*	hr,						// Row filter
									   const int&		Nr,						// Row filter size
									   const Ipp32f*	hc,						// Column filter
									   const int&		Nc,						// Column filter size
									   const PadType&	padType = CONSTANT,		// Padding type
									   const Ipp32f&	val	= 0.0f )			// Value to use with CONSTANT padding
{
	TRY_AUTO
	{
		IppStatus sts;

		int sizerow, sizecol;
		Ipp32f *pTmp = NULL;
		Ipp32f *pTmpLocal = NULL;
		Ipp8u *pBufferCol = NULL, *pBufferRow = NULL;

		//	flip the kernels and align the memory to please IPP 
		Ipp32f *hc_flipped = (Ipp32f *)ippsMalloc_32f( Nc );
		Ipp32f *hr_flipped = (Ipp32f *)ippsMalloc_32f( Nr );

		ippsFlip_32f((const Ipp32f*)hc, hc_flipped, Nc );
		ippsFlip_32f((const Ipp32f*)hr, hr_flipped, Nr );

		// 	compute the kernel semisizes
		int Ncss = Nc >> 1;
		int Nrss = Nr >> 1;

		// 	compute the kernel offsets (0 -> odd, 1 -> even)
		int co = 1 - ( Nc % 2 );
		int ro = 1 - ( Nr % 2 );

		//	allocate temporary dst buffer
		int tmpStep;
		int tmpw;

		// 	The IPP filter functions seem to need 1 more row allocated
		// 	than is obvious or they sometimes crash.
		int tmpHeight = roiSize.height+Nc+1;
		int tmpWidth  = roiSize.width;

		if( !( pTmpLocal = ippiMalloc_32f_C1( roiSize.width, roiSize.height + Nc + 1, &tmpStep ) ) )
			throw exception( "nIppSepFilterRC, mem-alloc error. " );
		pTmp = pTmpLocal;
		tmpw = tmpStep / sizeof(Ipp32f);

		Ipp32f **ppSrc, **ppDst;
		ppSrc = (Ipp32f**) ippsMalloc_32f( roiSize.height + Nc + 1 );
		ppDst = (Ipp32f**) ippsMalloc_32f( roiSize.height );

		if( padType == CONSTANT )
		{
			IppiSize tmpSize;
			tmpSize.height = roiSize.height + Nc + 1; 
			tmpSize.width  = roiSize.width;
			ippiSet_32f_C1R( val, pTmp, tmpStep, tmpSize );
		}

		// 	size of temporary buffers
		if( sts = ippiFilterRowBorderPipelineGetBufferSize_32f_C1R( roiSize, Nr, &sizerow) )
			throw exception( "nIppSepFilterRC, ipp-row-mem-size error. ");

		if( sts = ippiFilterColumnPipelineGetBufferSize_32f_C1R( roiSize, Nc, &sizecol) )
			throw exception( "nIppSepFilterRC, ipp-col-mem-size error. ");

		//	allocate temporary buffers
		if( !( pBufferCol = ippsMalloc_8u( sizecol ) ) )
			throw exception( "nIppSepFilterRC, ipp-col-temp mem-alloc error. ");

		if( !( pBufferRow = ippsMalloc_8u( sizerow ) ) )
			throw exception( "nIppSepFilterRC, ipp-row-temp mem-alloc error. ");

		Nrss -= ro;
		Ncss -= co;

		// organize dst buffer
		for( int ii=0,jj=Ncss;ii		{
			ppDst[ii] = pTmp + jj * tmpw;
			ppSrc[jj] = pTmp + jj * tmpw;
		}

		IppiBorderType borderType;

		switch( padType )
		{
		case CONSTANT:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = pTmp + ii * tmpw;
				ppSrc[jj] = pTmp + jj * tmpw;
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = pTmp + (roiSize.height+(Ncss*2)) * tmpw;
			}
			borderType = ippBorderConst;
			break;

		case REPLICATE:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[Ncss];
				ppSrc[jj] = ppSrc[roiSize.height+Ncss-1];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[roiSize.height+Ncss-1];
			}
			borderType = ippBorderRepl;
			break;

		case SYMMETRIC:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[(Ncss*2)-ii-1];
				ppSrc[jj] = ppSrc[roiSize.height+Ncss-ii-1];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[roiSize.height-1];
			}
			borderType = ippBorderMirrorR;
			break;

		case CIRCULAR:
			for( int ii=0,jj=roiSize.height+Ncss;ii			{
				ppSrc[ii] = ppSrc[roiSize.height+ii];
				ppSrc[jj] = ppSrc[ii+Ncss];
			}
			if( co )
			{
				ppSrc[roiSize.height+(Ncss*2)] = ppSrc[Ncss*2];
			}
			borderType = ippBorderWrap;
			break;

		default:
			// ippStsBorderErr missing from ippdefs.h
			return ippStsPaddingSchemeErr;  
		}

		// perform the actual convolutions
		if( sts = ippiFilterRowBorderPipeline_32f_C1R( (const Ipp32f*)pSrc, srcStep, 
			ppDst, roiSize, hr_flipped, Nr, Nrss, borderType, val, pBufferRow) )
		{
			cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterRC, ipp-row-filter error. ");
			return sts;
		}

		if( sts = ippiFilterColumnPipeline_32f_C1R( (const Ipp32f**)ppSrc, pDst, dstStep, 
			roiSize, hc_flipped, Nc, pBufferCol) )
		{
			cout << "IPP Error: " << ippGetStatusString( sts ) << endl;
			throw exception( "nIppSepFilterRC, ipp-column-filter error. ");
			return sts;
		}

		if( ppSrc )		 { ippsFree(ppSrc);		 ppSrc 		= NULL; }
		if( ppDst )		 { ippsFree(ppDst);		 ppDst 		= NULL; }
		if( pTmpLocal )	 { ippiFree(pTmpLocal);	 pTmpLocal	= NULL; }
		if( pBufferCol ) { ippsFree(pBufferCol); pBufferCol = NULL; }
		if( pBufferRow ) { ippsFree(pBufferRow); pBufferRow = NULL; }
		if( hr_flipped ) { ippsFree(hr_flipped); hr_flipped = NULL; }
		if( hc_flipped ) { ippsFree(hc_flipped); hc_flipped = NULL; }

		return sts;
	}
	CATCH_AUTO
}


Hello,

thanks for updating your sample of IPP based convolution. I would recommend to use attachment for big code chunks instead of inserting code into post as it may cause truncation.

If you can provide us a sample which cause crach in 64-bit mode we will investigate the reason (you also may quickly check if issue is related to optimized code by dispatching a generic, PX version of IPP library)

Regards,
Vladimir

Leave a Comment

Please sign in to add a comment. Not a member? Join today