Increasing delay into video decoder

Increasing delay into video decoder

Hi,

I'm developing a H264 video decoder (using copy-back) for a cloud application receiving a video stream. The video stream is sent using our own protocol, parsed and recomposed so that the video decoder receives one frame each time.

Is currently working and decoding fine but it increases its latency on and on (up to 7" in half and hour).

What's going on? Any ideas?

Thank you!

Initialization code:

mfxStatus sts = MFX_ERR_NONE;
	
	// Initialize Intel Media SDK session
	// - MFX_IMPL_AUTO_ANY selects HW acceleration if available (on any adapter)
	// - Version 1.0 is selected for greatest backwards compatibility.
	// OS specific notes
	// - On Windows both SW and HW libraries may present
	// - On Linux only HW library only is available
	//   If more recent API features are needed, change the version accordingly
	mfxIMPL impl = MFX_IMPL_AUTO_ANY;
	mfxVersion ver = { {0, 1} };
	
	if (_session == GAAS_NULL_PTR)
	{
		_session = new MFXVideoSession();

		// Initialize Intel Media SDK Session
		sts = _session->Init(impl, &ver);
		if (sts < MFX_ERR_NONE)
		{
			GERROR("Error initializating session, mfxStatus code: %i",sts);
			return GAAS_ERROR;
		}
	}

	// Create Media SDK decoder
	_mfxDEC = new MFXVideoDECODE(*_session);

	// Set required video parameters for decode
	_mfxVideoParams = new mfxVideoParam();
	memset(_mfxVideoParams, 0, sizeof(_mfxVideoParams));
	_mfxVideoParams->mfx.CodecId = MFX_CODEC_AVC;
	_mfxVideoParams->IOPattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
	
	// Prepare Media SDK bit stream buffer
	// - Arbitrary buffer size for this example
	_mfxBS = new mfxBitstream();
	memset(_mfxBS, 0, sizeof(_mfxBS));
	_mfxBS->MaxLength = 1024 * 1024;
	_mfxBS->Data = new mfxU8[_mfxBS->MaxLength];
	if (!_mfxBS->Data)
	{
		GERROR("Stream buffer allocation failed.");
		return GAAS_ERROR;
	}

First packet arrived code:

GAAS_UINT32 uiBytesDecoded = 0;
	mfxStatus sts = MFX_ERR_NONE;

	// Read a chunk of data from stream file into bit stream buffer
	// - Parse bit stream, searching for header and fill video parameters structure
	// - Abort if bit stream header is not found in the first bit stream buffer chunk
	uiBytesDecoded = ReadBitStreamData(_mfxBS, ptData, uiSize);
	
	sts = _mfxDEC->DecodeHeader(_mfxBS, _mfxVideoParams);
	if(sts == MFX_WRN_PARTIAL_ACCELERATION)
	{
		sts = MFX_ERR_NONE;
	}
	if (sts < MFX_ERR_NONE)
	{
		GERROR("Decoding error mfxStatus code after decoding header: %i",sts);
		return -1;
	}
	// Validate video decode parameters (optional)
	// sts = mfxDEC.Query(&mfxVideoParams, &mfxVideoParams);
	_mfxVideoParams->AsyncDepth = 1;

	// Query number of required surfaces for decoder
	mfxFrameAllocRequest Request;
	memset(&Request, 0, sizeof(Request));
	sts = _mfxDEC->QueryIOSurf(_mfxVideoParams, &Request);
	
	if(sts == MFX_WRN_PARTIAL_ACCELERATION)
	{
		sts = MFX_ERR_NONE;
	}
	if (sts < MFX_ERR_NONE)
	{
		GERROR("Decoding error mfxStatus code after querying requested surfaces: %i",sts);
		return -1;
	}

	_numSurfaces = Request.NumFrameSuggested;

	// Allocate surfaces for decoder
	// - Width and height of buffer must be aligned, a multiple of 32
	// - Frame surface array keeps pointers all surface planes and general frame info
	mfxU16 width = (mfxU16) MEM_ALIGN32(Request.Info.Width);
	mfxU16 height = (mfxU16) MEM_ALIGN32(Request.Info.Height);
	mfxU8 bitsPerPixel = 12;        // NV12 format is a 12 bits per pixel format

	_configuration->_width	= width; 
	_configuration->_height = height -16; // - ALIGNMENT
	_configuration->_format = GAAS_YUV420P;

	mfxU32 surfaceSize = width * height * bitsPerPixel / 8;
	_surfaceBuffers = (mfxU8*) new mfxU8[surfaceSize * _numSurfaces];

	// Allocate surface headers (mfxFrameSurface1) for decoder
	_pmfxSurfaces = new mfxFrameSurface1 *[_numSurfaces];
	if(!_pmfxSurfaces)
	{
		GERROR("Failed to allocate frame surfaces.");
		return -1;
	}

	for (int i = 0; i < _numSurfaces; i++) 
	{
		_pmfxSurfaces[i] = new mfxFrameSurface1;
		memset(_pmfxSurfaces[i], 0, sizeof(mfxFrameSurface1));
		memcpy(&(_pmfxSurfaces[i]->Info), &(_mfxVideoParams->mfx.FrameInfo), sizeof(mfxFrameInfo));
		_pmfxSurfaces[i]->Data.Y = &_surfaceBuffers[surfaceSize * i];
		_pmfxSurfaces[i]->Data.U = _pmfxSurfaces[i]->Data.Y + width * height;
		_pmfxSurfaces[i]->Data.V = _pmfxSurfaces[i]->Data.U + 1;
		_pmfxSurfaces[i]->Data.Pitch = width;
	}

	// Initialize the Media SDK decoder
	sts = _mfxDEC->Init(_mfxVideoParams);
	if(sts == MFX_WRN_PARTIAL_ACCELERATION)
	{
		sts = MFX_ERR_NONE;
	}
	if (sts < MFX_ERR_NONE)
	{
		GERROR("Decoding error mfxStatus code after Init: %i",sts);
		return -1;
	}

Decode and show:

GAAS_UINT32 uiSize = frameToDecode->GetBufferSize();
	GAAS_UCHAR* ptData = frameToDecode->GetBufferStart();

	GAAS_UINT32 bytesProcessed = 0; 
	mfxFrameSurface1* pmfxOutSurface = NULL;

	mfxSyncPoint syncp;
	int nIndex = 0;
	mfxStatus sts = MFX_ERR_NONE;

	while (uiSize > 0)
	{
		if( _currentFrameIndex == 0 )
		{
			if (frameToDecode->GetType() == I_FRAME	)
			{
				bytesProcessed = processFirstPkt(ptData,uiSize); 
			}
			else
			{
				return GAAS_ERROR;
			}
		}
		else
		{
			// Read a chunk of data from stream file into bit stream buffer
			// - Parse bit stream, searching for header and fill video parameters structure
			// - Abort if bit stream header is not found in the first bit stream buffer chunk
			bytesProcessed = ReadBitStreamData(_mfxBS, ptData, uiSize);
			if ( bytesProcessed < 0) 
			{
				GERROR( "Error while decoding frame %d and error: %d" , _currentFrameIndex , bytesProcessed );
				return GAAS_ERROR;
			}
		}
		uiSize -= bytesProcessed; // update packet size
		ptData += bytesProcessed; // update packet pointer

		syncp = NULL;
		while (syncp == NULL)
		{
			// Decode a frame asychronously (returns immediately)
			//  - If input bitstream contains multiple frames DecodeFrameAsync will start decoding multiple frames, and remove them from bitstream
			sts = _mfxDEC->DecodeFrameAsync(_mfxBS, _pmfxSurfaces[nIndex], &pmfxOutSurface, &syncp);

			if (MFX_WRN_DEVICE_BUSY == sts)
			{
				Sleep(1); // Wait if device is busy, then repeat the same call to DecodeFrameAsync
			}
			else if (sts == MFX_ERR_MORE_SURFACE)
			{
				// Find free frame surface
				if (_pmfxSurfaces)
				{
					for (GAAS_UINT16 i = 0; i < _numSurfaces; i++)
					{
						if (0 == _pmfxSurfaces[i]->Data.Locked)
						{
							nIndex = i;
							break;
						}
					}
				}
				else
				{
					GERROR("Failed to allocate memory.");
					return GAAS_ERROR;
				}
			}
			else if (sts == MFX_ERR_NONE)
			{
				break;
			}
			else 
			{
				GERROR("Error decoding.");
				return GAAS_ERROR;
			}
		}


		// Ignore warnings if output is available,
		// if no output and no action required just repeat the DecodeFrameAsync call
		if (MFX_ERR_NONE < sts && syncp)
		{
			sts = MFX_ERR_NONE;
		}

		if (MFX_ERR_NONE == sts)
		{
			sts = _session->SyncOperation(syncp, 60000);      // Synchronize. Wait until decoded frame is ready
		}

		if (MFX_ERR_NONE == sts) 
		{
			//Show current decoded frame!
			if( _readyToDisplaymutex->tryLock( 15 ) )
			{
				WriteRawFrame(pmfxOutSurface, _pScreenImageData); 

				GAAS_BOOL setRes = _newFrameToDisplayEvent->setEvent();
				_readyToDisplaymutex->unlock(); 
			}
			else
			{
				GINFO( "Dropping Frame (decoded but not displayed), _readyToDisplaymutex is locked" );
			}

			_currentFrameIndex++;//Increase current frame index
		}
		
		// MFX_ERR_MORE_DATA means that file has ended, need to go to buffering loop, exit in case of other errors
		if(sts == MFX_ERR_MORE_DATA)
		{
			sts = MFX_ERR_NONE;
		}
		if (sts < MFX_ERR_NONE)
		{
			GERROR("Decoding error mfxStatus code after main decoding loop: %i",sts);
			return GAAS_ERROR;
		}
	}

 

7 posts / 0 new
Last post
For more complete information about compiler optimizations, see our Optimization Notice.

Thanks for this report.  Could you send more details on how you see the latency increase?  Is the 7 seconds spread out across all frames (that is, averaging 3.9 milliseconds per frame, 7 seconds total of latency in 1800 seconds, assuming 30 FPS)?   Or does it grow with each frame so you see 7 seconds of delay per frame after 30 minutes?  If it is the latter case, have you tried (for the sake of debugging) to add any buffering so you can guarantee that decode is not needing to do lots of polling for more data?

 

 

Hi Jeffrey,

The latency is increasing progressively. I haven't done any exact measurement...

The application is organized in three main blocks:

1.- First of all, it receives several messages, parses them, composes complete H264 coded packets with one full frame each. Here there is a queue for storing the composed packets.

2.- These packets are provided to the decoder

3.- The decoded frames are directly copied to an OpenGL texture, to be shown after the pixel conversion (shader).

Before I added the Intel Decoder, I was using a custom ffmpeg-based CPU decoder and there was no increasing delay at all. So we can assure that there's enough data income and the painting process is working fine.

I've checked both our packet queue and the surfaces used by the decoder and they always use only 1 slot.

Do you see something wrong in the code? Do you need more info?

Thank you!

 

P.S. WriteRawFrame and ReadBitStreamData are both modified to work with my data.

Hi Jeffrey,

I've just finished a FFMPEG based DXVA2 decoder and it has the same behaviour even in other machine with different graphic card (both Intel, of course).

Could it have something to do with the size and number of the slices (H264 stream) ?

Any thoughts?

Media SDK can have variations in latency for a variety of reasons.  Latency is important -- we're always working on minimizing overall latency and maximizing consistency.   While max latency is expected to increase with GPU load, which correlates to factors like resolution and # of streams, the distributions I've seen so far have not had the characteristic of increasing over time that you've described.  I'll try to replicate, but in the meantime I wonder if you've tried experiments to localize the issue.  If the problem can be traced to Media SDK or the driver it should be possible to replicate the increasing latency with a simplified pipeline of file input->decode->null output (block #2 as you've described above).  If this happens for specific inputs, perhaps including network transfer errors, this reproducer stream stored as a file could be very helpful.  Best case would be an input file with minimal modifications to a tutorial or sample.

 

I've taken some measurements (averages):

- ReadBitStreamData: 0.002ms

- DecodeFrameAsync: 0.17ms (just the call to this method)

- SyncOperation: 6.5ms

- WriteRawFrame: 7.5ms

- All the decode frame method: 15ms

- All the decode frame method with FFMPEG (CPU decoding): 6.8ms

The input is H264 stream with variable bitrate and framerate from 30 to 33FPS which we can not control...

Could it be possible that as the Intel decoder is so slow, those frames are being buffered at the display queue and thus the display delay is increasing? Could it have something to do with the paint thread?

Remember I copy the decoded frame from the surface to system memory (converting from NV12 to YUV420P), then to a OpenGL texture, execute a pixel shader program to convert to RGB24 and finally paint.

Is it worth implementing the DX surface to OpenGL texture connection and NV12 to RGB24 shader just to reduce the WriteRawFrame execution time just for testing or do you expect no difference in the behaviour?

P.S. This is my adapted WriteRawFrame code (I tried to change the least possible from the original sample):

//-----------------------------------------------------------------------------

void IntelH264Decoder::WriteSection(mfxU8* plane, mfxU16 factor, mfxU16 chunksize,
                       mfxFrameInfo* pInfo, mfxFrameData* pData, mfxU32 i,
                       mfxU32 j, GAAS_UCHAR* pScreenImageData)
{
	memcpy(pScreenImageData, 
				 plane + (pInfo->CropY * pData->Pitch / factor + pInfo->CropX) + i * pData->Pitch + j, 
				 chunksize);
}

//-----------------------------------------------------------------------------

void IntelH264Decoder::WriteRawFrame(mfxFrameSurface1* pSurface, GAAS_UCHAR* pScreenImageData)
{
	mfxFrameInfo* pInfo = &pSurface->Info;
	mfxFrameData* pData = &pSurface->Data;
	mfxU32 i, j, h, w;
	GAAS_UCHAR* ptDst = pScreenImageData;
	for (i = 0; i < pInfo->CropH; i++)
	{
		WriteSection(pData->Y, 1, pInfo->CropW, pInfo, pData, i, 0, ptDst );
		ptDst += pInfo->CropW;
	}

	h = pInfo->CropH / 2;
	w = pInfo->CropW;

	for (i = 0; i < h; i++)
	{
		for (j = 0; j < w; j += 2)
		{
			WriteSection(pData->UV, 2, 1, pInfo, pData, i, j, ptDst );
			ptDst++;
		}
	}

	for (i = 0; i < h; i++)
	{
		for (j = 1; j < w; j += 2)
		{
			WriteSection(pData->UV, 2, 1, pInfo, pData, i, j, ptDst );
			ptDst ++;
		}
	}

}

//-----------------------------------------------------------------------------

 

Login to leave a comment.