Hello Gents,

I am having a problem here and I would appreciate some advice. I have to write a device driver for a PCI device. The PCI device supports scatter-gather DMA.

Functionally, my implementation is working. However, it seems that something is going wrong with the buffer. The scheme is as follows:

1) The RTP allocates a buffer using memalign (4k aligned; the implementation requires this).
2) The RTP calls the driver via "ioctl", providing a pointer to the memalign-allocated buffer.
3) The driver then creates a scatter-gather list from the pointer received from the RTP.
4) The DMA operation starts and waits for the "DMA finished" interrupt to be asserted. At that time we exit the ioctl function. The "DMA finished" interrupt actually increments an atomic variable, and the DMA ioctl polls this variable forever (this will be changed, of course).

The scatter-gather portion of the code is the following:
/*
 * _4fm_create_sg_dma - build the scatter-gather cluster chain for one transfer
 *
 * Splits the Count bytes that start at pBuf into PAGE_SIZE pieces and writes
 * one cluster per piece into pDrvCtrl->devpDMAMem. Every cluster except the
 * last one points at the cluster that follows it; the last cluster has
 * DMA_next_cluster set to 0. Lengths are stored as the byte count divided
 * by 8.
 *
 * If Count is 0, or the transfer would need more than MAX_PAGES_PER_DMA
 * clusters, the table is left completely zeroed and nothing else is written.
 *
 * pDrvCtrl - driver control block that owns the cluster table
 * pBuf     - start of the data buffer
 * Count    - transfer size in bytes
 *
 * NOTE(review): every page address is translated with
 * CACHE_DMA_VIRT_TO_PHYS. Confirm that this macro is valid for a buffer
 * allocated by an RTP and that such a buffer is physically contiguous page
 * to page as assumed here - TODO confirm.
 * NOTE(review): Count/8 truncates when Count is not a multiple of 8 -
 * presumably callers always pass a multiple of 8; verify.
 */
void _4fm_create_sg_dma(struct _4fmpDrvCtrl *pDrvCtrl, UINT8 *pBuf, UINT32 Count)
{
    UINT32 nIndex;
    UINT32 nLastPage;

    /* start from an all-zero table so that every unused cluster is a terminator */
    memset(pDrvCtrl->devpDMAMem, 0, sizeof(DMA_CLUSTER) * MAX_PAGES_PER_DMA);

    /* nothing to describe; also keeps the page computation below from wrapping */
    if(Count == 0)
        return;

    /*
     * index of the last page touched by the buffer: Count / PAGE_SIZE, minus
     * one when Count is an exact multiple of PAGE_SIZE
     */
    nLastPage = (Count - 1) / PAGE_SIZE;

    /* refuse a transfer that does not fit in the cluster table */
    if(nLastPage >= MAX_PAGES_PER_DMA)
        return;

    /* populate the scatter-gather clusters with addresses from the task level buffer */
    for(nIndex = 0; nIndex <= nLastPage; nIndex++) {
        pDrvCtrl->devpDMAMem[nIndex].DMA_addr = (UINT32)CACHE_DMA_VIRT_TO_PHYS((UINT32)(pBuf + PAGE_SIZE * nIndex) + FM482_PCI_OFFSET);

        if(nIndex == nLastPage) {
            /* last cluster: whatever is left of the buffer, end of chain */
            pDrvCtrl->devpDMAMem[nIndex].DMA_len = (Count/8) - ((PAGE_SIZE/8) * nIndex);
            pDrvCtrl->devpDMAMem[nIndex].DMA_next_cluster = 0;
        }
        else {
            /* full page, chained to the next cluster of the table */
            pDrvCtrl->devpDMAMem[nIndex].DMA_len = (PAGE_SIZE/8);
            pDrvCtrl->devpDMAMem[nIndex].DMA_next_cluster = (UINT32)CACHE_DMA_VIRT_TO_PHYS((UINT32)(&(pDrvCtrl->devpDMAMem[nIndex+1].DMA_addr)) + FM482_PCI_OFFSET);
        }
        pDrvCtrl->devpDMAMem[nIndex].DMA_rsvd = 0x00000000;
    }

    /*
     * No explicit terminator entry is written after the loop: when a cluster
     * exists past the last one used, the memset above already left it zeroed,
     * and when the table is exactly full there is no such cluster to write.
     */
}

The function called by the IOCTL in charge of transferring data:
/*
* do a transfer to or from a device using direct IO
*/
static int _4fm_transfer_dio(struct _4fmpDrvCtrl *pDrvCtrl, char *buf, unsigned long mode, int count, int dir)
{
int page_offs;
int required_pages;
int last_page_size;
int pages_mapped;
int dma_buffers;
int x, rc, status = 0;

/* fill the scatter and gather chained list */
_4fm_create_sg_dma(pDrvCtrl, (UINT8 *)buf, count);

/* Configure the DMA engine inside the hardware */
vxbWrite32 (pDrvCtrl->devHandle, (UINT32 *) ((char *)pDrvCtrl->devBar + (DMA_BUS_ADDR_OFFS)*sizeof(UINT32)),
pDrvCtrl->devpDMAMem[0].DMA_addr); /* DMA start address */
vxbWrite32 (pDrvCtrl->devHandle, (UINT32 *) ((char *)pDrvCtrl->devBar + (DMA_TRANSFER_COUNT_OFFS)*sizeof(UINT32)),
pDrvCtrl->devpDMAMem[0].DMA_len); /* DMA cluster size */
vxbWrite32 (pDrvCtrl->devHandle, (UINT32 *) ((char *)pDrvCtrl->devBar + (DMA_DESCIPTOR_PTR_OFFS)*sizeof(UINT32)),
pDrvCtrl->devpDMAMem[0].DMA_next_cluster); /* Next DMA cluster's address */

/* write operation */
if(dir & DIO_TO_DEV) {
vxAtomicClear(&pDrvCtrl->devDMAReady); /* clear the DMA operation finished flag */
vxbWrite32 (pDrvCtrl->devHandle, (UINT32 *) ((char *)pDrvCtrl->devBar + (DMA_CONTROL_REGISTER_OFFS)*sizeof(UINT32)),
START_WR_CMD);

/* wait forever... AM30112010 -> TODO: no loop forever or kernel blocking loop */
while(1) {
if(vxAtomicGet(&pDrvCtrl->devDMAReady)!=0) {
return count;
}
}
}

/* read operation */
if(dir & DIO_FROM_DEV) {
vxAtomicClear(&pDrvCtrl->devDMAReady); /* clear the DMA operation finished flag */
vxbWrite32 (pDrvCtrl->devHandle, (UINT32 *) ((char *)pDrvCtrl->devBar + (DMA_CONTROL_REGISTER_OFFS)*sizeof(UINT32)),
START_RD_CMD);

/* wait forever... AM30112010 -> TODO: no loop forever or kernel blocking loop */
while(1) {
if(vxAtomicGet(&pDrvCtrl->devDMAReady)!=0) {
return count;
}

}
}
return 0;
}

And finally the RTP code :

ptr8_out = (char *)memalign(8*1024*1024, DMA_ALIGN);
/* configure the DDR2 memory controller */
rv[0] = 0x19; /* Burst size address */
rv[1] = DMA_SIZE/8; /* Burst Size */
_4FM_IoCtl_Wr(NULL, IOCTL_4FM_WRT_REG, &rv, sizeof(rv));
rv[0] = 0x1a; /* DDR2 controller enable bits */
rv[1] = 1; /* Write to hardware command */
_4FM_IoCtl_Wr(NULL, IOCTL_4FM_WRT_REG, &rv, sizeof(rv));

/* Send a DMA write */
dmacmd[0] = (unsigned int)ptr8_out; /* pointer to the buffer */
dmacmd[1] = DMA_SIZE; /* buffer size in byte */
dmacmd[2] = DIO_TO_DEV; /* host to device DMA */
_4FM_IoCtl_Wr(NULL, IOCTL_4FM_TRANS_BIDIR, &dmacmd, sizeof(dmacmd));
printf("Done\n");
SLEEP500ms;


As we can see above, this is a write operation (from host to device). It goes well: the DMA is correct, and I am reading from the correct host system memory location. Everything goes very smoothly. The problem starts if my RTP process changes the buffer content before the DMA operation. For example:

1) The RTP allocates a buffer using memalign (4k aligned; the implementation requires this).
1a) The RTP modifies the buffer <== ptr8_out[0] = 0 (assigns 0 to the first byte of this array).
2) The RTP calls the driver via "ioctl", providing a pointer to the memalign-allocated buffer.
3) The driver then creates a scatter-gather list from the pointer received from the RTP.
4) The DMA operation starts and waits for the "DMA finished" interrupt to be asserted. At that time we exit the ioctl function. The "DMA finished" interrupt actually increments an atomic variable, and the DMA ioctl polls this variable forever (this will be changed, of course).

This also works well until the free(ptr8_out). As soon as I call free, VxWorks complains about an invalid block matching ptr8_out's address. As this is a write operation (host to device), ptr8_out remains untouched; the only thing is that the hardware device has read this memory.

Note that if only the RTP or only the driver modifies the buffer, everything goes smoothly. During a read operation (hardware to host), the buffer is also reported to be invalid during the free().

Does all this mean I have to create a local copy? I hope not, because this would not be efficient.

At this stage, any pointers are greatly welcome!