diff --git a/src/driver/amdxdna/ve2_debug.c b/src/driver/amdxdna/ve2_debug.c
index 2b12355fd..0aeaf2cc2 100644
--- a/src/driver/amdxdna/ve2_debug.c
+++ b/src/driver/amdxdna/ve2_debug.c
@@ -17,6 +17,8 @@
 #include "amdxdna_error.h"
 #include "amdxdna_drm.h"
 
+extern int enable_debug_queue;
+
 static int ve2_query_ctx_status_array(struct amdxdna_client *client,
 				      struct amdxdna_drm_hwctx_entry *tmp,
 				      pid_t pid, u32 ctx_id)
@@ -151,6 +153,70 @@ static int ve2_get_array_hwctx(struct amdxdna_client *client,
 	return ret;
 }
 
+/*
+ * ve2_dbg_queue_data_rw - run one debug-queue command through a bounce buffer.
+ *
+ * Allocates a coherent DMA buffer of @size bytes, folds @col and @row into
+ * @addr, and submits @cmd_type through submit_command_to_dbg_queue() with the
+ * length given in 32-bit words. DBG_CMD_WRITE copies @data into the buffer
+ * before submitting; DBG_CMD_READ copies the buffer back into @data only when
+ * the submission returned 0.
+ *
+ * Return: 0 on success, -EINVAL for a zero or non-word-multiple @size or an
+ * unknown @cmd_type, -ENOMEM if the buffer cannot be allocated, otherwise the
+ * error returned by submit_command_to_dbg_queue().
+ */
+static int ve2_dbg_queue_data_rw(struct amdxdna_dev *xdev, struct amdxdna_ctx *hwctx,
+				 u32 col, u32 row, u32 addr, void *data, size_t size,
+				 int cmd_type)
+{
+	struct platform_device *pdev = to_platform_device(xdev->ddev.dev);
+	dma_addr_t dma_handle;
+	void *virt_ptr = NULL;
+	int ret = 0;
+
+	if (!size || size % 4 != 0) {
+		XDNA_ERR(xdev, "Size (%zu) must be a non-zero multiple of 4 bytes", size);
+		return -EINVAL;
+	}
+	/* Allocate the DMA bounce buffer handed to the submit function */
+	virt_ptr = dma_alloc_coherent(&pdev->dev, size, &dma_handle, GFP_KERNEL);
+	if (!virt_ptr) {
+		XDNA_ERR(xdev, "Failed to allocate DMA buffer");
+		return -ENOMEM;
+	}
+
+	addr = addr + ((col << VE2_COL_SHIFT) + (row << VE2_ROW_SHIFT));
+
+	switch (cmd_type) {
+	case DBG_CMD_WRITE:
+		memcpy(virt_ptr, data, size);
+		ret = submit_command_to_dbg_queue(hwctx, DBG_CMD_WRITE, addr, (u64)dma_handle,
+						  size / 4);
+		break;
+	case DBG_CMD_READ:
+		ret = submit_command_to_dbg_queue(hwctx, DBG_CMD_READ, addr, (u64)dma_handle,
+						  size / 4);
+		if (ret == 0)
+			memcpy(data, virt_ptr, size);
+		break;
+	case DBG_CMD_EXIT:
+		ret = submit_command_to_dbg_queue(hwctx, DBG_CMD_EXIT, addr, (u64)dma_handle,
+						  size / 4);
+		break;
+	default:
+		XDNA_ERR(xdev, "CMD_TYPE is not supported");
+		ret = -EINVAL;
+		break;
+	}
+	/*
+	 * NOTE(review): after a timeout or interrupted wait the firmware may
+	 * still reference dma_handle; freeing here assumes it has stopped - confirm.
+	 */
+	dma_free_coherent(&pdev->dev, size, virt_ptr, dma_handle);
+	return ret;
+}
+
 static int ve2_aie_write(struct amdxdna_client *client,
 			 struct amdxdna_drm_set_state *args)
 {
@@ -171,7 +237,7 @@ static int ve2_aie_write(struct amdxdna_client *client,
 		return -EFAULT;
 	}
 
-	XDNA_DBG(xdna, "Write request for ctx_id: %u, col: %u, row: %u, addr: 0x%x, size: %u\n",
+	XDNA_DBG(xdna, "Write request for ctx_id: %u, col: %u, row: %u, addr: 0x%x, size: %u",
 		 footer.context_id, footer.col, footer.row, footer.addr, footer.size);
 
 	/* Find the hardware context */
@@ -187,24 +253,24 @@ static int ve2_aie_write(struct amdxdna_client *client,
 	}
 
 	if (!hwctx) {
-		XDNA_ERR(xdna, "hw context :%u pid:%llu not found\n", footer.context_id,
+		XDNA_ERR(xdna, "hw context :%u pid:%llu not found", footer.context_id,
 			 footer.pid);
 		return -EINVAL;
 	}
 
-	XDNA_DBG(xdna, "Found hwctx: cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u\n",
+	XDNA_DBG(xdna, "Found hwctx: cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u",
 		 hwctx->client->pid, hwctx->id, hwctx->start_col, hwctx->num_col);
 
 	/* Validate column is within partition */
 	if (footer.col >= hwctx->num_col) {
-		XDNA_ERR(xdna, "Column %u is outside partition range [0, %u)\n",
+		XDNA_ERR(xdna, "Column %u is outside partition range [0, %u)",
 			 footer.col, hwctx->num_col);
 		return -EINVAL;
 	}
 
 	/* Validate row */
 	if (footer.row >= xdna->dev_handle->aie_dev_info.rows) {
-		XDNA_ERR(xdna, "Row %u is outside range [0, %u)\n",
+		XDNA_ERR(xdna, "Row %u is outside range [0, %u)",
 			 footer.row, xdna->dev_handle->aie_dev_info.rows);
 		return -EINVAL;
 	}
@@ -212,7 +278,7 @@ static int ve2_aie_write(struct amdxdna_client *client,
 	/* Get AIE device handle */
 	aie_dev = hwctx->priv->aie_dev;
 	if (!aie_dev) {
-		XDNA_ERR(xdna, "AIE device handle not found\n");
+		XDNA_ERR(xdna, "AIE device handle not found");
 		return -EINVAL;
 	}
 
@@ -223,16 +289,30 @@ static int ve2_aie_write(struct amdxdna_client *client,
 
 	/* Copy data from user space (data is at the beginning of buffer) */
 	if (copy_from_user(local_buf, u64_to_user_ptr(args->buffer), footer.size)) {
-		XDNA_ERR(xdna, "Error: unable to copy data from userptr\n");
+		XDNA_ERR(xdna, "Error: unable to copy data from userptr");
 		kfree(local_buf);
 		return -EFAULT;
 	}
 
 	/* Write to AIE memory */
-	ret = ve2_partition_write(aie_dev, footer.col, footer.row, footer.addr,
-				  footer.size, local_buf);
+	/* TODO: temporary hack - a write aimed at column 3 is issued as DBG_CMD_EXIT. */
+	if (enable_debug_queue) {
+		if (footer.col == 3) {
+			ret = ve2_dbg_queue_data_rw(xdna, hwctx, footer.col, footer.row,
+						    footer.addr, local_buf, footer.size,
+						    DBG_CMD_EXIT);
+		} else {
+			ret = ve2_dbg_queue_data_rw(xdna, hwctx, footer.col, footer.row,
+						    footer.addr, local_buf, footer.size,
+						    DBG_CMD_WRITE);
+		}
+	} else {
+		ret = ve2_partition_write(aie_dev, footer.col, footer.row, footer.addr,
+					  footer.size, local_buf);
+	}
+
 	if (ret < 0) {
-		XDNA_ERR(xdna, "Error in AIE memory write operation, err: %d\n", ret);
+		XDNA_ERR(xdna, "Error in AIE memory write operation, err: %d", ret);
 		kfree(local_buf);
 		return ret;
 	}
@@ -261,7 +341,7 @@ static int ve2_aie_read(struct amdxdna_client *client, struct amdxdna_drm_get_ar
 		return -EFAULT;
 	}
 
-	XDNA_DBG(xdna, "Read request for ctx_id: %u, col: %u, row: %u, addr: 0x%x, size: %u\n",
+	XDNA_DBG(xdna, "Read request for ctx_id: %u, col: %u, row: %u, addr: 0x%x, size: %u",
 		 footer.context_id, footer.col, footer.row, footer.addr, footer.size);
 
 	/* Find the hardware context */
@@ -277,24 +357,24 @@ static int ve2_aie_read(struct amdxdna_client *client, struct amdxdna_drm_get_ar
 	}
 
 	if (!hwctx) {
-		XDNA_ERR(xdna, "hw context :%u pid:%llu not found\n", footer.context_id,
+		XDNA_ERR(xdna, "hw context :%u pid:%llu not found", footer.context_id,
 			 footer.pid);
 		return -EINVAL;
 	}
 
-	XDNA_DBG(xdna, "Found hwctx: cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u\n",
+	XDNA_DBG(xdna, "Found hwctx: cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u",
 		 hwctx->client->pid, hwctx->id, hwctx->start_col, hwctx->num_col);
 
 	/* Validate column is within partition */
 	if (footer.col >= hwctx->num_col) {
-		XDNA_ERR(xdna, "Column %u is outside partition range [0, %u)\n",
+		XDNA_ERR(xdna, "Column %u is outside partition range [0, %u)",
 			 footer.col, hwctx->num_col);
 		return -EINVAL;
 	}
 
 	/* Validate row */
 	if (footer.row >= xdna->dev_handle->aie_dev_info.rows) {
-		XDNA_ERR(xdna, "Row %u is outside range [0, %u)\n",
+		XDNA_ERR(xdna, "Row %u is outside range [0, %u)",
 			 footer.row, xdna->dev_handle->aie_dev_info.rows);
 		return -EINVAL;
 	}
@@ -302,7 +382,7 @@ static int ve2_aie_read(struct amdxdna_client *client, struct amdxdna_drm_get_ar
 	/* Get AIE device handle and relative column */
 	aie_dev = hwctx->priv->aie_dev;
 	if (!aie_dev) {
-		XDNA_ERR(xdna, "AIE device handle not found\n");
+		XDNA_ERR(xdna, "AIE device handle not found");
 		return -EINVAL;
 	}
 
@@ -312,17 +392,22 @@ static int ve2_aie_read(struct amdxdna_client *client, struct amdxdna_drm_get_ar
 		return -ENOMEM;
 
 	/* Read from AIE memory */
-	ret = ve2_partition_read(aie_dev, footer.col, footer.row, footer.addr,
-				 footer.size, local_buf);
+	if (enable_debug_queue) {
+		ret = ve2_dbg_queue_data_rw(xdna, hwctx, footer.col, footer.row,
+					    footer.addr, local_buf, footer.size, DBG_CMD_READ);
+	} else {
+		ret = ve2_partition_read(aie_dev, footer.col, footer.row, footer.addr,
+					 footer.size, local_buf);
+	}
 	if (ret < 0) {
-		XDNA_ERR(xdna, "Error in AIE memory read operation, err: %d\n", ret);
+		XDNA_ERR(xdna, "Error in AIE memory read operation, err: %d", ret);
 		kfree(local_buf);
 		return ret;
 	}
 
 	/* Copy data to user space */
 	if (copy_to_user(u64_to_user_ptr(args->buffer), local_buf, footer.size)) {
-		XDNA_ERR(xdna, "Error: unable to copy memory to userptr\n");
+		XDNA_ERR(xdna, "Error: unable to copy memory to userptr");
 		kfree(local_buf);
 		return -EFAULT;
 	}
@@ -366,18 +451,18 @@ static int ve2_coredump_read(struct amdxdna_client *client, struct amdxdna_drm_g
 	}
 
 	if (!hwctx) {
-		XDNA_ERR(xdna, "hw context :%u pid:%llu not found\n", footer.context_id,
+		XDNA_ERR(xdna, "hw context :%u pid:%llu not found", footer.context_id,
 			 footer.pid);
 		return -EINVAL;
 	}
 
-	XDNA_DBG(xdna, "cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u\n",
+	XDNA_DBG(xdna, "cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u",
 		 hwctx->client->pid, hwctx->id, hwctx->start_col, hwctx->num_col);
 
 	rel_size = hwctx->priv->num_col * xdna->dev_handle->aie_dev_info.rows *
 		TILE_ADDRESS_SPACE;
 	if (rel_size > buf_size) {
-		XDNA_DBG(xdna, "Invalid buffer size:%d (rel_size:%d)\n", buf_size, rel_size);
+		XDNA_DBG(xdna, "Invalid buffer size:%d (rel_size:%d)", buf_size, rel_size);
 		args->element_size = rel_size;
 		return -ENOBUFS;
 	}
@@ -387,16 +472,16 @@ static int ve2_coredump_read(struct amdxdna_client *client, struct amdxdna_drm_g
 		return -ENOMEM;
 
 	ret = ve2_create_coredump(xdna, hwctx, local_buf, rel_size);
-	XDNA_DBG(xdna, "created dump of size:%d\n", ret);
+	XDNA_DBG(xdna, "created dump of size:%d", ret);
 
 	if (ret < 0) {
-		XDNA_ERR(xdna, "Error in AIE Data mem read operation, err: %d\n", ret);
+		XDNA_ERR(xdna, "Error in AIE Data mem read operation, err: %d", ret);
 		vfree(local_buf);
 		return ret;
 	}
 
 	if (copy_to_user(u64_to_user_ptr(args->buffer), local_buf, ret)) {
-		XDNA_ERR(xdna, "Error: unable to copy memory to userptr\n");
+		XDNA_ERR(xdna, "Error: unable to copy memory to userptr");
 		vfree(local_buf);
 		return -EFAULT;
 	}
@@ -684,7 +769,7 @@ static int ve2_get_array_async_error(struct amdxdna_dev *xdna, struct amdxdna_dr
 			XDNA_DBG(xdna, "Waiting for error callback to complete on mgmtctx[%u]", i);
 			if (wait_for_completion_timeout(&mgmtctx->error_cb_completion,
 							wait_timeout) == 0) {
-				XDNA_WARN(xdna, "Timeout waiting for err callback completion\n");
+				XDNA_WARN(xdna, "Timeout waiting for err callback completion");
 			}
 		}
 	}
diff --git a/src/driver/amdxdna/ve2_host_queue.h b/src/driver/amdxdna/ve2_host_queue.h
index c492b398e..5afef1678 100644
--- a/src/driver/amdxdna/ve2_host_queue.h
+++ b/src/driver/amdxdna/ve2_host_queue.h
@@ -123,6 +123,35 @@ struct ve2_hsa_queue {
 	struct device *alloc_dev;
 };
 
+enum dbg_cmd_type {
+	DBG_CMD_EXIT = 11,
+	DBG_CMD_READ = 12,
+	DBG_CMD_WRITE = 13,
+};
+
+struct rw_mem {
+	u32 aie_addr;
+	u32 length;
+	u32 host_addr_high;
+	u32 host_addr_low;
+};
+
+struct dbg_queue {
+	struct host_queue_header hq_header;
+	struct host_queue_packet hq_entry[HOST_QUEUE_ENTRY];
+};
+
+struct ve2_dbg_queue {
+	struct dbg_queue *dbg_queue_p;
+	struct ve2_mem dbg_queue_mem;
+	struct ve2_hq_complete hq_complete;
+	// hq_lock protects [read | write]_index and reserved_write_index
+	struct mutex hq_lock;
+	u64 reserved_write_index;
+	/* Device used for host queue allocation */
+	struct device *alloc_dev;
+};
+
 /* handshake */
 #define ALIVE_MAGIC 0x404C5645
 struct handshake {
diff --git a/src/driver/amdxdna/ve2_hwctx.c b/src/driver/amdxdna/ve2_hwctx.c
index becd0bd98..342ff7356 100644
--- a/src/driver/amdxdna/ve2_hwctx.c
+++ b/src/driver/amdxdna/ve2_hwctx.c
@@ -36,6 +36,10 @@ int max_col;
 module_param(max_col, int, 0644);
 MODULE_PARM_DESC(max_col, "Max column supported by this driver");
 
+int enable_debug_queue;
+module_param(enable_debug_queue, int, 0644);
+MODULE_PARM_DESC(enable_debug_queue, "Enable debug queue. It is disabled by default.");
+
 #define CTX_TIMER	(nsecs_to_jiffies(1))
 
 /*
@@ -138,83 +142,87 @@ static int ve2_wait_for_retry_slot(struct amdxdna_ctx *hwctx, u32 timeout_ms)
 }
 
 static struct host_queue_packet *
-hsa_queue_reserve_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_priv *priv, u64 *slot)
+ve2_queue_reserve_slot(struct amdxdna_dev *xdna, const char *qname, struct mutex *hq_lock,
+		       u64 *reserved_write_index, struct host_queue_header *header,
+		       struct ve2_hq_complete *hq_complete, struct host_queue_packet *hq_entry,
+		       u64 *slot)
 {
-	struct ve2_hsa_queue *queue = &priv->hwctx_hsa_queue;
-	struct host_queue_header *header = &queue->hsa_queue_p->hq_header;
-	u32 capacity = header->capacity;
-	u32 slot_idx;
-	u64 outstanding;
+	enum ert_cmd_state state;
+	u64 outstanding, read_index;
+	u32 slot_idx, capacity;
 
-	mutex_lock(&queue->hq_lock);
+	mutex_lock(hq_lock);
+	read_index = header->read_index;
+	capacity = header->capacity;
 	/*
 	 * Check against reserved_write_index to account for in-flight reservations.
 	 */
-	if (queue->reserved_write_index < header->read_index) {
-		XDNA_ERR(xdna, "HSA Queue: reserved_write_index(%llu) < read_index(%llu)",
-			 queue->reserved_write_index, header->read_index);
-		mutex_unlock(&queue->hq_lock);
-		return NULL;
+	if (*reserved_write_index < read_index) {
+		XDNA_ERR(xdna, "%s Queue: reserved_write_index(%llu) < read_index(%llu)",
+			 qname, *reserved_write_index, read_index);
+		mutex_unlock(hq_lock);
+		return ERR_PTR(-EINVAL);
 	}
 
-	outstanding = queue->reserved_write_index - header->read_index;
+	outstanding = *reserved_write_index - read_index;
 	if (outstanding >= capacity) {
 		/* Use DBG level - expected during high queue utilization */
-		XDNA_DBG(xdna, "HSA Queue full: outstanding=%llu >= capacity=%u",
-			 outstanding, capacity);
-		mutex_unlock(&queue->hq_lock);
+		XDNA_DBG(xdna, "%s Queue full: outstanding=%llu >= capacity=%u",
+			 qname, outstanding, capacity);
+		mutex_unlock(hq_lock);
 		return ERR_PTR(-EBUSY);
 	}
 
-	slot_idx = queue->reserved_write_index % capacity;
-	enum ert_cmd_state state = queue->hq_complete.hqc_mem[slot_idx];
+	slot_idx = *reserved_write_index % capacity;
+	state = (enum ert_cmd_state)hq_complete->hqc_mem[slot_idx];
 
 	/*
 	 * Slot can only be reused when it's in INVALID state, which is set by
 	 * ve2_hwctx_job_release() after the job is fully released from pending array.
 	 * Note: ERT_CMD_STATE_INVALID == 0, so this also covers zero-initialized slots.
-	 * This ensures the pending array slot is free before we reserve the HSA queue slot.
+	 * This ensures the pending array slot is free before we reserve the queue slot.
 	 */
 	if (state != ERT_CMD_STATE_INVALID) {
 		XDNA_DBG(xdna, "Slot %u is still in use with state %u", slot_idx, state);
-		mutex_unlock(&queue->hq_lock);
+		mutex_unlock(hq_lock);
 		return ERR_PTR(-EBUSY);
 	}
 
 	/* Reserve this slot by incrementing reserved_write_index. */
-	*slot = queue->reserved_write_index++;
-	queue->hq_complete.hqc_mem[slot_idx] = ERT_CMD_STATE_NEW;
+	*slot = (*reserved_write_index)++;
+	hq_complete->hqc_mem[slot_idx] = ERT_CMD_STATE_NEW;
 
-	mutex_unlock(&queue->hq_lock);
+	mutex_unlock(hq_lock);
 
 	/* Return packet pointer. Caller can now prepare packet in parallel. */
-	return &queue->hsa_queue_p->hq_entry[slot_idx];
+	return &hq_entry[slot_idx];
 }
 
 /* Commit the prepared packet by updating write_index when all prior slots are ready.
  * This ensures CERT sees packets in order even if prepared out-of-order.
+ * Generic version that works with both HSA queue and DBG queue.
  */
-static void hsa_queue_commit_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_priv *priv,
-				  u64 slot)
+static void ve2_queue_commit_slot(struct mutex *hq_lock, u64 *reserved_write_index,
+				  struct host_queue_header *header,
+				  struct host_queue_packet *hq_entry,
+				  struct ve2_hq_complete *hq_complete, u64 slot)
 {
-	struct ve2_hsa_queue *queue = &priv->hwctx_hsa_queue;
-	struct host_queue_header *header = &queue->hsa_queue_p->hq_header;
 	u32 capacity = header->capacity;
 	u32 slot_idx = slot % capacity;
-	struct host_queue_packet *pkt = &queue->hsa_queue_p->hq_entry[slot_idx];
+	struct host_queue_packet *pkt = &hq_entry[slot_idx];
 
-	mutex_lock(&queue->hq_lock);
+	mutex_lock(hq_lock);
 
 	/* Set packet type to valid so CERT can process it */
 	pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
 
 	/* Mark this slot as ready in driver tracking */
-	queue->hq_complete.hqc_mem[slot_idx] = ERT_CMD_STATE_SUBMITTED;
+	hq_complete->hqc_mem[slot_idx] = ERT_CMD_STATE_SUBMITTED;
 
 	/* Advance write_index as far as possible through all ready slots. */
-	while (header->write_index < queue->reserved_write_index) {
+	while (header->write_index < *reserved_write_index) {
 		u32 next_idx = header->write_index % capacity;
-		enum ert_cmd_state state = queue->hq_complete.hqc_mem[next_idx];
+		enum ert_cmd_state state = hq_complete->hqc_mem[next_idx];
 
 		if (state != ERT_CMD_STATE_SUBMITTED)
 			break;
@@ -222,7 +230,41 @@ static void hsa_queue_commit_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_p
 		header->write_index++;
 	}
 
-	mutex_unlock(&queue->hq_lock);
+	mutex_unlock(hq_lock);
+}
+
+/*
+ * Reserve the next slot of the queue described by the arguments and return its
+ * packet. On failure returns NULL and stores the negative errno from
+ * ve2_queue_reserve_slot() in *err; on success *err is 0 and *seq holds the
+ * reserved slot number.
+ */
+static void *ve2_get_queue_pkt(struct amdxdna_ctx *hwctx, u64 *seq, int *err, const char *qname,
+			       struct mutex *hq_lock, u64 *reserved_write_index,
+			       struct host_queue_header *header,
+			       struct ve2_hq_complete *hq_complete,
+			       struct host_queue_packet *hq_entry)
+{
+	struct amdxdna_dev *xdna = hwctx->client->xdna;
+	struct host_queue_packet *pkt;
+
+	pkt = ve2_queue_reserve_slot(xdna,
+				     qname,
+				     hq_lock,
+				     reserved_write_index,
+				     header,
+				     hq_complete,
+				     hq_entry,
+				     seq);
+	if (IS_ERR(pkt)) {
+		*err = PTR_ERR(pkt);
+		/* Expected during retry - use DBG level */
+		XDNA_DBG(xdna, "%s Queue: No slot available (err=%d)", qname, *err);
+		return NULL;
+	}
+
+	*err = 0;
+	return pkt;
 }
 
 static void ve2_job_release(struct kref *ref)
@@ -342,46 +384,33 @@ static inline void ve2_hwctx_job_release(struct amdxdna_ctx *hwctx, struct amdxd
 	mutex_unlock(&hwctx->priv->hwctx_hsa_queue.hq_lock);
 }
 
-static inline struct host_queue_packet *hsa_queue_get_pkt(struct hsa_queue *queue, u64 slot)
+static inline struct host_queue_packet *ve2_queue_get_pkt(struct host_queue_packet *hq_entry,
+							   u32 capacity, u64 slot)
 {
-	return &queue->hq_entry[slot & (queue->hq_header.capacity - 1)];
+	return &hq_entry[slot & (capacity - 1)];
 }
 
-static void *get_host_queue_pkt(struct amdxdna_ctx *hwctx, u64 *seq, int *err)
-{
-	struct amdxdna_dev *xdna = hwctx->client->xdna;
-	struct host_queue_packet *pkt;
-
-	pkt = hsa_queue_reserve_slot(xdna, hwctx->priv, seq);
-	if (IS_ERR(pkt)) {
-		*err = PTR_ERR(pkt);
-		/* Expected during retry - use DBG level */
-		XDNA_DBG(xdna, "No slot available in Host queue (err=%d)", *err);
-		return NULL;
-	}
-
-	*err = 0;
-	return pkt;
-}
-
-static inline void hsa_queue_pkt_set_invalid(struct host_queue_packet *pkt)
+static inline void ve2_queue_pkt_set_invalid(struct host_queue_packet *pkt)
 {
 	pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_INVALID;
 }
 
-static void ve2_free_hsa_queue(struct amdxdna_dev *xdna, struct ve2_hsa_queue *queue)
+/*
+ * Free a queue allocated with dma_alloc_coherent() and reset the caller's
+ * bookkeeping (*queue_p, *dma_addr, *alloc_dev); also destroys hq_lock.
+ * Does nothing when *queue_p is NULL.
+ */
+static void ve2_free_queue(struct amdxdna_dev *xdna, const char *qname, void **queue_p,
+			   struct device **alloc_dev, dma_addr_t *dma_addr, struct mutex *hq_lock,
+			   size_t alloc_size)
 {
-	if (queue->hsa_queue_p) {
-		XDNA_DBG(xdna, "Freeing host queue: dma_addr=0x%llx",
-			 queue->hsa_queue_mem.dma_addr);
-		dma_free_coherent(queue->alloc_dev,
-				  sizeof(struct hsa_queue) + sizeof(u64) * HOST_QUEUE_ENTRY,
-				  queue->hsa_queue_p,
-				  queue->hsa_queue_mem.dma_addr);
-		queue->hsa_queue_p = NULL;
-		queue->hsa_queue_mem.dma_addr = 0;
-		queue->alloc_dev = NULL;
-		mutex_destroy(&queue->hq_lock);
+	if (*queue_p) {
+		XDNA_DBG(xdna, "Freeing %s queue: dma_addr=%pad", qname, dma_addr);
+		dma_free_coherent(*alloc_dev, alloc_size, *queue_p, *dma_addr);
+		*queue_p = NULL;
+		*dma_addr = 0;
+		*alloc_dev = NULL;
+		mutex_destroy(hq_lock);
 	}
 }
 
@@ -470,6 +499,180 @@ void packet_dump(struct amdxdna_dev *xdna, struct hsa_queue *queue, u64 slot_id)
 	}
 }
 
+/*
+ * Allocate and initialize the per-context debug queue.
+ *
+ * The queue and its completion array share one coherent allocation of
+ * sizeof(struct dbg_queue) + sizeof(u64) * HOST_QUEUE_ENTRY bytes. CMA regions
+ * selected by the context's mem_bitmap are tried first, then the platform
+ * device. Returns 0 on success or -ENOMEM when every allocation fails.
+ */
+static int ve2_create_dbg_queue(struct amdxdna_dev *xdna, struct amdxdna_ctx *hwctx,
+				struct ve2_dbg_queue *queue)
+{
+	struct platform_device *pdev = to_platform_device(xdna->ddev.dev);
+	int nslots = HOST_QUEUE_ENTRY;
+	dma_addr_t dma_handle;
+	size_t alloc_size;
+	int r;
+	struct device *alloc_dev;
+
+	alloc_size = sizeof(struct dbg_queue) + sizeof(u64) * nslots;
+	XDNA_DBG(xdna, "Creating dbg queue: nslots=%d, alloc_size=%zu", nslots, alloc_size);
+
+	/* Start from a known state: the fallback below tests this pointer. */
+	queue->dbg_queue_p = NULL;
+	/* Allocate from context's CMA region(s); try bitmap order (region 0, 1, ...). */
+	for (r = 0; r < MAX_MEM_REGIONS; r++) {
+		alloc_dev = xdna->cma_region_devs[r];
+		if ((hwctx->priv->mem_bitmap & (1U << r)) && alloc_dev) {
+			queue->dbg_queue_p = dma_alloc_coherent(alloc_dev, alloc_size,
+								&dma_handle, GFP_KERNEL);
+			if (!queue->dbg_queue_p)
+				continue;
+			queue->alloc_dev = alloc_dev;
+			break;
+		}
+	}
+
+	/* If no allocation succeeded, use the default device */
+	if (!queue->dbg_queue_p) {
+		queue->dbg_queue_p = dma_alloc_coherent(&pdev->dev,
+							alloc_size,
+							&dma_handle,
+							GFP_KERNEL);
+		if (!queue->dbg_queue_p) {
+			XDNA_ERR(xdna, "Failed to allocate dbg queue memory, size=%zu", alloc_size);
+			return -ENOMEM;
+		}
+		queue->alloc_dev = &pdev->dev;
+	}
+	memset(queue->dbg_queue_p, 0, alloc_size);
+	/* Initialize mutex here */
+	mutex_init(&queue->hq_lock);
+	/* Initialize reserved_write_index to track slot reservations */
+	queue->reserved_write_index = 0;
+	/* Set the base DMA address for dbg queue */
+	queue->dbg_queue_mem.dma_addr = dma_handle;
+	/* Calculate the address for hqc_mem within the allocated block */
+	queue->hq_complete.hqc_mem =
+		(u64 *)((char *)queue->dbg_queue_p + sizeof(struct dbg_queue));
+	queue->hq_complete.hqc_dma_addr = queue->dbg_queue_mem.dma_addr + sizeof(struct dbg_queue);
+	queue->dbg_queue_p->hq_header.data_address = queue->dbg_queue_mem.dma_addr +
+		sizeof(struct host_queue_header);
+
+	WARN_ON(!is_power_of_2(nslots));
+	queue->dbg_queue_p->hq_header.capacity = nslots;
+	/* Set dbg queue slots to invalid */
+	for (int slot = 0; slot < nslots; slot++) {
+		ve2_queue_pkt_set_invalid(ve2_queue_get_pkt(queue->dbg_queue_p->hq_entry,
+							    queue->dbg_queue_p->hq_header.capacity,
+							    slot));
+	}
+
+	XDNA_DBG(xdna, "Created dbg queue: dma_addr=0x%llx, capacity=%d, data_addr=0x%llx",
+		 queue->dbg_queue_mem.dma_addr, nslots, queue->dbg_queue_p->hq_header.data_address);
+	return 0;
+}
+
+/*
+ * Submit one command to the context's debug queue and wait for it.
+ *
+ * Fills a struct rw_mem payload (@aie_addr, @paddr split into high/low
+ * halves, @length) into a reserved packet, commits it, then sleeps on
+ * dbg_q_waitq for up to 5 seconds until read_index equals write_index.
+ * The slot is returned to ERT_CMD_STATE_INVALID on every exit taken after
+ * the reservation succeeded.
+ *
+ * Returns 0 on success, -EINVAL if the debug queue does not exist, the
+ * reservation error if no slot is available, -ETIMEDOUT on timeout, or the
+ * negative value returned by the interrupted wait.
+ */
+int submit_command_to_dbg_queue(struct amdxdna_ctx *hwctx, u32 opcode, u32 aie_addr, u64 paddr,
+				u32 length)
+{
+	struct amdxdna_ctx_priv *ve2_ctx = hwctx->priv;
+	struct amdxdna_dev *xdna = hwctx->client->xdna;
+	struct ve2_dbg_queue *dbg_queue;
+	struct xrt_packet_header *hdr;
+	struct host_queue_packet *pkt;
+	struct rw_mem *ebp;
+	long wait_ret = 0;
+	u64 slot_id = 0;
+	int err;
+
+	if (!ve2_ctx || !ve2_ctx->hwctx_dbg_queue.dbg_queue_p) {
+		XDNA_ERR(xdna, "Debug queue is not initialized");
+		return -EINVAL;
+	}
+	dbg_queue = (struct ve2_dbg_queue *)&ve2_ctx->hwctx_dbg_queue;
+	pkt = (struct host_queue_packet *)ve2_get_queue_pkt(hwctx, &slot_id, &err,
+							    "DBG",
+							    &dbg_queue->hq_lock,
+							    &dbg_queue->reserved_write_index,
+							    &dbg_queue->dbg_queue_p->hq_header,
+							    &dbg_queue->hq_complete,
+							    dbg_queue->dbg_queue_p->hq_entry);
+	if (!pkt) {
+		XDNA_ERR(xdna, "Getting host queue packet failed (err=%d)", err);
+		return err;
+	}
+	XDNA_DBG(xdna, "pkt %p of slot %llx is selected", (void *)pkt, slot_id);
+
+	slot_id = slot_id & (dbg_queue->dbg_queue_p->hq_header.capacity - 1);
+
+	hdr = &pkt->xrt_header;
+	hdr->common_header.opcode = opcode;
+	hdr->completion_signal =
+		(u64)(dbg_queue->hq_complete.hqc_dma_addr + slot_id * sizeof(u64));
+
+	XDNA_DBG(xdna, "Debug Queue packet opcode: %u", pkt->xrt_header.common_header.opcode);
+
+	hdr->common_header.count = sizeof(struct rw_mem);
+	hdr->common_header.distribute = 0;
+	hdr->common_header.indirect = 0;
+
+	ebp = (struct rw_mem *)pkt->data;
+	ebp->aie_addr = aie_addr;
+	ebp->host_addr_high = upper_32_bits(paddr);
+	ebp->host_addr_low = lower_32_bits(paddr);
+	ebp->length = length;
+
+	ve2_queue_commit_slot(&dbg_queue->hq_lock, &dbg_queue->reserved_write_index,
+			      &dbg_queue->dbg_queue_p->hq_header, dbg_queue->dbg_queue_p->hq_entry,
+			      &dbg_queue->hq_complete, slot_id);
+
+	wait_ret = wait_event_interruptible_timeout(ve2_ctx->dbg_q_waitq,
+						    dbg_queue->dbg_queue_p->hq_header.read_index ==
+						    dbg_queue->dbg_queue_p->hq_header.write_index,
+						    msecs_to_jiffies(5000));
+
+	if (wait_ret == 0) {
+		XDNA_ERR(xdna, "DBG Queue command wait timeout");
+		err = -ETIMEDOUT;
+		goto cleanup_slot_id;
+	} else if (wait_ret < 0) {
+		XDNA_ERR(xdna, "DBG Queue command wait interrupted");
+		err = wait_ret;
+		goto cleanup_slot_id;
+	}
+
+	XDNA_DBG(xdna, "After command submission write_index is %llx, read_index is %llx",
+		 dbg_queue->dbg_queue_p->hq_header.write_index,
+		 dbg_queue->dbg_queue_p->hq_header.read_index);
+
+	err = 0;
+
+cleanup_slot_id:
+	/* Reset slot to INVALID after completion (success, timeout, or interruption) */
+	mutex_lock(&dbg_queue->hq_lock);
+	dbg_queue->hq_complete.hqc_mem[slot_id] = ERT_CMD_STATE_INVALID;
+	ve2_queue_pkt_set_invalid(pkt);
+	mutex_unlock(&dbg_queue->hq_lock);
+
+	return err;
+}
+
 /*
  * Create hsa queue in kernel and initialize queue slots.
  */
@@ -511,7 +714,7 @@ static int ve2_create_host_queue(struct amdxdna_dev *xdna, struct amdxdna_ctx *h
 		}
 		queue->alloc_dev = xdna->ddev.dev;
 	}
-
+	memset(queue->hsa_queue_p, 0, alloc_size);
 	/* Initialize mutex here */
 	mutex_init(&queue->hq_lock);
 	/* Initialize reserved_write_index to track slot reservations */
@@ -532,7 +735,9 @@ static int ve2_create_host_queue(struct amdxdna_dev *xdna, struct amdxdna_ctx *h
 	for (int slot = 0; slot < nslots; slot++) {
 		struct host_queue_indirect_hdr *hdr = &queue->hsa_queue_p->hq_indirect_hdr[slot];
 
-		hsa_queue_pkt_set_invalid(hsa_queue_get_pkt(queue->hsa_queue_p, slot));
+		ve2_queue_pkt_set_invalid(ve2_queue_get_pkt(queue->hsa_queue_p->hq_entry,
+							    queue->hsa_queue_p->hq_header.capacity,
+							    slot));
 		hdr->header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
 		hdr->header.opcode = HOST_QUEUE_PACKET_EXEC_BUF;
 		hdr->header.count = 0;
@@ -571,14 +776,19 @@ static int submit_command_indirect(struct amdxdna_ctx *hwctx, void *cmd_data, u6
 	u64 slot_id = 0;
 
 	dpu = (struct ve2_dpu_data *)cmd_data;
+	hq_queue = (struct ve2_hsa_queue *)&ve2_ctx->hwctx_hsa_queue;
 
-	pkt = hsa_queue_reserve_slot(xdna, ve2_ctx, &slot_id);
+	pkt = ve2_queue_reserve_slot(xdna, "HSA", &hq_queue->hq_lock,
+				     &hq_queue->reserved_write_index,
+				     &hq_queue->hsa_queue_p->hq_header,
+				     &hq_queue->hq_complete,
+				     hq_queue->hsa_queue_p->hq_entry,
+				     &slot_id);
 	if (IS_ERR(pkt)) {
 		XDNA_DBG(xdna, "No slot available in Host queue");
 		return PTR_ERR(pkt);
 	}
 
-	hq_queue = (struct ve2_hsa_queue *)&ve2_ctx->hwctx_hsa_queue;
 	queue = (struct hsa_queue *)hq_queue->hsa_queue_p;
 
 	*seq = slot_id;
@@ -647,7 +857,9 @@ static int submit_command_indirect(struct amdxdna_ctx *hwctx, void *cmd_data, u6
 	packet_dump(xdna, queue, slot_id);
 
 	/* Commit the slot - this sets hqc_mem to SUBMITTED and advances write_index */
-	hsa_queue_commit_slot(xdna, ve2_ctx, *seq);
+	ve2_queue_commit_slot(&hq_queue->hq_lock, &hq_queue->reserved_write_index,
+			      &hq_queue->hsa_queue_p->hq_header, hq_queue->hsa_queue_p->hq_entry,
+			      &hq_queue->hq_complete, *seq);
 
 	return 0;
 }
@@ -669,7 +881,14 @@ static int submit_command(struct amdxdna_ctx *hwctx, void *cmd_data, u64 *seq, b
 		return -EINVAL;
 	}
 
-	pkt = (struct host_queue_packet *)get_host_queue_pkt(hwctx, &slot_id, &err);
+	pkt = (struct host_queue_packet *)ve2_get_queue_pkt(hwctx, &slot_id, &err,
+							    "HSA",
+							    &hq_queue->hq_lock,
+							    &hq_queue->reserved_write_index,
+							    &hq_queue->hsa_queue_p->hq_header,
+							    &hq_queue->hq_complete,
+							    hq_queue->hsa_queue_p->hq_entry);
+
 	if (!pkt) {
 		/* Expected during retry - use DBG level */
 		XDNA_DBG(xdna, "Getting host queue packet failed (err=%d)", err);
@@ -685,8 +904,7 @@ static int submit_command(struct amdxdna_ctx *hwctx, void *cmd_data, u64 *seq, b
 	hdr->common_header.chain_flag = last_cmd ? LAST_CMD : NOT_LAST_CMD;
 	hdr->completion_signal =
 		(u64)(hq_queue->hq_complete.hqc_dma_addr + slot_id * sizeof(u64));
-#define XRT_PKT_OPCODE(p) ((p)->xrt_header.common_header.opcode)
-	XDNA_DBG(xdna, "Queue packet opcode: %u\n", XRT_PKT_OPCODE(pkt));
+	XDNA_DBG(xdna, "Queue packet opcode: %u", pkt->xrt_header.common_header.opcode);
 
 	hdr->common_header.count = sizeof(struct exec_buf);
 	hdr->common_header.distribute = 0;
@@ -706,7 +924,9 @@ static int submit_command(struct amdxdna_ctx *hwctx, void *cmd_data, u64 *seq, b
 	XDNA_DBG(xdna, "dpu instruction addr: 0x%llx", dpu_cmd->instruction_buffer);
 
 	/* Commit the slot - this sets hqc_mem to SUBMITTED and advances write_index */
-	hsa_queue_commit_slot(xdna, ve2_ctx, *seq);
+	ve2_queue_commit_slot(&hq_queue->hq_lock, &hq_queue->reserved_write_index,
+			      &hq_queue->hsa_queue_p->hq_header,
+			      hq_queue->hsa_queue_p->hq_entry, &hq_queue->hq_complete, *seq);
 
 	return 0;
 }
@@ -1216,6 +1436,22 @@ static void timeout_cb(struct timer_list *t)
 	mod_timer(&priv->event_timer, jiffies + CTX_TIMER);
 }
 
+/*
+ * Periodic poll for the debug queue: wakes any waiter on dbg_q_waitq so it
+ * re-checks the queue indices, then re-arms itself. It stops re-arming once
+ * the debug queue pointer is NULL.
+ */
+static void dbg_q_timeout_cb(struct timer_list *t)
+{
+	struct amdxdna_ctx_priv *priv = from_timer(priv, t, dbg_q_timer);
+
+	if (!priv || !priv->hwctx_dbg_queue.dbg_queue_p)
+		return;
+
+	wake_up_interruptible(&priv->dbg_q_waitq);
+	mod_timer(&priv->dbg_q_timer, jiffies + CTX_TIMER);
+}
+
 static void ve2_clear_firmware_status(struct amdxdna_dev *xdna, struct amdxdna_ctx *hwctx)
 {
 	struct amdxdna_ctx_priv *priv = hwctx->priv;
@@ -1263,7 +1499,6 @@ int ve2_hwctx_init(struct amdxdna_ctx *hwctx)
 		XDNA_ERR(xdna, "Failed to create host queue, ret=%d", ret);
 		goto cleanup_xrs;
 	}
-
 	if (enable_polling) {
 		XDNA_DBG(xdna, "Running in timer mode");
 		timer_setup(&priv->event_timer, timeout_cb, 0);
@@ -1272,6 +1507,18 @@ int ve2_hwctx_init(struct amdxdna_ctx *hwctx)
 		XDNA_DBG(xdna, "Running in interrupt mode");
 	}
 
+	if (enable_debug_queue) {
+		/* one dbg_queue entry per hwctx */
+		ret = ve2_create_dbg_queue(xdna, hwctx, &priv->hwctx_dbg_queue);
+		if (ret) {
+			XDNA_ERR(xdna, "Failed to create dbg queue, ret=%d", ret);
+			goto free_hsa_queue;
+		}
+		init_waitqueue_head(&priv->dbg_q_waitq);
+		timer_setup(&priv->dbg_q_timer, dbg_q_timeout_cb, 0);
+		mod_timer(&priv->dbg_q_timer, jiffies + CTX_TIMER);
+	}
+
 	if (verbosity >= VERBOSITY_LEVEL_DBG)
 		ve2_clear_firmware_status(xdna, hwctx);
 
@@ -1284,6 +1531,15 @@ int ve2_hwctx_init(struct amdxdna_ctx *hwctx)
 
 	return 0;
 
+free_hsa_queue:
+	/* event_timer is set up above in polling mode and timeout_cb re-arms it; stop it first. */
+	if (enable_polling)
+		del_timer_sync(&priv->event_timer);
+	ve2_free_queue(xdna, "HSA", (void **)&hwctx->priv->hwctx_hsa_queue.hsa_queue_p,
+		       &hwctx->priv->hwctx_hsa_queue.alloc_dev,
+		       &hwctx->priv->hwctx_hsa_queue.hsa_queue_mem.dma_addr,
+		       &hwctx->priv->hwctx_hsa_queue.hq_lock,
+		       sizeof(struct hsa_queue) + sizeof(u64) * HOST_QUEUE_ENTRY);
 cleanup_xrs:
 	/* Releases XRS and partition (ve2_mgmt_destroy_partition calls ve2_xrs_release). */
 	ve2_mgmt_destroy_partition(hwctx);
@@ -1307,6 +1563,10 @@ void ve2_hwctx_fini(struct amdxdna_ctx *hwctx)
 		 hwctx, nhwctx->start_col, nhwctx->num_col,
 		 hwctx->submitted, hwctx->completed);
 
+	/* Key off the queue itself: enable_debug_queue is writable at runtime (0644). */
+	if (hwctx->priv && hwctx->priv->hwctx_dbg_queue.dbg_queue_p)
+		del_timer_sync(&hwctx->priv->dbg_q_timer);
+
 	if (enable_polling)
 		del_timer_sync(&hwctx->priv->event_timer);
 
@@ -1354,7 +1614,19 @@ void ve2_hwctx_fini(struct amdxdna_ctx *hwctx)
 		ve2_get_firmware_status(hwctx);
 
 	ve2_mgmt_destroy_partition(hwctx);
-	ve2_free_hsa_queue(xdna, &hwctx->priv->hwctx_hsa_queue);
+	ve2_free_queue(xdna, "HSA", (void **)&hwctx->priv->hwctx_hsa_queue.hsa_queue_p,
+		       &hwctx->priv->hwctx_hsa_queue.alloc_dev,
+		       &hwctx->priv->hwctx_hsa_queue.hsa_queue_mem.dma_addr,
+		       &hwctx->priv->hwctx_hsa_queue.hq_lock,
+		       sizeof(struct hsa_queue) + sizeof(u64) * HOST_QUEUE_ENTRY);
+	if (hwctx->priv->hwctx_dbg_queue.dbg_queue_p) {
+		ve2_free_queue(xdna, "DBG", (void **)&hwctx->priv->hwctx_dbg_queue.dbg_queue_p,
+			       &hwctx->priv->hwctx_dbg_queue.alloc_dev,
+			       &hwctx->priv->hwctx_dbg_queue.dbg_queue_mem.dma_addr,
+			       &hwctx->priv->hwctx_dbg_queue.hq_lock,
+			       sizeof(struct dbg_queue) + sizeof(u64) * HOST_QUEUE_ENTRY);
+	}
+
 	kfree(hwctx->priv->hwctx_config);
 	mutex_destroy(&hwctx->priv->privctx_lock);
 	kfree(hwctx->priv);
diff --git a/src/driver/amdxdna/ve2_mgmt.c b/src/driver/amdxdna/ve2_mgmt.c
index 1c787f8b9..869010e3e 100644
--- a/src/driver/amdxdna/ve2_mgmt.c
+++ b/src/driver/amdxdna/ve2_mgmt.c
@@ -30,10 +30,15 @@ static void cert_setup_partition(struct amdxdna_dev *xdna,
 	u32 start_col = nhwctx->start_col;
 	u32 num_col = nhwctx->num_col;
 	u64 hsa_addr = 0xFFFFFFFFFFFFFFFF;
+	u64 dbg_addr = 0xFFFFFFFFFFFFFFFF;
+
 	struct ve2_config_hwctx *hwctx_cfg = &nhwctx->hwctx_config[col];
 
-	if (col == 0)
+	if (col == 0) {
 		hsa_addr = nhwctx->hwctx_hsa_queue.hsa_queue_mem.dma_addr;
+		if (nhwctx->hwctx_dbg_queue.dbg_queue_p)
+			dbg_addr = nhwctx->hwctx_dbg_queue.dbg_queue_mem.dma_addr;
+	}
 
 	u32 lead_col_addr = VE2_ADDR(start_col, 0, 0);
 
@@ -41,6 +46,8 @@ static void cert_setup_partition(struct amdxdna_dev *xdna,
 	cert_hs->aie_info.partition_size = num_col;
 	cert_hs->hsa_addr_high = upper_32_bits(hsa_addr);
 	cert_hs->hsa_addr_low = lower_32_bits(hsa_addr);
+	cert_hs->dbg.hsa_addr_high = upper_32_bits(dbg_addr);
+	cert_hs->dbg.hsa_addr_low = lower_32_bits(dbg_addr);
 	cert_hs->log_addr_high = upper_32_bits(hwctx_cfg->log_buf_addr);
 	cert_hs->log_addr_low = lower_32_bits(hwctx_cfg->log_buf_addr);
 	cert_hs->log_buf_size = hwctx_cfg->log_buf_size;
@@ -57,8 +64,6 @@ static void cert_setup_partition(struct amdxdna_dev *xdna,
 	cert_hs->ctx_switch_req = 0;
 	cert_hs->hsa_location = 0;
 
-	cert_hs->dbg.hsa_addr_high = 0xFFFFFFFF;
-	cert_hs->dbg.hsa_addr_low = 0xFFFFFFFF;
 	cert_hs->mpaie_alive = ALIVE_MAGIC;
 }
 
diff --git a/src/driver/amdxdna/ve2_mgmt.h b/src/driver/amdxdna/ve2_mgmt.h
index 54c0791d0..bdbcaaa1e 100644
--- a/src/driver/amdxdna/ve2_mgmt.h
+++ b/src/driver/amdxdna/ve2_mgmt.h
@@ -66,6 +66,11 @@ struct misc_info {
 	u32 ppc;
 };
 
+extern int enable_debug_queue;
+
+int submit_command_to_dbg_queue(struct amdxdna_ctx *hwctx, u32 opcode,
+				u32 aie_addr, u64 paddr, u32 length);
+
 // Read from handshake memory
 static inline int
 ve2_partition_read_privileged_mem(struct device *aie_dev, u32 col,
diff --git a/src/driver/amdxdna/ve2_of.h b/src/driver/amdxdna/ve2_of.h
index 2b69388ef..0c89a1cb7 100644
--- a/src/driver/amdxdna/ve2_of.h
+++ b/src/driver/amdxdna/ve2_of.h
@@ -89,6 +89,9 @@ struct amdxdna_ctx_priv {
 	struct device			*aie_dev;
 	struct aie_partition_init_args	*args;
 	struct ve2_hsa_queue		hwctx_hsa_queue;
+	struct ve2_dbg_queue		hwctx_dbg_queue;
+	struct timer_list		dbg_q_timer;
+	wait_queue_head_t		dbg_q_waitq;
 	struct ve2_config_hwctx		*hwctx_config;
 	wait_queue_head_t		waitq;
 	struct amdxdna_sched_job	*pending[HWCTX_MAX_CMDS];