diff --git a/src/driver/amdxdna/ve2_debug.c b/src/driver/amdxdna/ve2_debug.c
index 2b12355fd..0aeaf2cc2 100644
--- a/src/driver/amdxdna/ve2_debug.c
+++ b/src/driver/amdxdna/ve2_debug.c
@@ -17,6 +17,8 @@
 #include "amdxdna_error.h"
 #include "amdxdna_drm.h"
 
+extern int enable_debug_queue;
+
 static int ve2_query_ctx_status_array(struct amdxdna_client *client,
 				      struct amdxdna_drm_hwctx_entry *tmp,
 				      pid_t pid, u32 ctx_id)
@@ -151,6 +153,53 @@ static int ve2_get_array_hwctx(struct amdxdna_client *client,
 	return ret;
 }
 
+/*
+ * Read or write AIE tile memory through the firmware debug queue.
+ * @col/@row select the tile, @addr is the offset added to the tile base, @data
+ * is a kernel buffer of @size bytes and @cmd_type is an enum dbg_cmd_type value.
+ * DBG_CMD_WRITE sends @data to the device, DBG_CMD_READ fills @data from it.
+ * The payload travels through a temporary coherent DMA bounce buffer.
+ * Returns 0 on success or a negative errno.
+ */
+static int ve2_dbg_queue_data_rw(struct amdxdna_dev *xdev, struct amdxdna_ctx *hwctx,
+				 u32 col, u32 row, u32 addr, void *data, size_t size,
+				 int cmd_type)
+{
+	struct platform_device *pdev = to_platform_device(xdev->ddev.dev);
+	dma_addr_t dma_handle;
+	void *virt_ptr;
+	int ret;
+
+	/* Length is handed over in 32-bit words; a zero-sized DMA allocation is invalid. */
+	if (!size || size % 4 != 0) {
+		XDNA_ERR(xdev, "Size (%zu) must be a non-zero multiple of 4 bytes", size);
+		return -EINVAL;
+	}
+	/* Reject unknown commands before spending a DMA allocation on them. */
+	if (cmd_type != DBG_CMD_WRITE && cmd_type != DBG_CMD_READ &&
+	    cmd_type != DBG_CMD_EXIT) {
+		XDNA_ERR(xdev, "CMD_TYPE is not supported");
+		return -EINVAL;
+	}
+
+	virt_ptr = dma_alloc_coherent(&pdev->dev, size, &dma_handle, GFP_KERNEL);
+	if (!virt_ptr) {
+		XDNA_ERR(xdev, "Failed to allocate DMA buffer");
+		return -ENOMEM;
+	}
+	if (cmd_type == DBG_CMD_WRITE)
+		memcpy(virt_ptr, data, size);
+
+	addr += (col << VE2_COL_SHIFT) + (row << VE2_ROW_SHIFT);
+	ret = submit_command_to_dbg_queue(hwctx, cmd_type, addr, (u64)dma_handle, size / 4);
+	if (!ret && cmd_type == DBG_CMD_READ)
+		memcpy(data, virt_ptr, size);
+
+	/* NOTE(review): after a timeout the firmware may still access this buffer. */
+	dma_free_coherent(&pdev->dev, size, virt_ptr, dma_handle);
+	return ret;
+}
+
 static int ve2_aie_write(struct amdxdna_client *client,
 			 struct amdxdna_drm_set_state *args)
 {
@@ -171,7 +220,7 @@ static int ve2_aie_write(struct amdxdna_client *client,
 		return -EFAULT;
 	}
 
-	XDNA_DBG(xdna, "Write request for ctx_id: %u, col: %u, row: %u, addr: 0x%x, size: %u\n",
+	XDNA_DBG(xdna, "Write request for ctx_id: %u, col: %u, row: %u, addr: 0x%x, size: %u",
 		 footer.context_id, footer.col, footer.row, footer.addr, footer.size);
 
 	/* Find the hardware context */
@@ -187,24 +236,24 @@ static int ve2_aie_write(struct amdxdna_client *client,
 	}
 
 	if (!hwctx) {
-		XDNA_ERR(xdna, "hw context :%u pid:%llu not found\n", footer.context_id,
+		XDNA_ERR(xdna, "hw context :%u pid:%llu not found", footer.context_id,
 			 footer.pid);
 		return -EINVAL;
 	}
 
-	XDNA_DBG(xdna, "Found hwctx: cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u\n",
+	XDNA_DBG(xdna, "Found hwctx: cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u",
 		 hwctx->client->pid, hwctx->id, hwctx->start_col, hwctx->num_col);
 
 	/* Validate column is within partition */
 	if (footer.col >= hwctx->num_col) {
-		XDNA_ERR(xdna, "Column %u is outside partition range [0, %u)\n",
+		XDNA_ERR(xdna, "Column %u is outside partition range [0, %u)",
 			 footer.col, hwctx->num_col);
 		return -EINVAL;
 	}
 
 	/* Validate row */
 	if (footer.row >= xdna->dev_handle->aie_dev_info.rows) {
-		XDNA_ERR(xdna, "Row %u is outside range [0, %u)\n",
+		XDNA_ERR(xdna, "Row %u is outside range [0, %u)",
 			 footer.row, xdna->dev_handle->aie_dev_info.rows);
 		return -EINVAL;
 	}
@@ -212,7 +261,7 @@ static int ve2_aie_write(struct amdxdna_client *client,
 	/* Get AIE device handle */
 	aie_dev = hwctx->priv->aie_dev;
 	if (!aie_dev) {
-		XDNA_ERR(xdna, "AIE device handle not found\n");
+		XDNA_ERR(xdna, "AIE device handle not found");
 		return -EINVAL;
 	}
 
@@ -223,16 +272,30 @@ static int ve2_aie_write(struct amdxdna_client *client,
 
 	/* Copy data from user space (data is at the beginning of buffer) */
 	if (copy_from_user(local_buf, u64_to_user_ptr(args->buffer), footer.size)) {
-		XDNA_ERR(xdna, "Error: unable to copy data from userptr\n");
+		XDNA_ERR(xdna, "Error: unable to copy data from userptr");
 		kfree(local_buf);
 		return -EFAULT;
 	}
 
 	/* Write to AIE memory */
-	ret = ve2_partition_write(aie_dev, footer.col, footer.row, footer.addr,
-				  footer.size, local_buf);
+	//TODO This is temporary fix to exit the debug queue.
+	if (enable_debug_queue) {
+		if (footer.col == 3) {
+			ret = ve2_dbg_queue_data_rw(xdna, hwctx, footer.col, footer.row,
+						    footer.addr, local_buf, footer.size,
+						    DBG_CMD_EXIT);
+		} else {
+			ret = ve2_dbg_queue_data_rw(xdna, hwctx, footer.col, footer.row,
+						    footer.addr, local_buf, footer.size,
+						    DBG_CMD_WRITE);
+		}
+	} else {
+		ret = ve2_partition_write(aie_dev, footer.col, footer.row, footer.addr,
+					  footer.size, local_buf);
+	}
+
 	if (ret < 0) {
-		XDNA_ERR(xdna, "Error in AIE memory write operation, err: %d\n", ret);
+		XDNA_ERR(xdna, "Error in AIE memory write operation, err: %d", ret);
 		kfree(local_buf);
 		return ret;
 	}
@@ -261,7 +324,7 @@ static int ve2_aie_read(struct amdxdna_client *client, struct amdxdna_drm_get_ar
 		return -EFAULT;
 	}
 
-	XDNA_DBG(xdna, "Read request for ctx_id: %u, col: %u, row: %u, addr: 0x%x, size: %u\n",
+	XDNA_DBG(xdna, "Read request for ctx_id: %u, col: %u, row: %u, addr: 0x%x, size: %u",
 		 footer.context_id, footer.col, footer.row, footer.addr, footer.size);
 
 	/* Find the hardware context */
@@ -277,24 +340,24 @@ static int ve2_aie_read(struct amdxdna_client *client, struct amdxdna_drm_get_ar
 	}
 
 	if (!hwctx) {
-		XDNA_ERR(xdna, "hw context :%u pid:%llu not found\n", footer.context_id,
+		XDNA_ERR(xdna, "hw context :%u pid:%llu not found", footer.context_id,
 			 footer.pid);
 		return -EINVAL;
 	}
 
-	XDNA_DBG(xdna, "Found hwctx: cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u\n",
+	XDNA_DBG(xdna, "Found hwctx: cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u",
 		 hwctx->client->pid, hwctx->id, hwctx->start_col, hwctx->num_col);
 
 	/* Validate column is within partition */
 	if (footer.col >= hwctx->num_col) {
-		XDNA_ERR(xdna, "Column %u is outside partition range [0, %u)\n",
+		XDNA_ERR(xdna, "Column %u is outside partition range [0, %u)",
 			 footer.col, hwctx->num_col);
 		return -EINVAL;
 	}
 
 	/* Validate row */
 	if (footer.row >= xdna->dev_handle->aie_dev_info.rows) {
-		XDNA_ERR(xdna, "Row %u is outside range [0, %u)\n",
+		XDNA_ERR(xdna, "Row %u is outside range [0, %u)",
 			 footer.row, xdna->dev_handle->aie_dev_info.rows);
 		return -EINVAL;
 	}
@@ -302,7 +365,7 @@ static int ve2_aie_read(struct amdxdna_client *client, struct amdxdna_drm_get_ar
 	/* Get AIE device handle and relative column */
 	aie_dev = hwctx->priv->aie_dev;
 	if (!aie_dev) {
-		XDNA_ERR(xdna, "AIE device handle not found\n");
+		XDNA_ERR(xdna, "AIE device handle not found");
 		return -EINVAL;
 	}
 
@@ -312,17 +375,22 @@ static int ve2_aie_read(struct amdxdna_client *client, struct amdxdna_drm_get_ar
 		return -ENOMEM;
 
 	/* Read from AIE memory */
-	ret = ve2_partition_read(aie_dev, footer.col, footer.row, footer.addr,
-				 footer.size, local_buf);
+	if (enable_debug_queue) {
+		ret = ve2_dbg_queue_data_rw(xdna, hwctx, footer.col, footer.row,
+					    footer.addr, local_buf, footer.size, DBG_CMD_READ);
+	} else {
+		ret = ve2_partition_read(aie_dev, footer.col, footer.row, footer.addr,
+					 footer.size, local_buf);
+	}
 	if (ret < 0) {
-		XDNA_ERR(xdna, "Error in AIE memory read operation, err: %d\n", ret);
+		XDNA_ERR(xdna, "Error in AIE memory read operation, err: %d", ret);
 		kfree(local_buf);
 		return ret;
 	}
 
 	/* Copy data to user space */
 	if (copy_to_user(u64_to_user_ptr(args->buffer), local_buf, footer.size)) {
-		XDNA_ERR(xdna, "Error: unable to copy memory to userptr\n");
+		XDNA_ERR(xdna, "Error: unable to copy memory to userptr");
 		kfree(local_buf);
 		return -EFAULT;
 	}
@@ -366,18 +434,18 @@ static int ve2_coredump_read(struct amdxdna_client *client, struct amdxdna_drm_g
 	}
 
 	if (!hwctx) {
-		XDNA_ERR(xdna, "hw context :%u pid:%llu not found\n", footer.context_id,
+		XDNA_ERR(xdna, "hw context :%u pid:%llu not found", footer.context_id,
 			 footer.pid);
 		return -EINVAL;
 	}
 
-	XDNA_DBG(xdna, "cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u\n",
+	XDNA_DBG(xdna, "cl_pid: %u, hwctx_id: %u, start_col %u, ncol %u",
 		 hwctx->client->pid, hwctx->id, hwctx->start_col,
 		 hwctx->num_col);
 
 	rel_size = hwctx->priv->num_col * xdna->dev_handle->aie_dev_info.rows * TILE_ADDRESS_SPACE;
 	if (rel_size > buf_size) {
-		XDNA_DBG(xdna, "Invalid buffer size:%d (rel_size:%d)\n", buf_size, rel_size);
+		XDNA_DBG(xdna, "Invalid buffer size:%d (rel_size:%d)", buf_size, rel_size);
 		args->element_size = rel_size;
 		return -ENOBUFS;
 	}
@@ -387,16 +455,16 @@ static int ve2_coredump_read(struct amdxdna_client *client, struct amdxdna_drm_g
 		return -ENOMEM;
 
 	ret = ve2_create_coredump(xdna, hwctx, local_buf, rel_size);
-	XDNA_DBG(xdna, "created dump of size:%d\n", ret);
+	XDNA_DBG(xdna, "created dump of size:%d", ret);
 
 	if (ret < 0) {
-		XDNA_ERR(xdna, "Error in AIE Data mem read operation, err: %d\n", ret);
+		XDNA_ERR(xdna, "Error in AIE Data mem read operation, err: %d", ret);
 		vfree(local_buf);
 		return ret;
 	}
 
 	if (copy_to_user(u64_to_user_ptr(args->buffer), local_buf, ret)) {
-		XDNA_ERR(xdna, "Error: unable to copy memory to userptr\n");
+		XDNA_ERR(xdna, "Error: unable to copy memory to userptr");
 		vfree(local_buf);
 		return -EFAULT;
 	}
@@ -684,7 +752,7 @@ static int ve2_get_array_async_error(struct amdxdna_dev *xdna, struct amdxdna_dr
 			XDNA_DBG(xdna, "Waiting for error callback to complete on mgmtctx[%u]", i);
 			if (wait_for_completion_timeout(&mgmtctx->error_cb_completion,
 							wait_timeout) == 0) {
-				XDNA_WARN(xdna, "Timeout waiting for err callback completion\n");
+				XDNA_WARN(xdna, "Timeout waiting for err callback completion");
 			}
 		}
 	}
diff --git a/src/driver/amdxdna/ve2_host_queue.h b/src/driver/amdxdna/ve2_host_queue.h
index c492b398e..5afef1678 100644
--- a/src/driver/amdxdna/ve2_host_queue.h
+++ b/src/driver/amdxdna/ve2_host_queue.h
@@ -123,6 +123,35 @@ struct ve2_hsa_queue {
 	struct device			*alloc_dev;
 };
 
+enum dbg_cmd_type {			/* opcode of a debug queue packet */
+	DBG_CMD_EXIT = 11,		/* debug queue exit request (see ve2_aie_write) */
+	DBG_CMD_READ = 12,		/* AIE memory -> host buffer */
+	DBG_CMD_WRITE = 13,		/* host buffer -> AIE memory */
+};
+
+struct rw_mem {				/* payload of a debug queue packet */
+	u32				aie_addr;	/* target address in AIE space */
+	u32				length;		/* 32-bit words (caller passes bytes/4) */
+	u32				host_addr_high;	/* upper_32_bits() of host DMA addr */
+	u32				host_addr_low;	/* lower_32_bits() of host DMA addr */
+};
+
+struct dbg_queue {			/* queue memory whose DMA address goes to firmware */
+	struct host_queue_header	hq_header;
+	struct host_queue_packet	hq_entry[HOST_QUEUE_ENTRY];	/* packet slots */
+};
+
+struct ve2_dbg_queue {			/* driver-side bookkeeping for one debug queue */
+	struct dbg_queue		*dbg_queue_p;	/* CPU address of the queue memory */
+	struct ve2_mem			dbg_queue_mem;	/* DMA address of the queue memory */
+	struct ve2_hq_complete		hq_complete;	/* per-slot completion words */
+	/* hq_lock protects [read | write]_index and reserved_write_index */
+	struct mutex			hq_lock;
+	u64				reserved_write_index;	/* next index to reserve */
+	/* Device the queue memory was allocated from (needed to free it) */
+	struct device			*alloc_dev;
+};
+
 /* handshake */
 #define ALIVE_MAGIC 0x404C5645
 struct handshake {
diff --git a/src/driver/amdxdna/ve2_hwctx.c b/src/driver/amdxdna/ve2_hwctx.c
index becd0bd98..342ff7356 100644
--- a/src/driver/amdxdna/ve2_hwctx.c
+++ b/src/driver/amdxdna/ve2_hwctx.c
@@ -36,6 +36,10 @@ int max_col;
 module_param(max_col, int, 0644);
 MODULE_PARM_DESC(max_col, "Max column supported by this driver");
 
+int enable_debug_queue;	/* read at hwctx init and fini: must not change in between */
+module_param(enable_debug_queue, int, 0444);
+MODULE_PARM_DESC(enable_debug_queue, "Enable debug queue. It is disabled by default.");
+
 #define CTX_TIMER	(nsecs_to_jiffies(1))
 
 /*
@@ -138,83 +142,87 @@ static int ve2_wait_for_retry_slot(struct amdxdna_ctx *hwctx, u32 timeout_ms)
 }
 
 static struct host_queue_packet *
-hsa_queue_reserve_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_priv *priv, u64 *slot)
+ve2_queue_reserve_slot(struct amdxdna_dev *xdna, const char *qname, struct mutex *hq_lock,
+		       u64 *reserved_write_index, struct host_queue_header *header,
+		       struct ve2_hq_complete *hq_complete, struct host_queue_packet *hq_entry,
+		       u64 *slot)
 {
-	struct ve2_hsa_queue *queue = &priv->hwctx_hsa_queue;
-	struct host_queue_header *header = &queue->hsa_queue_p->hq_header;
-	u32 capacity = header->capacity;
-	u32 slot_idx;
+	enum ert_cmd_state state;
 	u64 outstanding;
+	u32 slot_idx;
 
-	mutex_lock(&queue->hq_lock);
+	mutex_lock(hq_lock);
 
+	u64 read_index = header->read_index;
+	u32 capacity = header->capacity;
 	/*
 	 * Check against reserved_write_index to account for in-flight reservations.
 	 */
-	if (queue->reserved_write_index < header->read_index) {
-		XDNA_ERR(xdna, "HSA Queue: reserved_write_index(%llu) < read_index(%llu)",
-			 queue->reserved_write_index, header->read_index);
-		mutex_unlock(&queue->hq_lock);
-		return NULL;
+	if (*reserved_write_index < read_index) {
+		XDNA_ERR(xdna, "%s Queue: reserved_write_index(%llu) < read_index(%llu)",
+			 qname, *reserved_write_index, read_index);
+		mutex_unlock(hq_lock);
+		return ERR_PTR(-EINVAL);
 	}
 
-	outstanding = queue->reserved_write_index - header->read_index;
+	outstanding = *reserved_write_index - read_index;
 	if (outstanding >= capacity) {
 		/* Use DBG level - expected during high queue utilization */
-		XDNA_DBG(xdna, "HSA Queue full: outstanding=%llu >= capacity=%u",
-			 outstanding, capacity);
-		mutex_unlock(&queue->hq_lock);
+		XDNA_DBG(xdna, "%s Queue full: outstanding=%llu >= capacity=%u",
+			 qname, outstanding, capacity);
+		mutex_unlock(hq_lock);
 		return ERR_PTR(-EBUSY);
 	}
 
-	slot_idx = queue->reserved_write_index % capacity;
-	enum ert_cmd_state state = queue->hq_complete.hqc_mem[slot_idx];
+	slot_idx = *reserved_write_index % capacity;
+	state = (enum ert_cmd_state)hq_complete->hqc_mem[slot_idx];
 
 	/*
 	 * Slot can only be reused when it's in INVALID state, which is set by
 	 * ve2_hwctx_job_release() after the job is fully released from pending array.
 	 * Note: ERT_CMD_STATE_INVALID == 0, so this also covers zero-initialized slots.
-	 * This ensures the pending array slot is free before we reserve the HSA queue slot.
+	 * This ensures the pending array slot is free before we reserve the queue slot.
 	 */
 	if (state != ERT_CMD_STATE_INVALID) {
 		XDNA_DBG(xdna, "Slot %u is still in use with state %u", slot_idx, state);
-		mutex_unlock(&queue->hq_lock);
+		mutex_unlock(hq_lock);
 		return ERR_PTR(-EBUSY);
 	}
 
 	/* Reserve this slot by incrementing reserved_write_index. */
-	*slot = queue->reserved_write_index++;
-	queue->hq_complete.hqc_mem[slot_idx] = ERT_CMD_STATE_NEW;
+	*slot = (*reserved_write_index)++;
+	hq_complete->hqc_mem[slot_idx] = ERT_CMD_STATE_NEW;
 
-	mutex_unlock(&queue->hq_lock);
+	mutex_unlock(hq_lock);
 
 	/* Return packet pointer. Caller can now prepare packet in parallel. */
-	return &queue->hsa_queue_p->hq_entry[slot_idx];
+	return &hq_entry[slot_idx];
 }
 
 /* Commit the prepared packet by updating write_index when all prior slots are ready.
  * This ensures CERT sees packets in order even if prepared out-of-order.
+ * Generic version that works with both HSA queue and DBG queue.
  */
-static void hsa_queue_commit_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_priv *priv,
-				  u64 slot)
+static void ve2_queue_commit_slot(struct mutex *hq_lock, u64 *reserved_write_index,
+				  struct host_queue_header *header,
+				  struct host_queue_packet *hq_entry,
+				  struct ve2_hq_complete *hq_complete, u64 slot)
 {
-	struct ve2_hsa_queue *queue = &priv->hwctx_hsa_queue;
-	struct host_queue_header *header = &queue->hsa_queue_p->hq_header;
 	u32 capacity = header->capacity;
 	u32 slot_idx = slot % capacity;
-	struct host_queue_packet *pkt = &queue->hsa_queue_p->hq_entry[slot_idx];
+	struct host_queue_packet *pkt = &hq_entry[slot_idx];
 
-	mutex_lock(&queue->hq_lock);
+	mutex_lock(hq_lock);
 	/* Set packet type to valid so CERT can process it */
 	pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
 
 	/* Mark this slot as ready in driver tracking */
-	queue->hq_complete.hqc_mem[slot_idx] = ERT_CMD_STATE_SUBMITTED;
+	hq_complete->hqc_mem[slot_idx] = ERT_CMD_STATE_SUBMITTED;
 
 	/* Advance write_index as far as possible through all ready slots. */
-	while (header->write_index < queue->reserved_write_index) {
+	while (header->write_index < *reserved_write_index) {
 		u32 next_idx = header->write_index % capacity;
-		enum ert_cmd_state state = queue->hq_complete.hqc_mem[next_idx];
+		enum ert_cmd_state state = hq_complete->hqc_mem[next_idx];
 
 		if (state != ERT_CMD_STATE_SUBMITTED)
 			break;
@@ -222,7 +230,35 @@ static void hsa_queue_commit_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_p
 		header->write_index++;
 	}
 
-	mutex_unlock(&queue->hq_lock);
+	mutex_unlock(hq_lock);
+}
+
+/*
+ * Reserve the next free slot of the queue described by the arguments.
+ * Wraps ve2_queue_reserve_slot() and converts its ERR_PTR() result:
+ * on success returns the packet, stores its sequence index in *seq and sets
+ * *err to 0; on failure returns NULL and stores the negative errno in *err.
+ */
+static void *ve2_get_queue_pkt(struct amdxdna_ctx *hwctx, u64 *seq, int *err, const char *qname,
+			       struct mutex *hq_lock, u64 *reserved_write_index,
+			       struct host_queue_header *header,
+			       struct ve2_hq_complete *hq_complete,
+			       struct host_queue_packet *hq_entry)
+{
+	struct amdxdna_dev *xdna = hwctx->client->xdna;
+	struct host_queue_packet *slot_pkt;
+
+	slot_pkt = ve2_queue_reserve_slot(xdna, qname, hq_lock, reserved_write_index,
+					  header, hq_complete, hq_entry, seq);
+	if (!IS_ERR(slot_pkt)) {
+		*err = 0;
+		return slot_pkt;
+	}
+
+	*err = PTR_ERR(slot_pkt);
+	/* Expected during retry - use DBG level */
+	XDNA_DBG(xdna, "%s Queue: No slot available (err=%d)", qname, *err);
+	return NULL;
 }
 
 static void ve2_job_release(struct kref *ref)
@@ -342,46 +378,28 @@ static inline void ve2_hwctx_job_release(struct amdxdna_ctx *hwctx, struct amdxd
 	mutex_unlock(&hwctx->priv->hwctx_hsa_queue.hq_lock);
 }
 
-static inline struct host_queue_packet *hsa_queue_get_pkt(struct hsa_queue *queue, u64 slot)
+static inline struct host_queue_packet *ve2_queue_get_pkt(struct host_queue_packet *hq_entry,
+							  u32 capacity, u64 slot)
 {
-	return &queue->hq_entry[slot & (queue->hq_header.capacity - 1)];
+	return &hq_entry[slot & (capacity - 1)];
 }
 
-static void *get_host_queue_pkt(struct amdxdna_ctx *hwctx, u64 *seq, int *err)
-{
-	struct amdxdna_dev *xdna = hwctx->client->xdna;
-	struct host_queue_packet *pkt;
-
-	pkt = hsa_queue_reserve_slot(xdna, hwctx->priv, seq);
-	if (IS_ERR(pkt)) {
-		*err = PTR_ERR(pkt);
-		/* Expected during retry - use DBG level */
-		XDNA_DBG(xdna, "No slot available in Host queue (err=%d)", *err);
-		return NULL;
-	}
-
-	*err = 0;
-	return pkt;
-}
-
-static inline void hsa_queue_pkt_set_invalid(struct host_queue_packet *pkt)
+static inline void ve2_queue_pkt_set_invalid(struct host_queue_packet *pkt)
 {
 	pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_INVALID;
 }
 
-static void ve2_free_hsa_queue(struct amdxdna_dev *xdna, struct ve2_hsa_queue *queue)
+static void ve2_free_queue(struct amdxdna_dev *xdna, const char *qname, void **queue_p,
+			   struct device **alloc_dev, dma_addr_t *dma_addr, struct mutex *hq_lock,
+			   size_t alloc_size)
 {
-	if (queue->hsa_queue_p) {
-		XDNA_DBG(xdna, "Freeing host queue: dma_addr=0x%llx",
-			 queue->hsa_queue_mem.dma_addr);
-		dma_free_coherent(queue->alloc_dev,
-				  sizeof(struct hsa_queue) + sizeof(u64) * HOST_QUEUE_ENTRY,
-				  queue->hsa_queue_p,
-				  queue->hsa_queue_mem.dma_addr);
-		queue->hsa_queue_p = NULL;
-		queue->hsa_queue_mem.dma_addr = 0;
-		queue->alloc_dev = NULL;
-		mutex_destroy(&queue->hq_lock);
+	if (*queue_p) {
+		XDNA_DBG(xdna, "Freeing %s queue: dma_addr=0x%llx", qname, *dma_addr);
+		dma_free_coherent(*alloc_dev, alloc_size, *queue_p, *dma_addr);
+		*queue_p = NULL;
+		*dma_addr = 0;
+		*alloc_dev = NULL;
+		mutex_destroy(hq_lock);
 	}
 }
 
@@ -470,6 +488,157 @@ void packet_dump(struct amdxdna_dev *xdna, struct hsa_queue *queue, u64 slot_id)
 	}
 }
 
+/*
+ * Allocate and initialise the debug queue of a hardware context.
+ * Queue header, packet slots and the per-slot completion words share one
+ * coherent allocation. CMA regions enabled in the context's mem_bitmap are
+ * tried in index order; the platform device is the fallback.
+ * Returns 0 on success, -ENOMEM if no allocation succeeds.
+ */
+static int ve2_create_dbg_queue(struct amdxdna_dev *xdna, struct amdxdna_ctx *hwctx,
+				struct ve2_dbg_queue *queue)
+{
+	struct platform_device *pdev = to_platform_device(xdna->ddev.dev);
+	int nslots = HOST_QUEUE_ENTRY;
+	size_t alloc_size = sizeof(struct dbg_queue) + sizeof(u64) * nslots;
+	struct dbg_queue *dq;
+	dma_addr_t dma_handle;
+	int region, idx;
+
+	XDNA_DBG(xdna, "Creating dbg queue: nslots=%d, alloc_size=%zu", nslots, alloc_size);
+
+	for (region = 0; region < MAX_MEM_REGIONS; region++) {
+		struct device *cma_dev = xdna->cma_region_devs[region];
+
+		if (!cma_dev || !(hwctx->priv->mem_bitmap & (1U << region)))
+			continue;
+		queue->dbg_queue_p = dma_alloc_coherent(cma_dev, alloc_size, &dma_handle,
+							GFP_KERNEL);
+		if (queue->dbg_queue_p) {
+			queue->alloc_dev = cma_dev;
+			break;
+		}
+	}
+
+	/* No region could serve the request: fall back to the default device. */
+	if (!queue->dbg_queue_p) {
+		queue->dbg_queue_p = dma_alloc_coherent(&pdev->dev, alloc_size, &dma_handle,
+							GFP_KERNEL);
+		if (!queue->dbg_queue_p) {
+			XDNA_ERR(xdna, "Failed to allocate dbg queue memory, size=%zu", alloc_size);
+			return -ENOMEM;
+		}
+		queue->alloc_dev = &pdev->dev;
+	}
+	dq = queue->dbg_queue_p;
+	memset(dq, 0, alloc_size);
+
+	mutex_init(&queue->hq_lock);
+	queue->reserved_write_index = 0;
+	queue->dbg_queue_mem.dma_addr = dma_handle;
+
+	/* Completion words follow struct dbg_queue; data_address is just past the header. */
+	queue->hq_complete.hqc_mem = (u64 *)((char *)dq + sizeof(*dq));
+	queue->hq_complete.hqc_dma_addr = queue->dbg_queue_mem.dma_addr + sizeof(*dq);
+	dq->hq_header.data_address = queue->dbg_queue_mem.dma_addr +
+		sizeof(struct host_queue_header);
+
+	WARN_ON(!is_power_of_2(nslots));
+	dq->hq_header.capacity = nslots;
+	for (idx = 0; idx < nslots; idx++)
+		ve2_queue_pkt_set_invalid(ve2_queue_get_pkt(dq->hq_entry,
+							    dq->hq_header.capacity, idx));
+
+	XDNA_DBG(xdna, "Created dbg queue: dma_addr=0x%llx, capacity=%d, data_addr=0x%llx",
+		 queue->dbg_queue_mem.dma_addr, nslots, dq->hq_header.data_address);
+	return 0;
+}
+
+/*
+ * Submit one command to the context's debug queue and wait until read_index
+ * (presumably advanced by the firmware) has moved past it. @opcode goes into
+ * the packet header; @aie_addr, @paddr and @length form the rw_mem payload.
+ * Returns 0, -EINVAL (no debug queue), the slot reservation error,
+ * -ETIMEDOUT after 5 s, or the negative result of an interrupted wait.
+ */
+int submit_command_to_dbg_queue(struct amdxdna_ctx *hwctx, u32 opcode, u32 aie_addr, u64 paddr,
+				u32 length)
+{
+	struct amdxdna_ctx_priv *ve2_ctx = hwctx->priv;
+	struct amdxdna_dev *xdna = hwctx->client->xdna;
+	struct host_queue_header *header;
+	struct ve2_dbg_queue *dbg_queue;
+	struct xrt_packet_header *hdr;
+	struct host_queue_packet *pkt;
+	struct rw_mem *ebp;
+	long wait_ret;
+	u64 slot_id, seq = 0;
+	int err;
+
+	if (!ve2_ctx || !ve2_ctx->hwctx_dbg_queue.dbg_queue_p) {
+		XDNA_ERR(xdna, "Debug queue is not initialized");
+		return -EINVAL;
+	}
+	dbg_queue = &ve2_ctx->hwctx_dbg_queue;
+	header = &dbg_queue->dbg_queue_p->hq_header;
+	pkt = ve2_get_queue_pkt(hwctx, &seq, &err, "DBG", &dbg_queue->hq_lock,
+				&dbg_queue->reserved_write_index, header,
+				&dbg_queue->hq_complete, dbg_queue->dbg_queue_p->hq_entry);
+	if (!pkt) {
+		XDNA_ERR(xdna, "Getting host queue packet failed (err=%d)", err);
+		return err;
+	}
+	XDNA_DBG(xdna, "pkt %p of slot %llx is selected", (void *)pkt, seq);
+
+	/* @seq grows monotonically; slot_id is its position inside the ring. */
+	slot_id = seq & (header->capacity - 1);
+	hdr = &pkt->xrt_header;
+	hdr->common_header.opcode = opcode;
+	hdr->completion_signal =
+		(u64)(dbg_queue->hq_complete.hqc_dma_addr + slot_id * sizeof(u64));
+	XDNA_DBG(xdna, "Debug Queue packet opcode: %u", pkt->xrt_header.common_header.opcode);
+	hdr->common_header.count = sizeof(struct rw_mem);
+	hdr->common_header.distribute = 0;
+	hdr->common_header.indirect = 0;
+
+	ebp = (struct rw_mem *)pkt->data;
+	ebp->aie_addr = aie_addr;
+	ebp->host_addr_high = upper_32_bits(paddr);
+	ebp->host_addr_low = lower_32_bits(paddr);
+	ebp->length = length;
+
+	ve2_queue_commit_slot(&dbg_queue->hq_lock, &dbg_queue->reserved_write_index, header,
+			      dbg_queue->dbg_queue_p->hq_entry, &dbg_queue->hq_complete, seq);
+
+	/*
+	 * Wait for read_index to pass this packet. read_index == write_index is
+	 * not enough: write_index only advances through contiguous committed
+	 * slots, so an earlier uncommitted reservation makes it true at once.
+	 */
+	wait_ret = wait_event_interruptible_timeout(ve2_ctx->dbg_q_waitq,
+						    header->read_index > seq,
+						    msecs_to_jiffies(5000));
+	if (wait_ret > 0) {
+		XDNA_DBG(xdna, "After command submission write_index is %llx, read_index is %llx",
+			 header->write_index, header->read_index);
+		err = 0;
+	} else if (wait_ret == 0) {
+		XDNA_ERR(xdna, "DBG Queue command wait timeout");
+		err = -ETIMEDOUT;
+	} else {
+		XDNA_ERR(xdna, "DBG Queue command wait interrupted");
+		err = wait_ret;
+	}
+
+	/* Release the slot for reuse on every outcome (success, timeout, interruption). */
+	mutex_lock(&dbg_queue->hq_lock);
+	dbg_queue->hq_complete.hqc_mem[slot_id] = ERT_CMD_STATE_INVALID;
+	ve2_queue_pkt_set_invalid(pkt);
+	mutex_unlock(&dbg_queue->hq_lock);
+
+	return err;
+}
+
 /*
  * Create hsa queue in kernel and initialize queue slots.
  */
@@ -511,7 +680,7 @@ static int ve2_create_host_queue(struct amdxdna_dev *xdna, struct amdxdna_ctx *h
 		}
 		queue->alloc_dev = xdna->ddev.dev;
 	}
-
+	memset(queue->hsa_queue_p, 0, alloc_size);
 	/* Initialize mutex here */
 	mutex_init(&queue->hq_lock);
 	/* Initialize reserved_write_index to track slot reservations */
@@ -532,7 +701,9 @@ static int ve2_create_host_queue(struct amdxdna_dev *xdna, struct amdxdna_ctx *h
 	for (int slot = 0; slot < nslots; slot++) {
 		struct host_queue_indirect_hdr *hdr = &queue->hsa_queue_p->hq_indirect_hdr[slot];
 
-		hsa_queue_pkt_set_invalid(hsa_queue_get_pkt(queue->hsa_queue_p, slot));
+		ve2_queue_pkt_set_invalid(ve2_queue_get_pkt(queue->hsa_queue_p->hq_entry,
+							    queue->hsa_queue_p->hq_header.capacity,
+							    slot));
 		hdr->header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
 		hdr->header.opcode = HOST_QUEUE_PACKET_EXEC_BUF;
 		hdr->header.count = 0;
@@ -571,14 +742,19 @@ static int submit_command_indirect(struct amdxdna_ctx *hwctx, void *cmd_data, u6
 	u64 slot_id = 0;
 
 	dpu = (struct ve2_dpu_data *)cmd_data;
+	hq_queue = (struct ve2_hsa_queue *)&ve2_ctx->hwctx_hsa_queue;
 
-	pkt = hsa_queue_reserve_slot(xdna, ve2_ctx, &slot_id);
+	pkt = ve2_queue_reserve_slot(xdna, "HSA", &hq_queue->hq_lock,
+				     &hq_queue->reserved_write_index,
+				     &hq_queue->hsa_queue_p->hq_header,
+				     &hq_queue->hq_complete,
+				     hq_queue->hsa_queue_p->hq_entry,
+				     &slot_id);
 	if (IS_ERR(pkt)) {
 		XDNA_DBG(xdna, "No slot available in Host queue");
 		return PTR_ERR(pkt);
 	}
 
-	hq_queue = (struct ve2_hsa_queue *)&ve2_ctx->hwctx_hsa_queue;
 	queue = (struct hsa_queue *)hq_queue->hsa_queue_p;
 
 	*seq = slot_id;
@@ -647,7 +823,9 @@ static int submit_command_indirect(struct amdxdna_ctx *hwctx, void *cmd_data, u6
 		packet_dump(xdna, queue, slot_id);
 
 	/* Commit the slot - this sets hqc_mem to SUBMITTED and advances write_index */
-	hsa_queue_commit_slot(xdna, ve2_ctx, *seq);
+	ve2_queue_commit_slot(&hq_queue->hq_lock, &hq_queue->reserved_write_index,
+			      &hq_queue->hsa_queue_p->hq_header, hq_queue->hsa_queue_p->hq_entry,
+			      &hq_queue->hq_complete, *seq);
 
 	return 0;
 }
@@ -669,7 +847,14 @@ static int submit_command(struct amdxdna_ctx *hwctx, void *cmd_data, u64 *seq, b
 		return -EINVAL;
 	}
 
-	pkt = (struct host_queue_packet *)get_host_queue_pkt(hwctx, &slot_id, &err);
+	pkt = (struct host_queue_packet *)ve2_get_queue_pkt(hwctx, &slot_id, &err,
+		"HSA",
+		&hq_queue->hq_lock,
+		&hq_queue->reserved_write_index,
+		&hq_queue->hsa_queue_p->hq_header,
+		&hq_queue->hq_complete,
+		hq_queue->hsa_queue_p->hq_entry);
+
 	if (!pkt) {
 		/* Expected during retry - use DBG level */
 		XDNA_DBG(xdna, "Getting host queue packet failed (err=%d)", err);
@@ -685,8 +870,7 @@ static int submit_command(struct amdxdna_ctx *hwctx, void *cmd_data, u64 *seq, b
 	hdr->common_header.chain_flag = last_cmd ? LAST_CMD : NOT_LAST_CMD;
 	hdr->completion_signal =
 		(u64)(hq_queue->hq_complete.hqc_dma_addr + slot_id * sizeof(u64));
-#define XRT_PKT_OPCODE(p) ((p)->xrt_header.common_header.opcode)
-	XDNA_DBG(xdna, "Queue packet opcode: %u\n", XRT_PKT_OPCODE(pkt));
+	XDNA_DBG(xdna, "Queue packet opcode: %u", pkt->xrt_header.common_header.opcode);
 
 	hdr->common_header.count = sizeof(struct exec_buf);
 	hdr->common_header.distribute = 0;
@@ -706,7 +890,9 @@ static int submit_command(struct amdxdna_ctx *hwctx, void *cmd_data, u64 *seq, b
 	XDNA_DBG(xdna, "dpu instruction addr: 0x%llx", dpu_cmd->instruction_buffer);
 
 	/* Commit the slot - this sets hqc_mem to SUBMITTED and advances write_index */
-	hsa_queue_commit_slot(xdna, ve2_ctx, *seq);
+	ve2_queue_commit_slot(&hq_queue->hq_lock, &hq_queue->reserved_write_index,
+			      &hq_queue->hsa_queue_p->hq_header,
+			      hq_queue->hsa_queue_p->hq_entry, &hq_queue->hq_complete, *seq);
 
 	return 0;
 }
@@ -1216,6 +1402,17 @@ static void timeout_cb(struct timer_list *t)
 	mod_timer(&priv->event_timer, jiffies + CTX_TIMER);
 }
 
+/* Timer callback: wake debug-queue waiters and re-arm while the debug queue exists. */
+static void dbg_q_timeout_cb(struct timer_list *t)
+{
+	struct amdxdna_ctx_priv *priv = from_timer(priv, t, dbg_q_timer);
+
+	if (priv && priv->hwctx_dbg_queue.dbg_queue_p) {
+		wake_up_interruptible(&priv->dbg_q_waitq);
+		mod_timer(&priv->dbg_q_timer, jiffies + CTX_TIMER);
+	}
+}
+
 static void ve2_clear_firmware_status(struct amdxdna_dev *xdna, struct amdxdna_ctx *hwctx)
 {
 	struct amdxdna_ctx_priv *priv = hwctx->priv;
@@ -1263,7 +1460,6 @@ int ve2_hwctx_init(struct amdxdna_ctx *hwctx)
 		XDNA_ERR(xdna, "Failed to create host queue, ret=%d", ret);
 		goto cleanup_xrs;
 	}
-
 	if (enable_polling) {
 		XDNA_DBG(xdna, "Running in timer mode");
 		timer_setup(&priv->event_timer, timeout_cb, 0);
@@ -1272,6 +1468,18 @@ int ve2_hwctx_init(struct amdxdna_ctx *hwctx)
 		XDNA_DBG(xdna, "Running in interrupt mode");
 	}
 
+	if (enable_debug_queue) {
+		/* one dbg_queue entry per hwctx */
+		ret = ve2_create_dbg_queue(xdna, hwctx, &priv->hwctx_dbg_queue);
+		if (ret) {
+			XDNA_ERR(xdna, "Failed to create dbg queue, ret=%d", ret);
+			goto free_hsa_queue;
+		}
+		init_waitqueue_head(&priv->dbg_q_waitq);
+		timer_setup(&priv->dbg_q_timer, dbg_q_timeout_cb, 0);
+		mod_timer(&priv->dbg_q_timer, jiffies + CTX_TIMER);
+	}
+
 	if (verbosity >= VERBOSITY_LEVEL_DBG)
 		ve2_clear_firmware_status(xdna, hwctx);
 
@@ -1284,6 +1492,12 @@ int ve2_hwctx_init(struct amdxdna_ctx *hwctx)
 
 	return 0;
 
+free_hsa_queue:
+	ve2_free_queue(xdna, "HSA", (void **)&hwctx->priv->hwctx_hsa_queue.hsa_queue_p,
+		       &hwctx->priv->hwctx_hsa_queue.alloc_dev,
+		       &hwctx->priv->hwctx_hsa_queue.hsa_queue_mem.dma_addr,
+		       &hwctx->priv->hwctx_hsa_queue.hq_lock,
+		       sizeof(struct hsa_queue) + sizeof(u64) * HOST_QUEUE_ENTRY);
 cleanup_xrs:
 	/* Releases XRS and partition (ve2_mgmt_destroy_partition calls ve2_xrs_release). */
 	ve2_mgmt_destroy_partition(hwctx);
@@ -1307,6 +1521,9 @@ void ve2_hwctx_fini(struct amdxdna_ctx *hwctx)
 		 hwctx, nhwctx->start_col, nhwctx->num_col,
 		 hwctx->submitted, hwctx->completed);
 
+	if (enable_debug_queue && hwctx->priv)
+		del_timer_sync(&hwctx->priv->dbg_q_timer);
+
 	if (enable_polling)
 		del_timer_sync(&hwctx->priv->event_timer);
 
@@ -1354,7 +1571,19 @@ void ve2_hwctx_fini(struct amdxdna_ctx *hwctx)
 		ve2_get_firmware_status(hwctx);
 
 	ve2_mgmt_destroy_partition(hwctx);
-	ve2_free_hsa_queue(xdna, &hwctx->priv->hwctx_hsa_queue);
+	ve2_free_queue(xdna, "HSA", (void **)&hwctx->priv->hwctx_hsa_queue.hsa_queue_p,
+		       &hwctx->priv->hwctx_hsa_queue.alloc_dev,
+		       &hwctx->priv->hwctx_hsa_queue.hsa_queue_mem.dma_addr,
+		       &hwctx->priv->hwctx_hsa_queue.hq_lock,
+		       sizeof(struct hsa_queue) + sizeof(u64) * HOST_QUEUE_ENTRY);
+	if (enable_debug_queue) {
+		ve2_free_queue(xdna, "DBG", (void **)&hwctx->priv->hwctx_dbg_queue.dbg_queue_p,
+			       &hwctx->priv->hwctx_dbg_queue.alloc_dev,
+			       &hwctx->priv->hwctx_dbg_queue.dbg_queue_mem.dma_addr,
+			       &hwctx->priv->hwctx_dbg_queue.hq_lock,
+			       sizeof(struct dbg_queue) + sizeof(u64) * HOST_QUEUE_ENTRY);
+	}
+
 	kfree(hwctx->priv->hwctx_config);
 	mutex_destroy(&hwctx->priv->privctx_lock);
 	kfree(hwctx->priv);
diff --git a/src/driver/amdxdna/ve2_mgmt.c b/src/driver/amdxdna/ve2_mgmt.c
index 1c787f8b9..869010e3e 100644
--- a/src/driver/amdxdna/ve2_mgmt.c
+++ b/src/driver/amdxdna/ve2_mgmt.c
@@ -30,10 +30,15 @@ static void cert_setup_partition(struct amdxdna_dev *xdna,
 	u32 start_col = nhwctx->start_col;
 	u32 num_col = nhwctx->num_col;
 	u64 hsa_addr = 0xFFFFFFFFFFFFFFFF;
+	u64 dbg_addr = 0xFFFFFFFFFFFFFFFF;
+
 	struct ve2_config_hwctx *hwctx_cfg = &nhwctx->hwctx_config[col];
 
-	if (col == 0)
+	if (col == 0) {
 		hsa_addr = nhwctx->hwctx_hsa_queue.hsa_queue_mem.dma_addr;
+		if (enable_debug_queue)
+			dbg_addr = nhwctx->hwctx_dbg_queue.dbg_queue_mem.dma_addr;
+	}
 
 	u32 lead_col_addr = VE2_ADDR(start_col, 0, 0);
 
@@ -41,6 +46,8 @@ static void cert_setup_partition(struct amdxdna_dev *xdna,
 	cert_hs->aie_info.partition_size = num_col;
 	cert_hs->hsa_addr_high =  upper_32_bits(hsa_addr);
 	cert_hs->hsa_addr_low =  lower_32_bits(hsa_addr);
+	cert_hs->dbg.hsa_addr_high =  upper_32_bits(dbg_addr);
+	cert_hs->dbg.hsa_addr_low =  lower_32_bits(dbg_addr);
 	cert_hs->log_addr_high = upper_32_bits(hwctx_cfg->log_buf_addr);
 	cert_hs->log_addr_low = lower_32_bits(hwctx_cfg->log_buf_addr);
 	cert_hs->log_buf_size = hwctx_cfg->log_buf_size;
@@ -57,8 +64,6 @@ static void cert_setup_partition(struct amdxdna_dev *xdna,
 
 	cert_hs->ctx_switch_req = 0;
 	cert_hs->hsa_location = 0;
-	cert_hs->dbg.hsa_addr_high = 0xFFFFFFFF;
-	cert_hs->dbg.hsa_addr_low = 0xFFFFFFFF;
 	cert_hs->mpaie_alive = ALIVE_MAGIC;
 }
 
diff --git a/src/driver/amdxdna/ve2_mgmt.h b/src/driver/amdxdna/ve2_mgmt.h
index 54c0791d0..bdbcaaa1e 100644
--- a/src/driver/amdxdna/ve2_mgmt.h
+++ b/src/driver/amdxdna/ve2_mgmt.h
@@ -66,6 +66,11 @@ struct misc_info {
 	u32 ppc;
 };
 
+extern int enable_debug_queue;
+
+int submit_command_to_dbg_queue(struct amdxdna_ctx *hwctx, u32 opcode,
+				u32 aie_addr, u64 paddr, u32 length);
+
 // Read from handshake memory
 static inline int
 ve2_partition_read_privileged_mem(struct device *aie_dev, u32 col,
diff --git a/src/driver/amdxdna/ve2_of.h b/src/driver/amdxdna/ve2_of.h
index 2b69388ef..0c89a1cb7 100644
--- a/src/driver/amdxdna/ve2_of.h
+++ b/src/driver/amdxdna/ve2_of.h
@@ -89,6 +89,9 @@ struct amdxdna_ctx_priv {
 	struct device			*aie_dev;
 	struct aie_partition_init_args	*args;
 	struct ve2_hsa_queue		hwctx_hsa_queue;
+	struct ve2_dbg_queue		hwctx_dbg_queue;
+	struct timer_list		dbg_q_timer;
+	wait_queue_head_t		dbg_q_waitq;
 	struct ve2_config_hwctx		*hwctx_config;
 	wait_queue_head_t		waitq;
 	struct amdxdna_sched_job	*pending[HWCTX_MAX_CMDS];