From abb7e16fb6dfc32d0c0e63787409fdb4c348915c Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Mon, 16 Sep 2019 09:16:29 +0300
Subject: [PATCH 01/31] habanalabs: handle F/W failure for sensor
 initialization

In case the F/W fails to initialize the thermal sensors, print an
appropriate error message to kernel log and fail the device
initialization.

Reviewed-by: Tomer Tayar <ttayar@habana.ai>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c          | 5 +++++
 drivers/misc/habanalabs/include/hl_boot_if.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 6fba14b81f90..09caef7642fd 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2328,6 +2328,11 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 				"ARM status %d - u-boot stopped by user\n",
 				status);
 			break;
+		case CPU_BOOT_STATUS_TS_INIT_FAIL:
+			dev_err(hdev->dev,
+				"ARM status %d - Thermal Sensor initialization failed\n",
+				status);
+			break;
 		default:
 			dev_err(hdev->dev,
 				"ARM status %d - Invalid status code\n",
diff --git a/drivers/misc/habanalabs/include/hl_boot_if.h b/drivers/misc/habanalabs/include/hl_boot_if.h
index 4cd04c090285..2853a2de8cf6 100644
--- a/drivers/misc/habanalabs/include/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/hl_boot_if.h
@@ -20,6 +20,8 @@ enum cpu_boot_status {
 	CPU_BOOT_STATUS_DRAM_INIT_FAIL,
 	CPU_BOOT_STATUS_FIT_CORRUPTED,
 	CPU_BOOT_STATUS_UBOOT_NOT_READY,
+	CPU_BOOT_STATUS_RESERVED,
+	CPU_BOOT_STATUS_TS_INIT_FAIL,
 };
 
 enum kmd_msg {

From 1e295d4dd5b2e3d9dab68bfcab1f5666cde3804d Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sat, 28 Sep 2019 12:18:04 +0800
Subject: [PATCH 02/31] habanalabs: remove set but not used variable 'ctx'

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/misc/habanalabs/device.c: In function hpriv_release:
drivers/misc/habanalabs/device.c:45:17: warning: variable ctx set but not used [-Wunused-but-set-variable]

It is never used since commit eb7caf84b029 ("habanalabs:
maintain a list of file private data objects")

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/device.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 459fee70a597..2f5a4da707e7 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -42,12 +42,10 @@ static void hpriv_release(struct kref *ref)
 {
 	struct hl_fpriv *hpriv;
 	struct hl_device *hdev;
-	struct hl_ctx *ctx;
 
 	hpriv = container_of(ref, struct hl_fpriv, refcount);
 
 	hdev = hpriv->hdev;
-	ctx = hpriv->ctx;
 
 	put_pid(hpriv->taskpid);
 

From f435614ff55c6783919028cb914ffd7422e0b03b Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Wed, 2 Oct 2019 13:53:52 +0000
Subject: [PATCH 03/31] habanalabs: Fix typos

s/paerser/parser/
s/requeusted/requested/
s/an JOB/a JOB/

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/habanalabs.h | 2 +-
 drivers/misc/habanalabs/hw_queue.c   | 4 ++--
 include/uapi/misc/habanalabs.h       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 75862be53c60..c3d24ffad9fa 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -774,7 +774,7 @@ struct hl_cs_job {
 };
 
 /**
- * struct hl_cs_parser - command submission paerser properties.
+ * struct hl_cs_parser - command submission parser properties.
  * @user_cb: the CB we got from the user.
  * @patched_cb: in case of patching, this is internal CB which is submitted on
  *		the queue instead of the CB we got from the IOCTL.
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index 55b383b2a116..f733b534f738 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -220,7 +220,7 @@ out:
 }
 
 /*
- * ext_hw_queue_schedule_job - submit an JOB to an external queue
+ * ext_hw_queue_schedule_job - submit a JOB to an external queue
  *
  * @job: pointer to the job that needs to be submitted to the queue
  *
@@ -278,7 +278,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
 }
 
 /*
- * int_hw_queue_schedule_job - submit an JOB to an internal queue
+ * int_hw_queue_schedule_job - submit a JOB to an internal queue
  *
  * @job: pointer to the job that needs to be submitted to the queue
  *
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 39c4ea51a719..53e4ff73578e 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -589,7 +589,7 @@ struct hl_debug_args {
  *
  * The user can call this IOCTL with a handle it received from the CS IOCTL
  * to wait until the handle's CS has finished executing. The user will wait
- * inside the kernel until the CS has finished or until the user-requeusted
+ * inside the kernel until the CS has finished or until the user-requested
  * timeout has expired.
  *
  * The return value of the IOCTL is a standard Linux error code. The possible

From df762375f17e1765bc3a0b345378e1726d85ca75 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Thu, 3 Oct 2019 15:22:35 +0000
Subject: [PATCH 04/31] habanalabs: Mark queue as expecting CB handle or
 address

Jobs on some queues must be provided with a handle to a driver command
buffer object, while for other queues, jobs must be provided with an
address to a command buffer.
Currently the distinction is done based on the queue type, which is less
flexible if the same queue type behaves differently on different
types of ASICs.
This patch adds a new queue property for this target, which is
configured per queue type per ASIC type.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/command_submission.c | 4 +++-
 drivers/misc/habanalabs/goya/goya.c          | 3 +++
 drivers/misc/habanalabs/habanalabs.h         | 3 +++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index a9ac045dcfde..f44205540520 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -414,7 +414,9 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
 			"Queue index %d is restricted for the kernel driver\n",
 			chunk->queue_index);
 		return NULL;
-	} else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
+	}
+
+	if (!hw_queue_prop->requires_kernel_cb) {
 		*ext_queue = false;
 		return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
 	}
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 09caef7642fd..71693fcffb16 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -337,17 +337,20 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
 		prop->hw_queues_props[i].driver_only = 0;
+		prop->hw_queues_props[i].requires_kernel_cb = 1;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
 		prop->hw_queues_props[i].driver_only = 1;
+		prop->hw_queues_props[i].requires_kernel_cb = 0;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
 			NUMBER_OF_INT_HW_QUEUES; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
 		prop->hw_queues_props[i].driver_only = 0;
+		prop->hw_queues_props[i].requires_kernel_cb = 0;
 	}
 
 	for (; i < HL_MAX_QUEUES; i++)
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index c3d24ffad9fa..f47f4b22cb6b 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -98,10 +98,13 @@ enum hl_queue_type {
  * @type: queue type.
  * @driver_only: true if only the driver is allowed to send a job to this queue,
  *               false otherwise.
+ * @requires_kernel_cb: true if a CB handle must be provided for jobs on this
+ *                      queue, false otherwise (a CB address must be provided).
  */
 struct hw_queue_properties {
 	enum hl_queue_type	type;
 	u8			driver_only;
+	u8			requires_kernel_cb;
 };
 
 /**

From cb596aee8842c87605ea1a9062af2ab435a742d4 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Thu, 3 Oct 2019 15:22:36 +0000
Subject: [PATCH 05/31] habanalabs: Add a new H/W queue type

This patch adds a support for a new H/W queue type.
This type of queue is for DMA and compute engines jobs, for which
completion notification are sent by H/W.
Command buffer for this queue can be created either through the CB
IOCTL and using the retrieved CB handle, or by preparing a buffer on the
host or device SRAM/DRAM, and using the device address to that buffer.
The patch includes the handling of the 2 options, as well as the
initialization of the H/W queue and its jobs scheduling.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/command_submission.c | 120 ++++++---
 drivers/misc/habanalabs/goya/goya.c          |   4 +-
 drivers/misc/habanalabs/habanalabs.h         |  24 +-
 drivers/misc/habanalabs/hw_queue.c           | 249 +++++++++++++++----
 drivers/misc/habanalabs/include/qman_if.h    |  12 +
 5 files changed, 308 insertions(+), 101 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index f44205540520..776ddafc47fb 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -65,6 +65,18 @@ static void cs_put(struct hl_cs *cs)
 	kref_put(&cs->refcount, cs_do_release);
 }
 
+static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
+{
+	/*
+	 * Patched CB is created for external queues jobs, and for H/W queues
+	 * jobs if the user CB was allocated by driver and MMU is disabled.
+	 */
+	return (job->queue_type == QUEUE_TYPE_EXT ||
+			(job->queue_type == QUEUE_TYPE_HW &&
+					job->is_kernel_allocated_cb &&
+					!hdev->mmu_enable));
+}
+
 /*
  * cs_parser - parse the user command submission
  *
@@ -91,11 +103,13 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
 	parser.patched_cb = NULL;
 	parser.user_cb = job->user_cb;
 	parser.user_cb_size = job->user_cb_size;
-	parser.ext_queue = job->ext_queue;
+	parser.queue_type = job->queue_type;
+	parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
 	job->patched_cb = NULL;
 
 	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
-	if (job->ext_queue) {
+
+	if (is_cb_patched(hdev, job)) {
 		if (!rc) {
 			job->patched_cb = parser.patched_cb;
 			job->job_cb_size = parser.patched_cb_size;
@@ -124,7 +138,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 {
 	struct hl_cs *cs = job->cs;
 
-	if (job->ext_queue) {
+	if (is_cb_patched(hdev, job)) {
 		hl_userptr_delete_list(hdev, &job->userptr_list);
 
 		/*
@@ -140,6 +154,19 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 		}
 	}
 
+	/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
+	 * enabled, the user CB isn't released in cs_parser() and thus should be
+	 * released here.
+	 */
+	if (job->queue_type == QUEUE_TYPE_HW &&
+			job->is_kernel_allocated_cb && hdev->mmu_enable) {
+		spin_lock(&job->user_cb->lock);
+		job->user_cb->cs_cnt--;
+		spin_unlock(&job->user_cb->lock);
+
+		hl_cb_put(job->user_cb);
+	}
+
 	/*
 	 * This is the only place where there can be multiple threads
 	 * modifying the list at the same time
@@ -150,7 +177,8 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 
 	hl_debugfs_remove_job(hdev, job);
 
-	if (job->ext_queue)
+	if (job->queue_type == QUEUE_TYPE_EXT ||
+			job->queue_type == QUEUE_TYPE_HW)
 		cs_put(cs);
 
 	kfree(job);
@@ -387,18 +415,13 @@ static void job_wq_completion(struct work_struct *work)
 	free_job(hdev, job);
 }
 
-static struct hl_cb *validate_queue_index(struct hl_device *hdev,
-					struct hl_cb_mgr *cb_mgr,
-					struct hl_cs_chunk *chunk,
-					bool *ext_queue)
+static int validate_queue_index(struct hl_device *hdev,
+				struct hl_cs_chunk *chunk,
+				enum hl_queue_type *queue_type,
+				bool *is_kernel_allocated_cb)
 {
 	struct asic_fixed_properties *asic = &hdev->asic_prop;
 	struct hw_queue_properties *hw_queue_prop;
-	u32 cb_handle;
-	struct hl_cb *cb;
-
-	/* Assume external queue */
-	*ext_queue = true;
 
 	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
 
@@ -406,22 +429,29 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
 			(hw_queue_prop->type == QUEUE_TYPE_NA)) {
 		dev_err(hdev->dev, "Queue index %d is invalid\n",
 			chunk->queue_index);
-		return NULL;
+		return -EINVAL;
 	}
 
 	if (hw_queue_prop->driver_only) {
 		dev_err(hdev->dev,
 			"Queue index %d is restricted for the kernel driver\n",
 			chunk->queue_index);
-		return NULL;
+		return -EINVAL;
 	}
 
-	if (!hw_queue_prop->requires_kernel_cb) {
-		*ext_queue = false;
-		return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
-	}
+	*queue_type = hw_queue_prop->type;
+	*is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
+
+	return 0;
+}
+
+static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
+					struct hl_cb_mgr *cb_mgr,
+					struct hl_cs_chunk *chunk)
+{
+	struct hl_cb *cb;
+	u32 cb_handle;
 
-	/* Retrieve CB object */
 	cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
 
 	cb = hl_cb_get(hdev, cb_mgr, cb_handle);
@@ -446,7 +476,8 @@ release_cb:
 	return NULL;
 }
 
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+		enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
 {
 	struct hl_cs_job *job;
 
@@ -454,12 +485,14 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
 	if (!job)
 		return NULL;
 
-	job->ext_queue = ext_queue;
+	job->queue_type = queue_type;
+	job->is_kernel_allocated_cb = is_kernel_allocated_cb;
 
-	if (job->ext_queue) {
+	if (is_cb_patched(hdev, job))
 		INIT_LIST_HEAD(&job->userptr_list);
+
+	if (job->queue_type == QUEUE_TYPE_EXT)
 		INIT_WORK(&job->finish_work, job_wq_completion);
-	}
 
 	return job;
 }
@@ -472,7 +505,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	struct hl_cs_job *job;
 	struct hl_cs *cs;
 	struct hl_cb *cb;
-	bool ext_queue_present = false;
+	bool int_queues_only = true;
 	u32 size_to_copy;
 	int rc, i, parse_cnt;
 
@@ -516,23 +549,33 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	/* Validate ALL the CS chunks before submitting the CS */
 	for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
 		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
-		bool ext_queue;
+		enum hl_queue_type queue_type;
+		bool is_kernel_allocated_cb;
 
-		cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
-					&ext_queue);
-		if (ext_queue) {
-			ext_queue_present = true;
+		rc = validate_queue_index(hdev, chunk, &queue_type,
+						&is_kernel_allocated_cb);
+		if (rc)
+			goto free_cs_object;
+
+		if (is_kernel_allocated_cb) {
+			cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
 			if (!cb) {
 				rc = -EINVAL;
 				goto free_cs_object;
 			}
+		} else {
+			cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
 		}
 
-		job = hl_cs_allocate_job(hdev, ext_queue);
+		if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
+			int_queues_only = false;
+
+		job = hl_cs_allocate_job(hdev, queue_type,
+						is_kernel_allocated_cb);
 		if (!job) {
 			dev_err(hdev->dev, "Failed to allocate a new job\n");
 			rc = -ENOMEM;
-			if (ext_queue)
+			if (is_kernel_allocated_cb)
 				goto release_cb;
 			else
 				goto free_cs_object;
@@ -542,7 +585,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		job->cs = cs;
 		job->user_cb = cb;
 		job->user_cb_size = chunk->cb_size;
-		if (job->ext_queue)
+		if (is_kernel_allocated_cb)
 			job->job_cb_size = cb->size;
 		else
 			job->job_cb_size = chunk->cb_size;
@@ -555,10 +598,11 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		/*
 		 * Increment CS reference. When CS reference is 0, CS is
 		 * done and can be signaled to user and free all its resources
-		 * Only increment for JOB on external queues, because only
-		 * for those JOBs we get completion
+		 * Only increment for JOB on external or H/W queues, because
+		 * only for those JOBs we get completion
 		 */
-		if (job->ext_queue)
+		if (job->queue_type == QUEUE_TYPE_EXT ||
+				job->queue_type == QUEUE_TYPE_HW)
 			cs_get(cs);
 
 		hl_debugfs_add_job(hdev, job);
@@ -572,9 +616,9 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		}
 	}
 
-	if (!ext_queue_present) {
+	if (int_queues_only) {
 		dev_err(hdev->dev,
-			"Reject CS %d.%llu because no external queues jobs\n",
+			"Reject CS %d.%llu because only internal queues jobs are present\n",
 			cs->ctx->asid, cs->sequence);
 		rc = -EINVAL;
 		goto free_cs_object;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 71693fcffb16..0b40915bede2 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -3943,7 +3943,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
 {
 	struct goya_device *goya = hdev->asic_specific;
 
-	if (!parser->ext_queue)
+	if (parser->queue_type == QUEUE_TYPE_INT)
 		return goya_parse_cb_no_ext_queue(hdev, parser);
 
 	if (goya->hw_cap_initialized & HW_CAP_MMU)
@@ -4614,7 +4614,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
 		lin_dma_pkt++;
 	} while (--lin_dma_pkts_cnt);
 
-	job = hl_cs_allocate_job(hdev, true);
+	job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
 	if (!job) {
 		dev_err(hdev->dev, "Failed to allocate a new job\n");
 		rc = -ENOMEM;
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index f47f4b22cb6b..371d1ec15697 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -85,12 +85,15 @@ struct hl_fpriv;
  * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's
  *			memories and/or operates the compute engines.
  * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU.
+ * @QUEUE_TYPE_HW: queue of DMA and compute engines jobs, for which completion
+ *                 notifications are sent by H/W.
  */
 enum hl_queue_type {
 	QUEUE_TYPE_NA,
 	QUEUE_TYPE_EXT,
 	QUEUE_TYPE_INT,
-	QUEUE_TYPE_CPU
+	QUEUE_TYPE_CPU,
+	QUEUE_TYPE_HW
 };
 
 /**
@@ -755,11 +758,14 @@ struct hl_cs {
  * @userptr_list: linked-list of userptr mappings that belong to this job and
  *			wait for completion.
  * @debugfs_list: node in debugfs list of command submission jobs.
+ * @queue_type: the type of the H/W queue this job is submitted to.
  * @id: the id of this job inside a CS.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
  * @user_cb_size: the actual size of the CB we got from the user.
  * @job_cb_size: the actual size of the CB that we put on the queue.
- * @ext_queue: whether the job is for external queue or internal queue.
+ * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
+ *                          handle to a kernel-allocated CB object, false
+ *                          otherwise (SRAM/DRAM/host address).
  */
 struct hl_cs_job {
 	struct list_head	cs_node;
@@ -769,11 +775,12 @@ struct hl_cs_job {
 	struct work_struct	finish_work;
 	struct list_head	userptr_list;
 	struct list_head	debugfs_list;
+	enum hl_queue_type	queue_type;
 	u32			id;
 	u32			hw_queue_id;
 	u32			user_cb_size;
 	u32			job_cb_size;
-	u8			ext_queue;
+	u8			is_kernel_allocated_cb;
 };
 
 /**
@@ -784,24 +791,28 @@ struct hl_cs_job {
  * @job_userptr_list: linked-list of userptr mappings that belong to the related
  *			job and wait for completion.
  * @cs_sequence: the sequence number of the related CS.
+ * @queue_type: the type of the H/W queue this job is submitted to.
  * @ctx_id: the ID of the context the related CS belongs to.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
  * @user_cb_size: the actual size of the CB we got from the user.
  * @patched_cb_size: the size of the CB after parsing.
- * @ext_queue: whether the job is for external queue or internal queue.
  * @job_id: the id of the related job inside the related CS.
+ * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
+ *                          handle to a kernel-allocated CB object, false
+ *                          otherwise (SRAM/DRAM/host address).
  */
 struct hl_cs_parser {
 	struct hl_cb		*user_cb;
 	struct hl_cb		*patched_cb;
 	struct list_head	*job_userptr_list;
 	u64			cs_sequence;
+	enum hl_queue_type	queue_type;
 	u32			ctx_id;
 	u32			hw_queue_id;
 	u32			user_cb_size;
 	u32			patched_cb_size;
-	u8			ext_queue;
 	u8			job_id;
+	u8			is_kernel_allocated_cb;
 };
 
 
@@ -1504,7 +1515,8 @@ int hl_cb_pool_init(struct hl_device *hdev);
 int hl_cb_pool_fini(struct hl_device *hdev);
 
 void hl_cs_rollback_all(struct hl_device *hdev);
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue);
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+		enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index f733b534f738..91579dde9262 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -58,8 +58,8 @@ out:
 }
 
 /*
- * ext_queue_submit_bd - Submit a buffer descriptor to an external queue
- *
+ * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
+ *                                H/W queue.
  * @hdev: pointer to habanalabs device structure
  * @q: pointer to habanalabs queue structure
  * @ctl: BD's control word
@@ -73,8 +73,8 @@ out:
  * This function must be called when the scheduler mutex is taken
  *
  */
-static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
-				u32 ctl, u32 len, u64 ptr)
+static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
+			struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr)
 {
 	struct hl_bd *bd;
 
@@ -173,6 +173,45 @@ static int int_queue_sanity_checks(struct hl_device *hdev,
 	return 0;
 }
 
+/*
+ * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue.
+ * @hdev: Pointer to hl_device structure.
+ * @q: Pointer to hl_hw_queue structure.
+ * @num_of_entries: How many entries to check for space.
+ *
+ * Perform the following:
+ * - Make sure we have enough space in the completion queue.
+ *   This check also ensures that there is enough space in the h/w queue, as
+ *   both queues are of the same size.
+ * - Reserve space in the completion queue (needs to be reversed if there
+ *   is a failure down the road before the actual submission of work).
+ *
+ * Both operations are done using the "free_slots_cnt" field of the completion
+ * queue. The CI counters of the queue and the completion queue are not
+ * needed/used for the H/W queue type.
+ */
+static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
+					int num_of_entries)
+{
+	atomic_t *free_slots =
+			&hdev->completion_queue[q->hw_queue_id].free_slots_cnt;
+
+	/*
+	 * Check we have enough space in the completion queue.
+	 * Add -1 to counter (decrement) unless counter was already 0.
+	 * In that case, CQ is full so we can't submit a new CB.
+	 * atomic_add_unless will return 0 if counter was already 0.
+	 */
+	if (atomic_add_negative(num_of_entries * -1, free_slots)) {
+		dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n",
+			num_of_entries, q->hw_queue_id);
+		atomic_add(num_of_entries, free_slots);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
 /*
  * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
  *
@@ -188,7 +227,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 				u32 cb_size, u64 cb_ptr)
 {
 	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
-	int rc;
+	int rc = 0;
 
 	/*
 	 * The CPU queue is a synchronous queue with an effective depth of
@@ -206,11 +245,18 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 		goto out;
 	}
 
-	rc = ext_queue_sanity_checks(hdev, q, 1, false);
-	if (rc)
-		goto out;
+	/*
+	 * hl_hw_queue_send_cb_no_cmpl() is called for queues of a H/W queue
+	 * type only on init phase, when the queues are empty and being tested,
+	 * so there is no need for sanity checks.
+	 */
+	if (q->queue_type != QUEUE_TYPE_HW) {
+		rc = ext_queue_sanity_checks(hdev, q, 1, false);
+		if (rc)
+			goto out;
+	}
 
-	ext_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
+	ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
 
 out:
 	if (q->queue_type != QUEUE_TYPE_CPU)
@@ -220,14 +266,14 @@ out:
 }
 
 /*
- * ext_hw_queue_schedule_job - submit a JOB to an external queue
+ * ext_queue_schedule_job - submit a JOB to an external queue
  *
  * @job: pointer to the job that needs to be submitted to the queue
  *
  * This function must be called when the scheduler mutex is taken
  *
  */
-static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
+static void ext_queue_schedule_job(struct hl_cs_job *job)
 {
 	struct hl_device *hdev = job->cs->ctx->hdev;
 	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
@@ -260,7 +306,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
 	 * H/W queues is done under the scheduler mutex
 	 *
 	 * No need to check if CQ is full because it was already
-	 * checked in hl_queue_sanity_checks
+	 * checked in ext_queue_sanity_checks
 	 */
 	cq = &hdev->completion_queue[q->hw_queue_id];
 	cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);
@@ -274,18 +320,18 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
 
 	cq->pi = hl_cq_inc_ptr(cq->pi);
 
-	ext_queue_submit_bd(hdev, q, ctl, len, ptr);
+	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
 /*
- * int_hw_queue_schedule_job - submit a JOB to an internal queue
+ * int_queue_schedule_job - submit a JOB to an internal queue
  *
  * @job: pointer to the job that needs to be submitted to the queue
  *
  * This function must be called when the scheduler mutex is taken
  *
  */
-static void int_hw_queue_schedule_job(struct hl_cs_job *job)
+static void int_queue_schedule_job(struct hl_cs_job *job)
 {
 	struct hl_device *hdev = job->cs->ctx->hdev;
 	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
@@ -307,6 +353,60 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job)
 	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
 }
 
+/*
+ * hw_queue_schedule_job - submit a JOB to a H/W queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void hw_queue_schedule_job(struct hl_cs_job *job)
+{
+	struct hl_device *hdev = job->cs->ctx->hdev;
+	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+	struct hl_cq *cq;
+	u64 ptr;
+	u32 offset, ctl, len;
+
+	/*
+	 * Upon PQE completion, COMP_DATA is used as the write data to the
+	 * completion queue (QMAN HBW message), and COMP_OFFSET is used as the
+	 * write address offset in the SM block (QMAN LBW message).
+	 * The write address offset is calculated as "COMP_OFFSET << 2".
+	 */
+	offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+	ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
+		((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
+
+	len = job->job_cb_size;
+
+	/*
+	 * A patched CB is created only if a user CB was allocated by driver and
+	 * MMU is disabled. If MMU is enabled, the user CB should be used
+	 * instead. If the user CB wasn't allocated by driver, assume that it
+	 * holds an address.
+	 */
+	if (job->patched_cb)
+		ptr = job->patched_cb->bus_address;
+	else if (job->is_kernel_allocated_cb)
+		ptr = job->user_cb->bus_address;
+	else
+		ptr = (u64) (uintptr_t) job->user_cb;
+
+	/*
+	 * No need to protect pi_offset because scheduling to the
+	 * H/W queues is done under the scheduler mutex
+	 *
+	 * No need to check if CQ is full because it was already
+	 * checked in hw_queue_sanity_checks
+	 */
+	cq = &hdev->completion_queue[q->hw_queue_id];
+	cq->pi = hl_cq_inc_ptr(cq->pi);
+
+	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+}
+
 /*
  * hl_hw_queue_schedule_cs - schedule a command submission
  *
@@ -330,23 +430,34 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 	}
 
 	q = &hdev->kernel_queues[0];
-	/* This loop assumes all external queues are consecutive */
 	for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
-		if (q->queue_type == QUEUE_TYPE_EXT) {
-			if (cs->jobs_in_queue_cnt[i]) {
+		if (cs->jobs_in_queue_cnt[i]) {
+			switch (q->queue_type) {
+			case QUEUE_TYPE_EXT:
 				rc = ext_queue_sanity_checks(hdev, q,
-					cs->jobs_in_queue_cnt[i], true);
-				if (rc)
-					goto unroll_cq_resv;
-				cq_cnt++;
-			}
-		} else if (q->queue_type == QUEUE_TYPE_INT) {
-			if (cs->jobs_in_queue_cnt[i]) {
+						cs->jobs_in_queue_cnt[i], true);
+				break;
+			case QUEUE_TYPE_INT:
 				rc = int_queue_sanity_checks(hdev, q,
-					cs->jobs_in_queue_cnt[i]);
-				if (rc)
-					goto unroll_cq_resv;
+						cs->jobs_in_queue_cnt[i]);
+				break;
+			case QUEUE_TYPE_HW:
+				rc = hw_queue_sanity_checks(hdev, q,
+						cs->jobs_in_queue_cnt[i]);
+				break;
+			default:
+				dev_err(hdev->dev, "Queue type %d is invalid\n",
+					q->queue_type);
+				rc = -EINVAL;
+				break;
 			}
+
+			if (rc)
+				goto unroll_cq_resv;
+
+			if (q->queue_type == QUEUE_TYPE_EXT ||
+					q->queue_type == QUEUE_TYPE_HW)
+				cq_cnt++;
 		}
 	}
 
@@ -373,21 +484,30 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 	}
 
 	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-		if (job->ext_queue)
-			ext_hw_queue_schedule_job(job);
-		else
-			int_hw_queue_schedule_job(job);
+		switch (job->queue_type) {
+		case QUEUE_TYPE_EXT:
+			ext_queue_schedule_job(job);
+			break;
+		case QUEUE_TYPE_INT:
+			int_queue_schedule_job(job);
+			break;
+		case QUEUE_TYPE_HW:
+			hw_queue_schedule_job(job);
+			break;
+		default:
+			break;
+		}
 
 	cs->submitted = true;
 
 	goto out;
 
 unroll_cq_resv:
-	/* This loop assumes all external queues are consecutive */
 	q = &hdev->kernel_queues[0];
 	for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
-		if ((q->queue_type == QUEUE_TYPE_EXT) &&
-				(cs->jobs_in_queue_cnt[i])) {
+		if ((q->queue_type == QUEUE_TYPE_EXT ||
+				q->queue_type == QUEUE_TYPE_HW) &&
+				cs->jobs_in_queue_cnt[i]) {
 			atomic_t *free_slots =
 				&hdev->completion_queue[i].free_slots_cnt;
 			atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
@@ -414,8 +534,8 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
 	q->ci = hl_queue_inc_ptr(q->ci);
 }
 
-static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
-				struct hl_hw_queue *q, bool is_cpu_queue)
+static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+					bool is_cpu_queue)
 {
 	void *p;
 	int rc;
@@ -465,7 +585,7 @@ free_queue:
 	return rc;
 }
 
-static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
 	void *p;
 
@@ -485,18 +605,38 @@ static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 	return 0;
 }
 
-static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
-	return ext_and_cpu_hw_queue_init(hdev, q, true);
+	return ext_and_cpu_queue_init(hdev, q, true);
 }
 
-static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
-	return ext_and_cpu_hw_queue_init(hdev, q, false);
+	return ext_and_cpu_queue_init(hdev, q, false);
+}
+
+static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+{
+	void *p;
+
+	p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
+						HL_QUEUE_SIZE_IN_BYTES,
+						&q->bus_address,
+						GFP_KERNEL | __GFP_ZERO);
+	if (!p)
+		return -ENOMEM;
+
+	q->kernel_address = (u64) (uintptr_t) p;
+
+	/* Make sure read/write pointers are initialized to start of queue */
+	q->ci = 0;
+	q->pi = 0;
+
+	return 0;
 }
 
 /*
- * hw_queue_init - main initialization function for H/W queue object
+ * queue_init - main initialization function for H/W queue object
  *
  * @hdev: pointer to hl_device device structure
  * @q: pointer to hl_hw_queue queue structure
@@ -505,7 +645,7 @@ static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
  * Allocate dma-able memory for the queue and initialize fields
  * Returns 0 on success
  */
-static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
 			u32 hw_queue_id)
 {
 	int rc;
@@ -516,21 +656,20 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
 
 	switch (q->queue_type) {
 	case QUEUE_TYPE_EXT:
-		rc = ext_hw_queue_init(hdev, q);
+		rc = ext_queue_init(hdev, q);
 		break;
-
 	case QUEUE_TYPE_INT:
-		rc = int_hw_queue_init(hdev, q);
+		rc = int_queue_init(hdev, q);
 		break;
-
 	case QUEUE_TYPE_CPU:
-		rc = cpu_hw_queue_init(hdev, q);
+		rc = cpu_queue_init(hdev, q);
+		break;
+	case QUEUE_TYPE_HW:
+		rc = hw_queue_init(hdev, q);
 		break;
-
 	case QUEUE_TYPE_NA:
 		q->valid = 0;
 		return 0;
-
 	default:
 		dev_crit(hdev->dev, "wrong queue type %d during init\n",
 			q->queue_type);
@@ -554,7 +693,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
  *
  * Free the queue memory
  */
-static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
+static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
 {
 	if (!q->valid)
 		return;
@@ -612,7 +751,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
 			i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
 
 		q->queue_type = asic->hw_queues_props[i].type;
-		rc = hw_queue_init(hdev, q, i);
+		rc = queue_init(hdev, q, i);
 		if (rc) {
 			dev_err(hdev->dev,
 				"failed to initialize queue %d\n", i);
@@ -624,7 +763,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
 
 release_queues:
 	for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
-		hw_queue_fini(hdev, q);
+		queue_fini(hdev, q);
 
 	kfree(hdev->kernel_queues);
 
@@ -637,7 +776,7 @@ void hl_hw_queues_destroy(struct hl_device *hdev)
 	int i;
 
 	for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
-		hw_queue_fini(hdev, q);
+		queue_fini(hdev, q);
 
 	kfree(hdev->kernel_queues);
 }
diff --git a/drivers/misc/habanalabs/include/qman_if.h b/drivers/misc/habanalabs/include/qman_if.h
index bf59bbe27fdc..0fdb49188ed7 100644
--- a/drivers/misc/habanalabs/include/qman_if.h
+++ b/drivers/misc/habanalabs/include/qman_if.h
@@ -23,6 +23,8 @@ struct hl_bd {
 #define HL_BD_SIZE			sizeof(struct hl_bd)
 
 /*
+ * S/W CTL FIELDS.
+ *
  * BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is
  * valid. 1 means the repeat field is valid, 0 means not-valid,
  * i.e. repeat == 1
@@ -33,6 +35,16 @@ struct hl_bd {
 #define BD_CTL_SHADOW_INDEX_SHIFT	0
 #define BD_CTL_SHADOW_INDEX_MASK	0x00000FFF
 
+/*
+ * H/W CTL FIELDS
+ */
+
+#define BD_CTL_COMP_OFFSET_SHIFT	16
+#define BD_CTL_COMP_OFFSET_MASK		0x00FF0000
+
+#define BD_CTL_COMP_DATA_SHIFT		0
+#define BD_CTL_COMP_DATA_MASK		0x0000FFFF
+
 /*
  * COMPLETION QUEUE
  */

From 8fdacf2a530f36f6f0621a95ef0e37d8db2d2f89 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Wed, 2 Oct 2019 14:14:08 +0300
Subject: [PATCH 06/31] habanalabs: set TPC Icache to 16 cache lines

Reduce latency to memory during TPC kernel execution.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
---
 drivers/misc/habanalabs/goya/goya.c  | 3 +++
 drivers/misc/habanalabs/habanalabs.h | 7 ++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 0b40915bede2..d49f5ecd903b 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -1457,6 +1457,9 @@ static void goya_init_golden_registers(struct hl_device *hdev)
 				1 << TPC0_NRTR_SCRAMB_EN_VAL_SHIFT);
 		WREG32(mmTPC0_NRTR_NON_LIN_SCRAMB + offset,
 				1 << TPC0_NRTR_NON_LIN_SCRAMB_EN_SHIFT);
+
+		WREG32_FIELD(TPC0_CFG_MSS_CONFIG, offset,
+				ICACHE_FETCH_LINE_NUM, 2);
 	}
 
 	WREG32(mmDMA_NRTR_SCRAMB_EN, 1 << DMA_NRTR_SCRAMB_EN_VAL_SHIFT);
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 371d1ec15697..91445371b08b 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1062,9 +1062,10 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 
 #define REG_FIELD_SHIFT(reg, field) reg##_##field##_SHIFT
 #define REG_FIELD_MASK(reg, field) reg##_##field##_MASK
-#define WREG32_FIELD(reg, field, val)	\
-	WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \
-			(val) << REG_FIELD_SHIFT(reg, field))
+#define WREG32_FIELD(reg, offset, field, val)	\
+	WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & \
+				~REG_FIELD_MASK(reg, field)) | \
+				(val) << REG_FIELD_SHIFT(reg, field))
 
 /* Timeout should be longer when working with simulator but cap the
  * increased timeout to some maximum

From 62c1e124a9e03ccb8bb39efe1d092c2376967528 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Thu, 10 Oct 2019 15:48:59 +0300
Subject: [PATCH 07/31] habanalabs: add opcode to INFO IOCTL to return clock
 rate

Add a new opcode to the INFO IOCTL to allow the user application to
retrieve the ASIC's current and maximum clock rate. The rate is
returned in MHz.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
---
 drivers/misc/habanalabs/goya/goya.c        |  3 ++-
 drivers/misc/habanalabs/goya/goyaP.h       |  2 ++
 drivers/misc/habanalabs/goya/goya_hwmgr.c  | 31 ++++++++++++++++++++++
 drivers/misc/habanalabs/habanalabs.h       |  2 ++
 drivers/misc/habanalabs/habanalabs_ioctl.c | 23 ++++++++++++++++
 include/uapi/misc/habanalabs.h             | 19 +++++++++----
 6 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index d49f5ecd903b..ac574d18c139 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5148,7 +5148,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.init_iatu = goya_init_iatu,
 	.rreg = hl_rreg,
 	.wreg = hl_wreg,
-	.halt_coresight = goya_halt_coresight
+	.halt_coresight = goya_halt_coresight,
+	.get_clk_rate = goya_get_clk_rate
 };
 
 /*
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index 89b6574f8e4f..c3230cb6e25c 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -233,4 +233,6 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 					void *vaddr);
 void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev);
 
+int goya_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
+
 #endif /* GOYAP_H_ */
diff --git a/drivers/misc/habanalabs/goya/goya_hwmgr.c b/drivers/misc/habanalabs/goya/goya_hwmgr.c
index a2a700c3d597..b2ebc01e27f4 100644
--- a/drivers/misc/habanalabs/goya/goya_hwmgr.c
+++ b/drivers/misc/habanalabs/goya/goya_hwmgr.c
@@ -32,6 +32,37 @@ void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq)
 	}
 }
 
+int goya_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk)
+{
+	long value;
+
+	if (hl_device_disabled_or_in_reset(hdev))
+		return -ENODEV;
+
+	value = hl_get_frequency(hdev, MME_PLL, false);
+
+	if (value < 0) {
+		dev_err(hdev->dev, "Failed to retrieve device max clock %ld\n",
+			value);
+		return value;
+	}
+
+	*max_clk = (value / 1000 / 1000);
+
+	value = hl_get_frequency(hdev, MME_PLL, true);
+
+	if (value < 0) {
+		dev_err(hdev->dev,
+			"Failed to retrieve device current clock %ld\n",
+			value);
+		return value;
+	}
+
+	*cur_clk = (value / 1000 / 1000);
+
+	return 0;
+}
+
 static ssize_t mme_clk_show(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 91445371b08b..4ff2da859653 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -508,6 +508,7 @@ enum hl_pll_frequency {
  * @rreg: Read a register. Needed for simulator support.
  * @wreg: Write a register. Needed for simulator support.
  * @halt_coresight: stop the ETF and ETR traces.
+ * @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -590,6 +591,7 @@ struct hl_asic_funcs {
 	u32 (*rreg)(struct hl_device *hdev, u32 reg);
 	void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
 	void (*halt_coresight)(struct hl_device *hdev);
+	int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
 };
 
 
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index 66d9c710073c..cd4b5a9ceac1 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -221,6 +221,25 @@ static int device_utilization(struct hl_device *hdev, struct hl_info_args *args)
 		min((size_t) max_size, sizeof(device_util))) ? -EFAULT : 0;
 }
 
+static int get_clk_rate(struct hl_device *hdev, struct hl_info_args *args)
+{
+	struct hl_info_clk_rate clk_rate = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	rc = hdev->asic_funcs->get_clk_rate(hdev, &clk_rate.cur_clk_rate_mhz,
+						&clk_rate.max_clk_rate_mhz);
+	if (rc)
+		return rc;
+
+	return copy_to_user(out, &clk_rate,
+		min((size_t) max_size, sizeof(clk_rate))) ? -EFAULT : 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -271,6 +290,10 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 		rc = hw_events_info(hdev, true, args);
 		break;
 
+	case HL_INFO_CLK_RATE:
+		rc = get_clk_rate(hdev, args);
+		break;
+
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -ENOTTY;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 53e4ff73578e..783793c8be1c 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -88,13 +88,16 @@ enum hl_device_status {
  *                         internal engine.
  * HL_INFO_DEVICE_STATUS - Retrieve the device's status. This opcode doesn't
  *                         require an open context.
- * HL_INFO_DEVICE_UTILIZATION - Retrieve the total utilization of the device
- *                              over the last period specified by the user.
- *                              The period can be between 100ms to 1s, in
- *                              resolution of 100ms. The return value is a
- *                              percentage of the utilization rate.
+ * HL_INFO_DEVICE_UTILIZATION  - Retrieve the total utilization of the device
+ *                               over the last period specified by the user.
+ *                               The period can be between 100ms to 1s, in
+ *                               resolution of 100ms. The return value is a
+ *                               percentage of the utilization rate.
  * HL_INFO_HW_EVENTS_AGGREGATE - Receive an array describing how many times each
  *                               event occurred since the driver was loaded.
+ * HL_INFO_CLK_RATE            - Retrieve the current and maximum clock rate
+ *                               of the device in MHz. The maximum clock rate is
+ *                               configurable via sysfs parameter
  */
 #define HL_INFO_HW_IP_INFO		0
 #define HL_INFO_HW_EVENTS		1
@@ -103,6 +106,7 @@ enum hl_device_status {
 #define HL_INFO_DEVICE_STATUS		4
 #define HL_INFO_DEVICE_UTILIZATION	6
 #define HL_INFO_HW_EVENTS_AGGREGATE	7
+#define HL_INFO_CLK_RATE		8
 
 #define HL_INFO_VERSION_MAX_LEN	128
 
@@ -149,6 +153,11 @@ struct hl_info_device_utilization {
 	__u32 pad;
 };
 
+struct hl_info_clk_rate {
+	__u32 cur_clk_rate_mhz;
+	__u32 max_clk_rate_mhz;
+};
+
 struct hl_info_args {
 	/* Location of relevant struct in userspace */
 	__u64 return_pointer;

From 8d6de52866dcf1d6cbdec9aa10f722dd43b2431f Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 16 Oct 2019 16:46:32 +0800
Subject: [PATCH 08/31] habanalabs: remove set but not used variable
 'qman_base_addr'

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/misc/habanalabs/goya/goya.c: In function 'goya_init_mme_cmdq':
drivers/misc/habanalabs/goya/goya.c:1536:6: warning:
 variable 'qman_base_addr' set but not used [-Wunused-but-set-variable]

It is never used, so can be removed.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index ac574d18c139..e8812154343f 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -1539,7 +1539,6 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
 	u32 mtr_base_lo, mtr_base_hi;
 	u32 so_base_lo, so_base_hi;
 	u32 gic_base_lo, gic_base_hi;
-	u64 qman_base_addr;
 
 	mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
 	mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
@@ -1551,9 +1550,6 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
 	gic_base_hi =
 		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
 
-	qman_base_addr = hdev->asic_prop.sram_base_address +
-				MME_QMAN_BASE_OFFSET;
-
 	WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_LO, mtr_base_lo);
 	WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_HI, mtr_base_hi);
 	WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_LO,	so_base_lo);

From 91edbf2cf8f0416b854674e891d7a5274f4b1702 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Wed, 16 Oct 2019 11:53:52 +0300
Subject: [PATCH 09/31] habanalabs: expose card name in INFO IOCTL

To enable userspace processes, e.g. management utilities, to display the
card name to the user, add the card name property to the HW_IP
structure that is copied to the user in the INFO IOCTL.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c        | 3 +++
 drivers/misc/habanalabs/habanalabs_ioctl.c | 9 +++++++--
 include/uapi/misc/habanalabs.h             | 2 ++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index e8812154343f..d3ee9e2aa57e 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -396,6 +396,9 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->tpc_enabled_mask = TPC_ENABLED_MASK;
 	prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
 	prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
+
+	strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
+		CARD_NAME_MAX_LEN);
 }
 
 /*
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index cd4b5a9ceac1..02d7491fa28f 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -63,8 +63,13 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	if (hw_ip.dram_size > 0)
 		hw_ip.dram_enabled = 1;
 	hw_ip.num_of_events = prop->num_of_events;
-	memcpy(hw_ip.armcp_version,
-		prop->armcp_info.armcp_version, VERSION_MAX_LEN);
+
+	memcpy(hw_ip.armcp_version, prop->armcp_info.armcp_version,
+		min(VERSION_MAX_LEN, HL_INFO_VERSION_MAX_LEN));
+
+	memcpy(hw_ip.card_name, prop->armcp_info.card_name,
+		min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN));
+
 	hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version);
 	hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
 	hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 783793c8be1c..e387d9e560b3 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -109,6 +109,7 @@ enum hl_device_status {
 #define HL_INFO_CLK_RATE		8
 
 #define HL_INFO_VERSION_MAX_LEN	128
+#define HL_INFO_CARD_NAME_MAX_LEN	16
 
 struct hl_info_hw_ip_info {
 	__u64 sram_base_address;
@@ -127,6 +128,7 @@ struct hl_info_hw_ip_info {
 	__u8 dram_enabled;
 	__u8 pad[2];
 	__u8 armcp_version[HL_INFO_VERSION_MAX_LEN];
+	__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
 };
 
 struct hl_info_dram_usage {

From f05912d8f16bf303e293d4add2caecb8a9231c41 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sun, 20 Oct 2019 11:07:11 +0300
Subject: [PATCH 10/31] habanalabs: read F/W versions before failure

Move the read of the F/W boot versions before exiting on possible failures
of the F/W boot. This will help debug boot failures as we will be able to
know the F/W boot version.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/goya/goya.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index d3ee9e2aa57e..4e767e1d78e4 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2296,6 +2296,10 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 		10000,
 		cpu_timeout);
 
+	/* Read U-Boot version now in case we will later fail */
+	goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
+	goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
+
 	if (rc) {
 		dev_err(hdev->dev, "Error in ARM u-boot!");
 		switch (status) {
@@ -2347,10 +2351,6 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 		return -EIO;
 	}
 
-	/* Read U-Boot version now in case we will later fail */
-	goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
-	goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
-
 	if (!hdev->fw_loading) {
 		dev_info(hdev->dev, "Skip loading FW\n");
 		goto out;

From e1a84d56fcb92d4551692cbec4bada1cec00e620 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Thu, 24 Oct 2019 09:52:25 +0300
Subject: [PATCH 11/31] habanalabs: use registers name defines for ETR block

We have a single ETR block in the SOC, so use explicit register
name defines for initializing this block. This makes it more readable and
maintainable.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/goya/goya_coresight.c |  51 ++++----
 .../include/goya/asic_reg/goya_regs.h         |   1 +
 .../include/goya/asic_reg/psoc_etr_regs.h     | 114 ++++++++++++++++++
 3 files changed, 140 insertions(+), 26 deletions(-)
 create mode 100644 drivers/misc/habanalabs/include/goya/asic_reg/psoc_etr_regs.h

diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c
index b4d406af1bed..16bcd60b111f 100644
--- a/drivers/misc/habanalabs/goya/goya_coresight.c
+++ b/drivers/misc/habanalabs/goya/goya_coresight.c
@@ -377,33 +377,32 @@ static int goya_config_etr(struct hl_device *hdev,
 		struct hl_debug_params *params)
 {
 	struct hl_debug_params_etr *input;
-	u64 base_reg = mmPSOC_ETR_BASE - CFG_BASE;
 	u32 val;
 	int rc;
 
-	WREG32(base_reg + 0xFB0, CORESIGHT_UNLOCK);
+	WREG32(mmPSOC_ETR_LAR, CORESIGHT_UNLOCK);
 
-	val = RREG32(base_reg + 0x304);
+	val = RREG32(mmPSOC_ETR_FFCR);
 	val |= 0x1000;
-	WREG32(base_reg + 0x304, val);
+	WREG32(mmPSOC_ETR_FFCR, val);
 	val |= 0x40;
-	WREG32(base_reg + 0x304, val);
+	WREG32(mmPSOC_ETR_FFCR, val);
 
-	rc = goya_coresight_timeout(hdev, base_reg + 0x304, 6, false);
+	rc = goya_coresight_timeout(hdev, mmPSOC_ETR_FFCR, 6, false);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to %s ETR on timeout, error %d\n",
 				params->enable ? "enable" : "disable", rc);
 		return rc;
 	}
 
-	rc = goya_coresight_timeout(hdev, base_reg + 0xC, 2, true);
+	rc = goya_coresight_timeout(hdev, mmPSOC_ETR_STS, 2, true);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to %s ETR on timeout, error %d\n",
 				params->enable ? "enable" : "disable", rc);
 		return rc;
 	}
 
-	WREG32(base_reg + 0x20, 0);
+	WREG32(mmPSOC_ETR_CTL, 0);
 
 	if (params->enable) {
 		input = params->input;
@@ -423,25 +422,25 @@ static int goya_config_etr(struct hl_device *hdev,
 			return -EINVAL;
 		}
 
-		WREG32(base_reg + 0x34, 0x3FFC);
-		WREG32(base_reg + 0x4, input->buffer_size);
-		WREG32(base_reg + 0x28, input->sink_mode);
-		WREG32(base_reg + 0x110, 0x700);
-		WREG32(base_reg + 0x118,
+		WREG32(mmPSOC_ETR_BUFWM, 0x3FFC);
+		WREG32(mmPSOC_ETR_RSZ, input->buffer_size);
+		WREG32(mmPSOC_ETR_MODE, input->sink_mode);
+		WREG32(mmPSOC_ETR_AXICTL, 0x700);
+		WREG32(mmPSOC_ETR_DBALO,
 				lower_32_bits(input->buffer_address));
-		WREG32(base_reg + 0x11C,
+		WREG32(mmPSOC_ETR_DBAHI,
 				upper_32_bits(input->buffer_address));
-		WREG32(base_reg + 0x304, 3);
-		WREG32(base_reg + 0x308, 0xA);
-		WREG32(base_reg + 0x20, 1);
+		WREG32(mmPSOC_ETR_FFCR, 3);
+		WREG32(mmPSOC_ETR_PSCR, 0xA);
+		WREG32(mmPSOC_ETR_CTL, 1);
 	} else {
-		WREG32(base_reg + 0x34, 0);
-		WREG32(base_reg + 0x4, 0x400);
-		WREG32(base_reg + 0x118, 0);
-		WREG32(base_reg + 0x11C, 0);
-		WREG32(base_reg + 0x308, 0);
-		WREG32(base_reg + 0x28, 0);
-		WREG32(base_reg + 0x304, 0);
+		WREG32(mmPSOC_ETR_BUFWM, 0);
+		WREG32(mmPSOC_ETR_RSZ, 0x400);
+		WREG32(mmPSOC_ETR_DBALO, 0);
+		WREG32(mmPSOC_ETR_DBAHI, 0);
+		WREG32(mmPSOC_ETR_PSCR, 0);
+		WREG32(mmPSOC_ETR_MODE, 0);
+		WREG32(mmPSOC_ETR_FFCR, 0);
 
 		if (params->output_size >= sizeof(u64)) {
 			u32 rwp, rwphi;
@@ -451,8 +450,8 @@ static int goya_config_etr(struct hl_device *hdev,
 			 * the buffer is set in the RWP register (lower 32
 			 * bits), and in the RWPHI register (upper 8 bits).
 			 */
-			rwp = RREG32(base_reg + 0x18);
-			rwphi = RREG32(base_reg + 0x3c) & 0xff;
+			rwp = RREG32(mmPSOC_ETR_RWP);
+			rwphi = RREG32(mmPSOC_ETR_RWPHI) & 0xff;
 			*(u64 *) params->output = ((u64) rwphi << 32) | rwp;
 		}
 	}
diff --git a/drivers/misc/habanalabs/include/goya/asic_reg/goya_regs.h b/drivers/misc/habanalabs/include/goya/asic_reg/goya_regs.h
index 19b0f0ef1d0b..fce490e6a231 100644
--- a/drivers/misc/habanalabs/include/goya/asic_reg/goya_regs.h
+++ b/drivers/misc/habanalabs/include/goya/asic_reg/goya_regs.h
@@ -84,6 +84,7 @@
 #include "tpc6_rtr_regs.h"
 #include "tpc7_nrtr_regs.h"
 #include "tpc0_eml_cfg_regs.h"
+#include "psoc_etr_regs.h"
 
 #include "psoc_global_conf_masks.h"
 #include "dma_macro_masks.h"
diff --git a/drivers/misc/habanalabs/include/goya/asic_reg/psoc_etr_regs.h b/drivers/misc/habanalabs/include/goya/asic_reg/psoc_etr_regs.h
new file mode 100644
index 000000000000..b7c33e025db5
--- /dev/null
+++ b/drivers/misc/habanalabs/include/goya/asic_reg/psoc_etr_regs.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2016-2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+/************************************
+ ** This is an auto-generated file **
+ **       DO NOT EDIT BELOW        **
+ ************************************/
+
+#ifndef ASIC_REG_PSOC_ETR_REGS_H_
+#define ASIC_REG_PSOC_ETR_REGS_H_
+
+/*
+ *****************************************
+ *   PSOC_ETR (Prototype: ETR)
+ *****************************************
+ */
+
+#define mmPSOC_ETR_RSZ                                               0x2C43004
+
+#define mmPSOC_ETR_STS                                               0x2C4300C
+
+#define mmPSOC_ETR_RRD                                               0x2C43010
+
+#define mmPSOC_ETR_RRP                                               0x2C43014
+
+#define mmPSOC_ETR_RWP                                               0x2C43018
+
+#define mmPSOC_ETR_TRG                                               0x2C4301C
+
+#define mmPSOC_ETR_CTL                                               0x2C43020
+
+#define mmPSOC_ETR_RWD                                               0x2C43024
+
+#define mmPSOC_ETR_MODE                                              0x2C43028
+
+#define mmPSOC_ETR_LBUFLEVEL                                         0x2C4302C
+
+#define mmPSOC_ETR_CBUFLEVEL                                         0x2C43030
+
+#define mmPSOC_ETR_BUFWM                                             0x2C43034
+
+#define mmPSOC_ETR_RRPHI                                             0x2C43038
+
+#define mmPSOC_ETR_RWPHI                                             0x2C4303C
+
+#define mmPSOC_ETR_AXICTL                                            0x2C43110
+
+#define mmPSOC_ETR_DBALO                                             0x2C43118
+
+#define mmPSOC_ETR_DBAHI                                             0x2C4311C
+
+#define mmPSOC_ETR_FFSR                                              0x2C43300
+
+#define mmPSOC_ETR_FFCR                                              0x2C43304
+
+#define mmPSOC_ETR_PSCR                                              0x2C43308
+
+#define mmPSOC_ETR_ITMISCOP0                                         0x2C43EE0
+
+#define mmPSOC_ETR_ITTRFLIN                                          0x2C43EE8
+
+#define mmPSOC_ETR_ITATBDATA0                                        0x2C43EEC
+
+#define mmPSOC_ETR_ITATBCTR2                                         0x2C43EF0
+
+#define mmPSOC_ETR_ITATBCTR1                                         0x2C43EF4
+
+#define mmPSOC_ETR_ITATBCTR0                                         0x2C43EF8
+
+#define mmPSOC_ETR_ITCTRL                                            0x2C43F00
+
+#define mmPSOC_ETR_CLAIMSET                                          0x2C43FA0
+
+#define mmPSOC_ETR_CLAIMCLR                                          0x2C43FA4
+
+#define mmPSOC_ETR_LAR                                               0x2C43FB0
+
+#define mmPSOC_ETR_LSR                                               0x2C43FB4
+
+#define mmPSOC_ETR_AUTHSTATUS                                        0x2C43FB8
+
+#define mmPSOC_ETR_DEVID                                             0x2C43FC8
+
+#define mmPSOC_ETR_DEVTYPE                                           0x2C43FCC
+
+#define mmPSOC_ETR_PERIPHID4                                         0x2C43FD0
+
+#define mmPSOC_ETR_PERIPHID5                                         0x2C43FD4
+
+#define mmPSOC_ETR_PERIPHID6                                         0x2C43FD8
+
+#define mmPSOC_ETR_PERIPHID7                                         0x2C43FDC
+
+#define mmPSOC_ETR_PERIPHID0                                         0x2C43FE0
+
+#define mmPSOC_ETR_PERIPHID1                                         0x2C43FE4
+
+#define mmPSOC_ETR_PERIPHID2                                         0x2C43FE8
+
+#define mmPSOC_ETR_PERIPHID3                                         0x2C43FEC
+
+#define mmPSOC_ETR_COMPID0                                           0x2C43FF0
+
+#define mmPSOC_ETR_COMPID1                                           0x2C43FF4
+
+#define mmPSOC_ETR_COMPID2                                           0x2C43FF8
+
+#define mmPSOC_ETR_COMPID3                                           0x2C43FFC
+
+#endif /* ASIC_REG_PSOC_ETR_REGS_H_ */

From 6476b472437de2c41dc8873134c60d2928f806ce Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Thu, 24 Oct 2019 10:12:35 +0300
Subject: [PATCH 12/31] habanalabs: set ETR as non-secured

ETR should always be non-secured as it is used by the users to record
profiling/trace data.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/goya/goya_coresight.c              | 4 +++-
 drivers/misc/habanalabs/include/goya/asic_reg/goya_masks.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c
index 16bcd60b111f..c1ee6e2b5dff 100644
--- a/drivers/misc/habanalabs/goya/goya_coresight.c
+++ b/drivers/misc/habanalabs/goya/goya_coresight.c
@@ -8,6 +8,7 @@
 #include "goyaP.h"
 #include "include/goya/goya_coresight.h"
 #include "include/goya/asic_reg/goya_regs.h"
+#include "include/goya/asic_reg/goya_masks.h"
 
 #include <uapi/misc/habanalabs.h>
 
@@ -425,7 +426,8 @@ static int goya_config_etr(struct hl_device *hdev,
 		WREG32(mmPSOC_ETR_BUFWM, 0x3FFC);
 		WREG32(mmPSOC_ETR_RSZ, input->buffer_size);
 		WREG32(mmPSOC_ETR_MODE, input->sink_mode);
-		WREG32(mmPSOC_ETR_AXICTL, 0x700);
+		WREG32(mmPSOC_ETR_AXICTL,
+				0x700 | PSOC_ETR_AXICTL_PROTCTRLBIT1_SHIFT);
 		WREG32(mmPSOC_ETR_DBALO,
 				lower_32_bits(input->buffer_address));
 		WREG32(mmPSOC_ETR_DBAHI,
diff --git a/drivers/misc/habanalabs/include/goya/asic_reg/goya_masks.h b/drivers/misc/habanalabs/include/goya/asic_reg/goya_masks.h
index 8618891d5afa..3c44ef3a23ed 100644
--- a/drivers/misc/habanalabs/include/goya/asic_reg/goya_masks.h
+++ b/drivers/misc/habanalabs/include/goya/asic_reg/goya_masks.h
@@ -260,4 +260,6 @@
 #define DMA_QM_3_GLBL_CFG1_DMA_STOP_SHIFT DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT
 #define DMA_QM_4_GLBL_CFG1_DMA_STOP_SHIFT DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT
 
+#define PSOC_ETR_AXICTL_PROTCTRLBIT1_SHIFT                           1
+
 #endif /* ASIC_REG_GOYA_MASKS_H_ */

From bd4c8cb17d4e8f9e01ce48e3f2009307a58e60d2 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 9 Nov 2019 23:16:33 +0200
Subject: [PATCH 13/31] habanalabs: increase max jobs number to 512

In training, there is a need for a large amount of patching to the recipe.
This results in many command buffers contains a lot of DMA packets. The
number of command buffers per CS is larger than the current maximum of 64,
which is an arbitrary number that is enough for inference, but it has no
real affect on the code and/or resources of the host machine.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/habanalabs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 4ff2da859653..0813041f669a 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -40,7 +40,7 @@
 
 #define HL_MAX_QUEUES			128
 
-#define HL_MAX_JOBS_PER_CS		64
+#define HL_MAX_JOBS_PER_CS		512
 
 /* MUST BE POWER OF 2 and larger than 1 */
 #define HL_MAX_PENDING_CS		64

From eda58bf7860a028f207e8d4201d86191b898bbee Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sun, 10 Nov 2019 18:48:06 +0200
Subject: [PATCH 14/31] habanalabs: don't print error when queues are full

If the queues are full and we return -EAGAIN to the user, there is no need
to print an error, as that case isn't an error and the user is expected to
re-submit the work.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/command_submission.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 776ddafc47fb..8850f475a413 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -626,9 +626,10 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 
 	rc = hl_hw_queue_schedule_cs(cs);
 	if (rc) {
-		dev_err(hdev->dev,
-			"Failed to submit CS %d.%llu to H/W queues, error %d\n",
-			cs->ctx->asid, cs->sequence, rc);
+		if (rc != -EAGAIN)
+			dev_err(hdev->dev,
+				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
+				cs->ctx->asid, cs->sequence, rc);
 		goto free_cs_object;
 	}
 

From 5d1012576d20dd7cb70e00ea1b4c2af11a6c9156 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sun, 10 Nov 2019 16:08:26 +0200
Subject: [PATCH 15/31] habanalabs: export uapi defines to user-space

The two defines that control the maximum size of a command buffer and the
maximum number of JOBS per CS need to be exported to the user as they are
part of the API towards user-space.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/habanalabs.h |  4 ----
 include/uapi/misc/habanalabs.h       | 16 ++++++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 0813041f669a..2a5344cc1a60 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -40,8 +40,6 @@
 
 #define HL_MAX_QUEUES			128
 
-#define HL_MAX_JOBS_PER_CS		512
-
 /* MUST BE POWER OF 2 and larger than 1 */
 #define HL_MAX_PENDING_CS		64
 
@@ -242,8 +240,6 @@ struct hl_dma_fence {
  * Command Buffers
  */
 
-#define HL_MAX_CB_SIZE		0x200000	/* 2MB */
-
 /**
  * struct hl_cb_mgr - describes a Command Buffer Manager.
  * @cb_lock: protects cb_handles.
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index e387d9e560b3..716e70750f23 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -192,13 +192,15 @@ struct hl_info_args {
 /* Opcode to destroy previously created command buffer */
 #define HL_CB_OP_DESTROY	1
 
+#define HL_MAX_CB_SIZE		0x200000	/* 2MB */
+
 struct hl_cb_in {
 	/* Handle of CB or 0 if we want to create one */
 	__u64 cb_handle;
 	/* HL_CB_OP_* */
 	__u32 op;
-	/* Size of CB. Maximum size is 2MB. The minimum size that will be
-	 * allocated, regardless of this parameter's value, is PAGE_SIZE
+	/* Size of CB. Maximum size is HL_MAX_CB_SIZE. The minimum size that
+	 * will be allocated, regardless of this parameter's value, is PAGE_SIZE
 	 */
 	__u32 cb_size;
 	/* Context ID - Currently not in use */
@@ -244,6 +246,8 @@ struct hl_cs_chunk {
 
 #define HL_CS_STATUS_SUCCESS		0
 
+#define HL_MAX_JOBS_PER_CS		512
+
 struct hl_cs_in {
 	/* this holds address of array of hl_cs_chunk for restore phase */
 	__u64 chunks_restore;
@@ -253,9 +257,13 @@ struct hl_cs_in {
 	 * Currently not in use
 	 */
 	__u64 chunks_store;
-	/* Number of chunks in restore phase array */
+	/* Number of chunks in restore phase array. Maximum number is
+	 * HL_MAX_JOBS_PER_CS
+	 */
 	__u32 num_chunks_restore;
-	/* Number of chunks in execution array */
+	/* Number of chunks in execution array. Maximum number is
+	 * HL_MAX_JOBS_PER_CS
+	 */
 	__u32 num_chunks_execute;
 	/* Number of chunks in restore phase array - Currently not in use */
 	__u32 num_chunks_store;

From 7f74d4d335f1bdcb51fca584d5ad065c4ff996ac Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Mon, 12 Aug 2019 11:48:46 +0300
Subject: [PATCH 16/31] habanalabs: re-factor memory module code

Some of the functions in the memory module code were too long and/or
contained multiple operations that are not always done together. Re-factor
the code by dividing those functions to smaller functions which are more
readable and maintainable.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c  |   2 +-
 drivers/misc/habanalabs/habanalabs.h |   4 +-
 drivers/misc/habanalabs/memory.c     | 283 +++++++++++++++------------
 3 files changed, 159 insertions(+), 130 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 4e767e1d78e4..9712122d6cb1 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -3935,7 +3935,7 @@ static int goya_parse_cb_no_ext_queue(struct hl_device *hdev,
 		return 0;
 
 	dev_err(hdev->dev,
-		"Internal CB address %px + 0x%x is not in SRAM nor in DRAM\n",
+		"Internal CB address 0x%px + 0x%x is not in SRAM nor in DRAM\n",
 		parser->user_cb, parser->user_cb_size);
 
 	return -EFAULT;
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 2a5344cc1a60..78aef59e690b 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -692,7 +692,7 @@ struct hl_ctx_mgr {
  * @sgt: pointer to the scatter-gather table that holds the pages.
  * @dir: for DMA unmapping, the direction must be supplied, so save it.
  * @debugfs_list: node in debugfs list of command submissions.
- * @addr: user-space virtual pointer to the start of the memory area.
+ * @addr: user-space virtual address of the start of the memory area.
  * @size: size of the memory area to pin & map.
  * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise.
  */
@@ -1527,7 +1527,7 @@ void hl_vm_fini(struct hl_device *hdev);
 
 int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
 			struct hl_userptr *userptr);
-int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
+void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
 void hl_userptr_delete_list(struct hl_device *hdev,
 				struct list_head *userptr_list);
 bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size,
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index 365fb0cb8dff..8ade9886a5a7 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -159,20 +159,19 @@ pages_pack_err:
 }
 
 /*
- * get_userptr_from_host_va - initialize userptr structure from given host
- *                            virtual address
- *
- * @hdev                : habanalabs device structure
- * @args                : parameters containing the virtual address and size
- * @p_userptr           : pointer to result userptr structure
+ * dma_map_host_va - DMA mapping of the given host virtual address.
+ * @hdev: habanalabs device structure
+ * @addr: the host virtual address of the memory area
+ * @size: the size of the memory area
+ * @p_userptr: pointer to result userptr structure
  *
  * This function does the following:
  * - Allocate userptr structure
  * - Pin the given host memory using the userptr structure
  * - Perform DMA mapping to have the DMA addresses of the pages
  */
-static int get_userptr_from_host_va(struct hl_device *hdev,
-		struct hl_mem_in *args, struct hl_userptr **p_userptr)
+static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
+				struct hl_userptr **p_userptr)
 {
 	struct hl_userptr *userptr;
 	int rc;
@@ -183,8 +182,7 @@ static int get_userptr_from_host_va(struct hl_device *hdev,
 		goto userptr_err;
 	}
 
-	rc = hl_pin_host_memory(hdev, args->map_host.host_virt_addr,
-			args->map_host.mem_size, userptr);
+	rc = hl_pin_host_memory(hdev, addr, size, userptr);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to pin host memory\n");
 		goto pin_err;
@@ -215,16 +213,16 @@ userptr_err:
 }
 
 /*
- * free_userptr - free userptr structure
- *
- * @hdev                : habanalabs device structure
- * @userptr             : userptr to free
+ * dma_unmap_host_va - DMA unmapping of the given host virtual address.
+ * @hdev: habanalabs device structure
+ * @userptr: userptr to free
  *
  * This function does the following:
  * - Unpins the physical pages
  * - Frees the userptr structure
  */
-static void free_userptr(struct hl_device *hdev, struct hl_userptr *userptr)
+static void dma_unmap_host_va(struct hl_device *hdev,
+				struct hl_userptr *userptr)
 {
 	hl_unpin_host_memory(hdev, userptr);
 	kfree(userptr);
@@ -253,10 +251,9 @@ static void dram_pg_pool_do_release(struct kref *ref)
 }
 
 /*
- * free_phys_pg_pack   - free physical page pack
- *
- * @hdev               : habanalabs device structure
- * @phys_pg_pack       : physical page pack to free
+ * free_phys_pg_pack - free physical page pack
+ * @hdev: habanalabs device structure
+ * @phys_pg_pack: physical page pack to free
  *
  * This function does the following:
  * - For DRAM memory only, iterate over the pack and free each physical block
@@ -264,7 +261,7 @@ static void dram_pg_pool_do_release(struct kref *ref)
  * - Free the hl_vm_phys_pg_pack structure
  */
 static void free_phys_pg_pack(struct hl_device *hdev,
-		struct hl_vm_phys_pg_pack *phys_pg_pack)
+				struct hl_vm_phys_pg_pack *phys_pg_pack)
 {
 	struct hl_vm *vm = &hdev->vm;
 	u64 i;
@@ -631,20 +628,18 @@ static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
 
 /*
  * init_phys_pg_pack_from_userptr - initialize physical page pack from host
- *                                   memory
- *
- * @ctx                : current context
- * @userptr            : userptr to initialize from
- * @pphys_pg_pack      : res pointer
+ *                                  memory
+ * @asid: current context ASID
+ * @userptr: userptr to initialize from
+ * @pphys_pg_pack: result pointer
  *
  * This function does the following:
  * - Pin the physical pages related to the given virtual block
  * - Create a physical page pack from the physical pages related to the given
  *   virtual block
  */
-static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
-		struct hl_userptr *userptr,
-		struct hl_vm_phys_pg_pack **pphys_pg_pack)
+static int init_phys_pg_pack_from_userptr(u32 asid, struct hl_userptr *userptr,
+				struct hl_vm_phys_pg_pack **pphys_pg_pack)
 {
 	struct hl_vm_phys_pg_pack *phys_pg_pack;
 	struct scatterlist *sg;
@@ -660,7 +655,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 
 	phys_pg_pack->vm_type = userptr->vm_type;
 	phys_pg_pack->created_from_userptr = true;
-	phys_pg_pack->asid = ctx->asid;
+	phys_pg_pack->asid = asid;
 	atomic_set(&phys_pg_pack->mapping_cnt, 1);
 
 	/* Only if all dma_addrs are aligned to 2MB and their
@@ -731,19 +726,18 @@ page_pack_arr_mem_err:
 }
 
 /*
- * map_phys_page_pack - maps the physical page pack
- *
- * @ctx                : current context
- * @vaddr              : start address of the virtual area to map from
- * @phys_pg_pack       : the pack of physical pages to map to
+ * map_phys_pg_pack - maps the physical page pack.
+ * @ctx: current context
+ * @vaddr: start address of the virtual area to map from
+ * @phys_pg_pack: the pack of physical pages to map to
  *
  * This function does the following:
  * - Maps each chunk of virtual memory to matching physical chunk
  * - Stores number of successful mappings in the given argument
- * - Returns 0 on success, error code otherwise.
+ * - Returns 0 on success, error code otherwise
  */
-static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr,
-		struct hl_vm_phys_pg_pack *phys_pg_pack)
+static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
+				struct hl_vm_phys_pg_pack *phys_pg_pack)
 {
 	struct hl_device *hdev = ctx->hdev;
 	u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i;
@@ -783,6 +777,36 @@ err:
 	return rc;
 }
 
+/*
+ * unmap_phys_pg_pack - unmaps the physical page pack
+ * @ctx: current context
+ * @vaddr: start address of the virtual area to unmap
+ * @phys_pg_pack: the pack of physical pages to unmap
+ */
+static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
+				struct hl_vm_phys_pg_pack *phys_pg_pack)
+{
+	struct hl_device *hdev = ctx->hdev;
+	u64 next_vaddr, i;
+	u32 page_size;
+
+	page_size = phys_pg_pack->page_size;
+	next_vaddr = vaddr;
+
+	for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
+		if (hl_mmu_unmap(ctx, next_vaddr, page_size))
+			dev_warn_ratelimited(hdev->dev,
+			"unmap failed for vaddr: 0x%llx\n", next_vaddr);
+
+		/*
+		 * unmapping on Palladium can be really long, so avoid a CPU
+		 * soft lockup bug by sleeping a little between unmapping pages
+		 */
+		if (hdev->pldm)
+			usleep_range(500, 1000);
+	}
+}
+
 static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
 				u64 *paddr)
 {
@@ -839,18 +863,21 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	*device_addr = 0;
 
 	if (is_userptr) {
-		rc = get_userptr_from_host_va(hdev, args, &userptr);
+		u64 addr = args->map_host.host_virt_addr,
+			size = args->map_host.mem_size;
+
+		rc = dma_map_host_va(hdev, addr, size, &userptr);
 		if (rc) {
 			dev_err(hdev->dev, "failed to get userptr from va\n");
 			return rc;
 		}
 
-		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
+		rc = init_phys_pg_pack_from_userptr(ctx->asid, userptr,
 				&phys_pg_pack);
 		if (rc) {
 			dev_err(hdev->dev,
 				"unable to init page pack for vaddr 0x%llx\n",
-				args->map_host.host_virt_addr);
+				addr);
 			goto init_page_pack_err;
 		}
 
@@ -909,7 +936,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
 	mutex_lock(&ctx->mmu_lock);
 
-	rc = map_phys_page_pack(ctx, ret_vaddr, phys_pg_pack);
+	rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
 	if (rc) {
 		mutex_unlock(&ctx->mmu_lock);
 		dev_err(hdev->dev, "mapping page pack failed for handle %u\n",
@@ -955,7 +982,7 @@ shared_err:
 		free_phys_pg_pack(hdev, phys_pg_pack);
 init_page_pack_err:
 	if (is_userptr)
-		free_userptr(hdev, userptr);
+		dma_unmap_host_va(hdev, userptr);
 
 	return rc;
 }
@@ -977,8 +1004,6 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 	struct hl_vm_hash_node *hnode = NULL;
 	struct hl_userptr *userptr = NULL;
 	enum vm_type_t *vm_type;
-	u64 next_vaddr, i;
-	u32 page_size;
 	bool is_userptr;
 	int rc;
 
@@ -1004,8 +1029,8 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 	if (*vm_type == VM_TYPE_USERPTR) {
 		is_userptr = true;
 		userptr = hnode->ptr;
-		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
-				&phys_pg_pack);
+		rc = init_phys_pg_pack_from_userptr(ctx->asid, userptr,
+							&phys_pg_pack);
 		if (rc) {
 			dev_err(hdev->dev,
 				"unable to init page pack for vaddr 0x%llx\n",
@@ -1029,24 +1054,11 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 		goto mapping_cnt_err;
 	}
 
-	page_size = phys_pg_pack->page_size;
-	vaddr &= ~(((u64) page_size) - 1);
-
-	next_vaddr = vaddr;
+	vaddr &= ~(((u64) phys_pg_pack->page_size) - 1);
 
 	mutex_lock(&ctx->mmu_lock);
 
-	for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
-		if (hl_mmu_unmap(ctx, next_vaddr, page_size))
-			dev_warn_ratelimited(hdev->dev,
-			"unmap failed for vaddr: 0x%llx\n", next_vaddr);
-
-		/* unmapping on Palladium can be really long, so avoid a CPU
-		 * soft lockup bug by sleeping a little between unmapping pages
-		 */
-		if (hdev->pldm)
-			usleep_range(500, 1000);
-	}
+	unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
 
 	hdev->asic_funcs->mmu_invalidate_cache(hdev, true);
 
@@ -1064,7 +1076,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 
 	if (is_userptr) {
 		free_phys_pg_pack(hdev, phys_pg_pack);
-		free_userptr(hdev, userptr);
+		dma_unmap_host_va(hdev, userptr);
 	}
 
 	return 0;
@@ -1203,57 +1215,17 @@ out:
 	return rc;
 }
 
-/*
- * hl_pin_host_memory - pins a chunk of host memory
- *
- * @hdev                : pointer to the habanalabs device structure
- * @addr                : the user-space virtual address of the memory area
- * @size                : the size of the memory area
- * @userptr	        : pointer to hl_userptr structure
- *
- * This function does the following:
- * - Pins the physical pages
- * - Create a SG list from those pages
- */
-int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
-			struct hl_userptr *userptr)
+static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
+				u32 npages, u64 start, u32 offset,
+				struct hl_userptr *userptr)
 {
-	u64 start, end;
-	u32 npages, offset;
 	int rc;
 
-	if (!size) {
-		dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
-		return -EINVAL;
-	}
-
 	if (!access_ok((void __user *) (uintptr_t) addr, size)) {
 		dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
 		return -EFAULT;
 	}
 
-	/*
-	 * If the combination of the address and size requested for this memory
-	 * region causes an integer overflow, return error.
-	 */
-	if (((addr + size) < addr) ||
-			PAGE_ALIGN(addr + size) < (addr + size)) {
-		dev_err(hdev->dev,
-			"user pointer 0x%llx + %llu causes integer overflow\n",
-			addr, size);
-		return -EINVAL;
-	}
-
-	start = addr & PAGE_MASK;
-	offset = addr & ~PAGE_MASK;
-	end = PAGE_ALIGN(addr + size);
-	npages = (end - start) >> PAGE_SHIFT;
-
-	userptr->size = size;
-	userptr->addr = addr;
-	userptr->dma_mapped = false;
-	INIT_LIST_HEAD(&userptr->job_node);
-
 	userptr->vec = frame_vector_create(npages);
 	if (!userptr->vec) {
 		dev_err(hdev->dev, "Failed to create frame vector\n");
@@ -1279,17 +1251,82 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
 		goto put_framevec;
 	}
 
-	userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC);
-	if (!userptr->sgt) {
-		rc = -ENOMEM;
-		goto put_framevec;
-	}
-
 	rc = sg_alloc_table_from_pages(userptr->sgt,
 					frame_vector_pages(userptr->vec),
 					npages, offset, size, GFP_ATOMIC);
 	if (rc < 0) {
 		dev_err(hdev->dev, "failed to create SG table from pages\n");
+		goto put_framevec;
+	}
+
+	return 0;
+
+put_framevec:
+	put_vaddr_frames(userptr->vec);
+destroy_framevec:
+	frame_vector_destroy(userptr->vec);
+	return rc;
+}
+
+/*
+ * hl_pin_host_memory - pins a chunk of host memory.
+ * @hdev: pointer to the habanalabs device structure
+ * @addr: the host virtual address of the memory area
+ * @size: the size of the memory area
+ * @userptr: pointer to hl_userptr structure
+ *
+ * This function does the following:
+ * - Pins the physical pages
+ * - Create an SG list from those pages
+ */
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
+					struct hl_userptr *userptr)
+{
+	u64 start, end;
+	u32 npages, offset;
+	int rc;
+
+	if (!size) {
+		dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
+		return -EINVAL;
+	}
+
+	/*
+	 * If the combination of the address and size requested for this memory
+	 * region causes an integer overflow, return error.
+	 */
+	if (((addr + size) < addr) ||
+			PAGE_ALIGN(addr + size) < (addr + size)) {
+		dev_err(hdev->dev,
+			"user pointer 0x%llx + %llu causes integer overflow\n",
+			addr, size);
+		return -EINVAL;
+	}
+
+	/*
+	 * This function can be called also from data path, hence use atomic
+	 * always as it is not a big allocation.
+	 */
+	userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC);
+	if (!userptr->sgt)
+		return -ENOMEM;
+
+	start = addr & PAGE_MASK;
+	offset = addr & ~PAGE_MASK;
+	end = PAGE_ALIGN(addr + size);
+	npages = (end - start) >> PAGE_SHIFT;
+
+	userptr->size = size;
+	userptr->addr = addr;
+	userptr->dma_mapped = false;
+	INIT_LIST_HEAD(&userptr->job_node);
+
+	rc = get_user_memory(hdev, addr, size, npages, start, offset,
+				userptr);
+	if (rc) {
+		dev_err(hdev->dev,
+			"failed to get user memory for address 0x%llx\n",
+			addr);
 		goto free_sgt;
 	}
 
@@ -1299,34 +1336,28 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
 
 free_sgt:
 	kfree(userptr->sgt);
-put_framevec:
-	put_vaddr_frames(userptr->vec);
-destroy_framevec:
-	frame_vector_destroy(userptr->vec);
 	return rc;
 }
 
 /*
- * hl_unpin_host_memory - unpins a chunk of host memory
- *
- * @hdev                : pointer to the habanalabs device structure
- * @userptr             : pointer to hl_userptr structure
+ * hl_unpin_host_memory - unpins a chunk of host memory.
+ * @hdev: pointer to the habanalabs device structure
+ * @userptr: pointer to hl_userptr structure
  *
  * This function does the following:
  * - Unpins the physical pages related to the host memory
  * - Free the SG list
  */
-int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
+void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
 {
 	struct page **pages;
 
 	hl_debugfs_remove_userptr(hdev, userptr);
 
 	if (userptr->dma_mapped)
-		hdev->asic_funcs->hl_dma_unmap_sg(hdev,
-				userptr->sgt->sgl,
-				userptr->sgt->nents,
-				userptr->dir);
+		hdev->asic_funcs->hl_dma_unmap_sg(hdev, userptr->sgt->sgl,
+							userptr->sgt->nents,
+							userptr->dir);
 
 	pages = frame_vector_pages(userptr->vec);
 	if (!IS_ERR(pages)) {
@@ -1342,8 +1373,6 @@ int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
 
 	sg_free_table(userptr->sgt);
 	kfree(userptr->sgt);
-
-	return 0;
 }
 
 /*
@@ -1627,7 +1656,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 	idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
 		if (phys_pg_list->asid == ctx->asid) {
 			dev_dbg(hdev->dev,
-				"page list 0x%p of asid %d is still alive\n",
+				"page list 0x%px of asid %d is still alive\n",
 				phys_pg_list, ctx->asid);
 			atomic64_sub(phys_pg_list->total_size,
 					&hdev->dram_used_mem);

From 7b6e4ea0f7b16ab292df5e67f5d847929f8e4d3e Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Thu, 14 Nov 2019 18:23:53 +0000
Subject: [PATCH 17/31] habanalabs: type specific MMU cache invalidation

Add the ability to invalidate the necessary MMU cache only.
This ability is a prerequisite for future ASICs support.
Note that in Goya ASIC, a single cache is used for both host/DRAM
mappings and hence this patch should not have any effect on current
behavior.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c  |  6 ++++--
 drivers/misc/habanalabs/habanalabs.h | 11 ++++++-----
 drivers/misc/habanalabs/memory.c     |  4 ++--
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 9712122d6cb1..3c22fb96a26f 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2463,7 +2463,8 @@ int goya_mmu_init(struct hl_device *hdev)
 	WREG32_AND(mmSTLB_STLB_FEATURE_EN,
 			(~STLB_STLB_FEATURE_EN_FOLLOWER_EN_MASK));
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true);
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
+					VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK);
 
 	WREG32(mmMMU_MMU_ENABLE, 1);
 	WREG32(mmMMU_SPI_MASK, 0xF);
@@ -4845,7 +4846,8 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
 		goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
 }
 
-static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard)
+static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+					u32 flags)
 {
 	struct goya_device *goya = hdev->asic_specific;
 	u32 status, timeout_usec;
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 78aef59e690b..36d05c32f7ec 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -114,8 +114,8 @@ struct hw_queue_properties {
  * @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address.
  */
 enum vm_type_t {
-	VM_TYPE_USERPTR,
-	VM_TYPE_PHYS_PACK
+	VM_TYPE_USERPTR = 0x1,
+	VM_TYPE_PHYS_PACK = 0x2
 };
 
 /**
@@ -483,8 +483,8 @@ enum hl_pll_frequency {
  * @get_events_stat: retrieve event queue entries histogram.
  * @read_pte: read MMU page table entry from DRAM.
  * @write_pte: write MMU page table entry to DRAM.
- * @mmu_invalidate_cache: flush MMU STLB cache, either with soft (L1 only) or
- *                        hard (L0 & L1) flush.
+ * @mmu_invalidate_cache: flush MMU STLB host/DRAM cache, either with soft
+ *                        (L1 only) or hard (L0 & L1) flush.
  * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
  *                              ASID-VA-size mask.
  * @send_heartbeat: send is-alive packet to ArmCP and verify response.
@@ -565,7 +565,8 @@ struct hl_asic_funcs {
 				u32 *size);
 	u64 (*read_pte)(struct hl_device *hdev, u64 addr);
 	void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
-	void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard);
+	void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
+					u32 flags);
 	void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
 			u32 asid, u64 va, u64 size);
 	int (*send_heartbeat)(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index 8ade9886a5a7..12db6609da27 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -944,7 +944,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		goto map_err;
 	}
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, false);
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
 
 	mutex_unlock(&ctx->mmu_lock);
 
@@ -1060,7 +1060,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 
 	unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true);
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, *vm_type);
 
 	mutex_unlock(&ctx->mmu_lock);
 

From 30919edef243e9dc91a3c65e5b1059d481e597e9 Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Thu, 14 Nov 2019 18:23:54 +0000
Subject: [PATCH 18/31] habanalabs: re-factor MMU masks and documentation

Some cosmetics around the MMU code to make it more self-explanatory.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/debugfs.c             |  6 ++--
 .../include/hw_ip/mmu/mmu_general.h           |  6 ++--
 drivers/misc/habanalabs/mmu.c                 | 36 +++++++++----------
 3 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index 87f37ac31ccd..1e1fa619a225 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -345,7 +345,7 @@ static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
 static inline u64 get_next_hop_addr(u64 curr_pte)
 {
 	if (curr_pte & PAGE_PRESENT_MASK)
-		return curr_pte & PHYS_ADDR_MASK;
+		return curr_pte & HOP_PHYS_ADDR_MASK;
 	else
 		return ULLONG_MAX;
 }
@@ -535,7 +535,7 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 {
 	struct hl_ctx *ctx = hdev->compute_ctx;
 	u64 hop_addr, hop_pte_addr, hop_pte;
-	u64 offset_mask = HOP4_MASK | OFFSET_MASK;
+	u64 offset_mask = HOP4_MASK | FLAGS_MASK;
 	int rc = 0;
 
 	if (!ctx) {
@@ -579,7 +579,7 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 		hop_pte_addr = get_hop4_pte_addr(ctx, hop_addr, virt_addr);
 		hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
-		offset_mask = OFFSET_MASK;
+		offset_mask = FLAGS_MASK;
 	}
 
 	if (!(hop_pte & PAGE_PRESENT_MASK))
diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
index 71ea3c3e8ba3..74a5502b8c4e 100644
--- a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
+++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
@@ -17,13 +17,12 @@
 #define PAGE_PRESENT_MASK		0x0000000000001ull
 #define SWAP_OUT_MASK			0x0000000000004ull
 #define LAST_MASK			0x0000000000800ull
-#define PHYS_ADDR_MASK			0xFFFFFFFFFFFFF000ull
 #define HOP0_MASK			0x3000000000000ull
 #define HOP1_MASK			0x0FF8000000000ull
 #define HOP2_MASK			0x0007FC0000000ull
 #define HOP3_MASK			0x000003FE00000ull
 #define HOP4_MASK			0x00000001FF000ull
-#define OFFSET_MASK			0x0000000000FFFull
+#define FLAGS_MASK			0x0000000000FFFull
 
 #define HOP0_SHIFT			48
 #define HOP1_SHIFT			39
@@ -31,8 +30,7 @@
 #define HOP3_SHIFT			21
 #define HOP4_SHIFT			12
 
-#define PTE_PHYS_ADDR_SHIFT		12
-#define PTE_PHYS_ADDR_MASK		~OFFSET_MASK
+#define HOP_PHYS_ADDR_MASK		(~FLAGS_MASK)
 
 #define HL_PTE_SIZE			sizeof(u64)
 #define HOP_TABLE_SIZE			PAGE_SIZE_4KB
diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c
index 176c315836f1..21b4e3281b3e 100644
--- a/drivers/misc/habanalabs/mmu.c
+++ b/drivers/misc/habanalabs/mmu.c
@@ -105,8 +105,8 @@ static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
 	 * clear the 12 LSBs and translate the shadow hop to its associated
 	 * physical hop, and add back the original 12 LSBs.
 	 */
-	u64 phys_val = get_phys_addr(ctx, val & PTE_PHYS_ADDR_MASK) |
-				(val & OFFSET_MASK);
+	u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
+				(val & FLAGS_MASK);
 
 	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
 					get_phys_addr(ctx, shadow_pte_addr),
@@ -199,7 +199,7 @@ static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
 static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
 {
 	if (curr_pte & PAGE_PRESENT_MASK)
-		return curr_pte & PHYS_ADDR_MASK;
+		return curr_pte & HOP_PHYS_ADDR_MASK;
 	else
 		return ULLONG_MAX;
 }
@@ -288,23 +288,23 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 	}
 
 	/* need only pte 0 in hops 0 and 1 */
-	pte_val = (hop1_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+	pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
 	write_pte(ctx, hop0_addr, pte_val);
 
-	pte_val = (hop2_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+	pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
 	write_pte(ctx, hop1_addr, pte_val);
 	get_pte(ctx, hop1_addr);
 
 	hop2_pte_addr = hop2_addr;
 	for (i = 0 ; i < num_of_hop3 ; i++) {
-		pte_val = (ctx->dram_default_hops[i] & PTE_PHYS_ADDR_MASK) |
+		pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
 				PAGE_PRESENT_MASK;
 		write_pte(ctx, hop2_pte_addr, pte_val);
 		get_pte(ctx, hop2_addr);
 		hop2_pte_addr += HL_PTE_SIZE;
 	}
 
-	pte_val = (prop->mmu_dram_default_page_addr & PTE_PHYS_ADDR_MASK) |
+	pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) |
 			LAST_MASK | PAGE_PRESENT_MASK;
 
 	for (i = 0 ; i < num_of_hop3 ; i++) {
@@ -400,8 +400,6 @@ int hl_mmu_init(struct hl_device *hdev)
 	if (!hdev->mmu_enable)
 		return 0;
 
-	/* MMU H/W init was already done in device hw_init() */
-
 	hdev->mmu_pgt_pool =
 			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
 
@@ -427,6 +425,8 @@ int hl_mmu_init(struct hl_device *hdev)
 		goto err_pool_add;
 	}
 
+	/* MMU H/W init will be done in device hw_init() */
+
 	return 0;
 
 err_pool_add:
@@ -450,10 +450,10 @@ void hl_mmu_fini(struct hl_device *hdev)
 	if (!hdev->mmu_enable)
 		return;
 
+	/* MMU H/W fini was already done in device hw_fini() */
+
 	kvfree(hdev->mmu_shadow_hop0);
 	gen_pool_destroy(hdev->mmu_pgt_pool);
-
-	/* MMU H/W fini will be done in device hw_fini() */
 }
 
 /**
@@ -584,7 +584,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
 
 	if (hdev->dram_default_page_mapping && is_dram_addr) {
 		u64 default_pte = (prop->mmu_dram_default_page_addr &
-				PTE_PHYS_ADDR_MASK) | LAST_MASK |
+				HOP_PHYS_ADDR_MASK) | LAST_MASK |
 					PAGE_PRESENT_MASK;
 		if (curr_pte == default_pte) {
 			dev_err(hdev->dev,
@@ -773,7 +773,7 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 
 	if (hdev->dram_default_page_mapping && is_dram_addr) {
 		u64 default_pte = (prop->mmu_dram_default_page_addr &
-					PTE_PHYS_ADDR_MASK) | LAST_MASK |
+					HOP_PHYS_ADDR_MASK) | LAST_MASK |
 						PAGE_PRESENT_MASK;
 
 		if (curr_pte != default_pte) {
@@ -813,7 +813,7 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 		goto err;
 	}
 
-	curr_pte = (phys_addr & PTE_PHYS_ADDR_MASK) | LAST_MASK
+	curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
 			| PAGE_PRESENT_MASK;
 
 	if (is_huge)
@@ -823,25 +823,25 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 
 	if (hop1_new) {
 		curr_pte =
-			(hop1_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+			(hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
 		write_pte(ctx, hop0_pte_addr, curr_pte);
 	}
 	if (hop2_new) {
 		curr_pte =
-			(hop2_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+			(hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
 		write_pte(ctx, hop1_pte_addr, curr_pte);
 		get_pte(ctx, hop1_addr);
 	}
 	if (hop3_new) {
 		curr_pte =
-			(hop3_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+			(hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
 		write_pte(ctx, hop2_pte_addr, curr_pte);
 		get_pte(ctx, hop2_addr);
 	}
 
 	if (!is_huge) {
 		if (hop4_new) {
-			curr_pte = (hop4_addr & PTE_PHYS_ADDR_MASK) |
+			curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
 					PAGE_PRESENT_MASK;
 			write_pte(ctx, hop3_pte_addr, curr_pte);
 			get_pte(ctx, hop3_addr);

From 54bb67444ea3f388756c5955db52ef62eb4ba3b9 Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Thu, 14 Nov 2019 18:23:55 +0000
Subject: [PATCH 19/31] habanalabs: split MMU properties to PCI/DRAM

Split the properties used for MMU mappings to DRAM and PCI (host) types.
This is a prerequisite for future ASICs support.
Note that in Goya ASIC, the PMMU and DMMU are the same (except of page
sizes) as only one MMU mechanism is used for both of the mapping types.
Hence this patch should not have any effect on current behavior.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/debugfs.c             |  90 +++++++----
 drivers/misc/habanalabs/goya/goya.c           |  17 ++
 drivers/misc/habanalabs/habanalabs.h          | 114 ++++++++-----
 .../include/hw_ip/mmu/mmu_general.h           |   1 -
 drivers/misc/habanalabs/memory.c              |  45 +++---
 drivers/misc/habanalabs/mmu.c                 | 151 +++++++++++-------
 6 files changed, 269 insertions(+), 149 deletions(-)

diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index 1e1fa619a225..1cf75010a379 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -307,39 +307,51 @@ static inline u64 get_hop0_addr(struct hl_ctx *ctx)
 			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
 }
 
-static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
+					u64 virt_addr, u64 mask, u64 shift)
 {
 	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP0_MASK) >> HOP0_SHIFT);
+			((virt_addr & mask) >> shift);
 }
 
-static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP1_MASK) >> HOP1_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop0_mask,
+					mmu_specs->hop0_shift);
 }
 
-static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP2_MASK) >> HOP2_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop1_mask,
+					mmu_specs->hop1_shift);
 }
 
-static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP3_MASK) >> HOP3_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop2_mask,
+					mmu_specs->hop2_shift);
 }
 
-static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP4_MASK) >> HOP4_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop3_mask,
+					mmu_specs->hop3_shift);
+}
+
+static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop4_mask,
+					mmu_specs->hop4_shift);
 }
 
 static inline u64 get_next_hop_addr(u64 curr_pte)
@@ -355,7 +367,10 @@ static int mmu_show(struct seq_file *s, void *data)
 	struct hl_debugfs_entry *entry = s->private;
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	struct hl_ctx *ctx;
+	bool is_dram_addr;
 
 	u64 hop0_addr = 0, hop0_pte_addr = 0, hop0_pte = 0,
 		hop1_addr = 0, hop1_pte_addr = 0, hop1_pte = 0,
@@ -377,33 +392,39 @@ static int mmu_show(struct seq_file *s, void *data)
 		return 0;
 	}
 
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+				prop->va_space_dram_start_address,
+				prop->va_space_dram_end_address);
+
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
 	mutex_lock(&ctx->mmu_lock);
 
 	/* the following lookup is copied from unmap() in mmu.c */
 
 	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
+	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
 	hop0_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
 	hop1_addr = get_next_hop_addr(hop0_pte);
 
 	if (hop1_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
+	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
 	hop1_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
 	hop2_addr = get_next_hop_addr(hop1_pte);
 
 	if (hop2_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
+	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
 	hop2_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
 	hop3_addr = get_next_hop_addr(hop2_pte);
 
 	if (hop3_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
+	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
 	hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
 
 	if (!(hop3_pte & LAST_MASK)) {
@@ -412,7 +433,8 @@ static int mmu_show(struct seq_file *s, void *data)
 		if (hop4_addr == ULLONG_MAX)
 			goto not_mapped;
 
-		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
+		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
+							virt_addr);
 		hop4_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
 		if (!(hop4_pte & PAGE_PRESENT_MASK))
 			goto not_mapped;
@@ -534,41 +556,50 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 				u64 *phys_addr)
 {
 	struct hl_ctx *ctx = hdev->compute_ctx;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	u64 hop_addr, hop_pte_addr, hop_pte;
 	u64 offset_mask = HOP4_MASK | FLAGS_MASK;
 	int rc = 0;
+	bool is_dram_addr;
 
 	if (!ctx) {
 		dev_err(hdev->dev, "no ctx available\n");
 		return -EINVAL;
 	}
 
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+				prop->va_space_dram_start_address,
+				prop->va_space_dram_end_address);
+
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
 	mutex_lock(&ctx->mmu_lock);
 
 	/* hop 0 */
 	hop_addr = get_hop0_addr(ctx);
-	hop_pte_addr = get_hop0_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 1 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop1_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 2 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop2_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 3 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop3_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	if (!(hop_pte & LAST_MASK)) {
@@ -576,7 +607,8 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 		hop_addr = get_next_hop_addr(hop_pte);
 		if (hop_addr == ULLONG_MAX)
 			goto not_mapped;
-		hop_pte_addr = get_hop4_pte_addr(ctx, hop_addr, virt_addr);
+		hop_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop_addr,
+							virt_addr);
 		hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 		offset_mask = FLAGS_MASK;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 3c22fb96a26f..3294a6a92f75 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -380,6 +380,23 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
 	prop->dram_page_size = PAGE_SIZE_2MB;
 
+	prop->dmmu.hop0_shift = HOP0_SHIFT;
+	prop->dmmu.hop1_shift = HOP1_SHIFT;
+	prop->dmmu.hop2_shift = HOP2_SHIFT;
+	prop->dmmu.hop3_shift = HOP3_SHIFT;
+	prop->dmmu.hop4_shift = HOP4_SHIFT;
+	prop->dmmu.hop0_mask = HOP0_MASK;
+	prop->dmmu.hop1_mask = HOP1_MASK;
+	prop->dmmu.hop2_mask = HOP2_MASK;
+	prop->dmmu.hop3_mask = HOP3_MASK;
+	prop->dmmu.hop4_mask = HOP4_MASK;
+	prop->dmmu.huge_page_size = PAGE_SIZE_2MB;
+
+	/* No difference between PMMU and DMMU except of page size */
+	memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
+	prop->dmmu.page_size = PAGE_SIZE_2MB;
+	prop->pmmu.page_size = PAGE_SIZE_4KB;
+
 	prop->va_space_host_start_address = VA_HOST_SPACE_START;
 	prop->va_space_host_end_address = VA_HOST_SPACE_END;
 	prop->va_space_dram_start_address = VA_DDR_SPACE_START;
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 36d05c32f7ec..00c949f4ccd1 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -130,6 +130,36 @@ enum hl_device_hw_state {
 	HL_DEVICE_HW_STATE_DIRTY
 };
 
+/**
+ * struct hl_mmu_properties - ASIC specific MMU address translation properties.
+ * @hop0_shift: shift of hop 0 mask.
+ * @hop1_shift: shift of hop 1 mask.
+ * @hop2_shift: shift of hop 2 mask.
+ * @hop3_shift: shift of hop 3 mask.
+ * @hop4_shift: shift of hop 4 mask.
+ * @hop0_mask: mask to get the PTE address in hop 0.
+ * @hop1_mask: mask to get the PTE address in hop 1.
+ * @hop2_mask: mask to get the PTE address in hop 2.
+ * @hop3_mask: mask to get the PTE address in hop 3.
+ * @hop4_mask: mask to get the PTE address in hop 4.
+ * @page_size: default page size used to allocate memory.
+ * @huge_page_size: page size used to allocate memory with huge pages.
+ */
+struct hl_mmu_properties {
+	u64	hop0_shift;
+	u64	hop1_shift;
+	u64	hop2_shift;
+	u64	hop3_shift;
+	u64	hop4_shift;
+	u64	hop0_mask;
+	u64	hop1_mask;
+	u64	hop2_mask;
+	u64	hop3_mask;
+	u64	hop4_mask;
+	u32	page_size;
+	u32	huge_page_size;
+};
+
 /**
  * struct asic_fixed_properties - ASIC specific immutable properties.
  * @hw_queues_props: H/W queues properties.
@@ -137,6 +167,8 @@ enum hl_device_hw_state {
  *		available sensors.
  * @uboot_ver: F/W U-boot version.
  * @preboot_ver: F/W Preboot version.
+ * @dmmu: DRAM MMU address translation properties.
+ * @pmmu: PCI (host) MMU address translation properties.
  * @sram_base_address: SRAM physical start address.
  * @sram_end_address: SRAM physical end address.
  * @sram_user_base_address - SRAM physical start address for user access.
@@ -173,53 +205,55 @@ enum hl_device_hw_state {
  * @psoc_pci_pll_nf: PCI PLL NF value.
  * @psoc_pci_pll_od: PCI PLL OD value.
  * @psoc_pci_pll_div_factor: PCI PLL DIV FACTOR 1 value.
- * @completion_queues_count: number of completion queues.
  * @high_pll: high PLL frequency used by the device.
  * @cb_pool_cb_cnt: number of CBs in the CB pool.
  * @cb_pool_cb_size: size of each CB in the CB pool.
  * @tpc_enabled_mask: which TPCs are enabled.
+ * @completion_queues_count: number of completion queues.
  */
 struct asic_fixed_properties {
 	struct hw_queue_properties	hw_queues_props[HL_MAX_QUEUES];
-	struct armcp_info	armcp_info;
-	char			uboot_ver[VERSION_MAX_LEN];
-	char			preboot_ver[VERSION_MAX_LEN];
-	u64			sram_base_address;
-	u64			sram_end_address;
-	u64			sram_user_base_address;
-	u64			dram_base_address;
-	u64			dram_end_address;
-	u64			dram_user_base_address;
-	u64			dram_size;
-	u64			dram_pci_bar_size;
-	u64			max_power_default;
-	u64			va_space_host_start_address;
-	u64			va_space_host_end_address;
-	u64			va_space_dram_start_address;
-	u64			va_space_dram_end_address;
-	u64			dram_size_for_default_page_mapping;
-	u64			pcie_dbi_base_address;
-	u64			pcie_aux_dbi_reg_addr;
-	u64			mmu_pgt_addr;
-	u64			mmu_dram_default_page_addr;
-	u32			mmu_pgt_size;
-	u32			mmu_pte_size;
-	u32			mmu_hop_table_size;
-	u32			mmu_hop0_tables_total_size;
-	u32			dram_page_size;
-	u32			cfg_size;
-	u32			sram_size;
-	u32			max_asid;
-	u32			num_of_events;
-	u32			psoc_pci_pll_nr;
-	u32			psoc_pci_pll_nf;
-	u32			psoc_pci_pll_od;
-	u32			psoc_pci_pll_div_factor;
-	u32			high_pll;
-	u32			cb_pool_cb_cnt;
-	u32			cb_pool_cb_size;
-	u8			completion_queues_count;
-	u8			tpc_enabled_mask;
+	struct armcp_info		armcp_info;
+	char				uboot_ver[VERSION_MAX_LEN];
+	char				preboot_ver[VERSION_MAX_LEN];
+	struct hl_mmu_properties	dmmu;
+	struct hl_mmu_properties	pmmu;
+	u64				sram_base_address;
+	u64				sram_end_address;
+	u64				sram_user_base_address;
+	u64				dram_base_address;
+	u64				dram_end_address;
+	u64				dram_user_base_address;
+	u64				dram_size;
+	u64				dram_pci_bar_size;
+	u64				max_power_default;
+	u64				va_space_host_start_address;
+	u64				va_space_host_end_address;
+	u64				va_space_dram_start_address;
+	u64				va_space_dram_end_address;
+	u64				dram_size_for_default_page_mapping;
+	u64				pcie_dbi_base_address;
+	u64				pcie_aux_dbi_reg_addr;
+	u64				mmu_pgt_addr;
+	u64				mmu_dram_default_page_addr;
+	u32				mmu_pgt_size;
+	u32				mmu_pte_size;
+	u32				mmu_hop_table_size;
+	u32				mmu_hop0_tables_total_size;
+	u32				dram_page_size;
+	u32				cfg_size;
+	u32				sram_size;
+	u32				max_asid;
+	u32				num_of_events;
+	u32				psoc_pci_pll_nr;
+	u32				psoc_pci_pll_nf;
+	u32				psoc_pci_pll_od;
+	u32				psoc_pci_pll_div_factor;
+	u32				high_pll;
+	u32				cb_pool_cb_cnt;
+	u32				cb_pool_cb_size;
+	u8				tpc_enabled_mask;
+	u8				completion_queues_count;
 };
 
 /**
diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
index 74a5502b8c4e..a6851a9d3f03 100644
--- a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
+++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
@@ -12,7 +12,6 @@
 #define PAGE_SHIFT_2MB			21
 #define PAGE_SIZE_2MB			(_AC(1, UL) << PAGE_SHIFT_2MB)
 #define PAGE_SIZE_4KB			(_AC(1, UL) << PAGE_SHIFT_4KB)
-#define PAGE_MASK_2MB			(~(PAGE_SIZE_2MB - 1))
 
 #define PAGE_PRESENT_MASK		0x0000000000001ull
 #define SWAP_OUT_MASK			0x0000000000004ull
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index 12db6609da27..cce6bdb6e655 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -13,7 +13,6 @@
 #include <linux/slab.h>
 #include <linux/genalloc.h>
 
-#define PGS_IN_2MB_PAGE	(PAGE_SIZE_2MB >> PAGE_SHIFT)
 #define HL_MMU_DEBUG	0
 
 /*
@@ -516,8 +515,8 @@ static inline int add_va_block(struct hl_device *hdev,
  * - Return the start address of the virtual block
  */
 static u64 get_va_block(struct hl_device *hdev,
-		struct hl_va_range *va_range, u64 size, u64 hint_addr,
-		bool is_userptr)
+			struct hl_va_range *va_range, u64 size, u64 hint_addr,
+			bool is_userptr)
 {
 	struct hl_vm_va_block *va_block, *new_va_block = NULL;
 	u64 valid_start, valid_size, prev_start, prev_end, page_mask,
@@ -525,18 +524,17 @@ static u64 get_va_block(struct hl_device *hdev,
 	u32 page_size;
 	bool add_prev = false;
 
-	if (is_userptr) {
+	if (is_userptr)
 		/*
 		 * We cannot know if the user allocated memory with huge pages
 		 * or not, hence we continue with the biggest possible
 		 * granularity.
 		 */
-		page_size = PAGE_SIZE_2MB;
-		page_mask = PAGE_MASK_2MB;
-	} else {
-		page_size = hdev->asic_prop.dram_page_size;
-		page_mask = ~((u64)page_size - 1);
-	}
+		page_size = hdev->asic_prop.pmmu.huge_page_size;
+	else
+		page_size = hdev->asic_prop.dmmu.page_size;
+
+	page_mask = ~((u64)page_size - 1);
 
 	mutex_lock(&va_range->lock);
 
@@ -558,7 +556,6 @@ static u64 get_va_block(struct hl_device *hdev,
 
 		if (valid_size >= size &&
 			(!new_va_block || valid_size < res_valid_size)) {
-
 			new_va_block = va_block;
 			res_valid_start = valid_start;
 			res_valid_size = valid_size;
@@ -629,7 +626,7 @@ static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
 /*
  * init_phys_pg_pack_from_userptr - initialize physical page pack from host
  *                                  memory
- * @asid: current context ASID
+ * @ctx: current context
  * @userptr: userptr to initialize from
  * @pphys_pg_pack: result pointer
  *
@@ -638,16 +635,20 @@ static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
  * - Create a physical page pack from the physical pages related to the given
  *   virtual block
  */
-static int init_phys_pg_pack_from_userptr(u32 asid, struct hl_userptr *userptr,
+static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
+				struct hl_userptr *userptr,
 				struct hl_vm_phys_pg_pack **pphys_pg_pack)
 {
+	struct hl_mmu_properties *mmu_prop = &ctx->hdev->asic_prop.pmmu;
 	struct hl_vm_phys_pg_pack *phys_pg_pack;
 	struct scatterlist *sg;
 	dma_addr_t dma_addr;
 	u64 page_mask, total_npages;
-	u32 npages, page_size = PAGE_SIZE;
+	u32 npages, page_size = PAGE_SIZE,
+		huge_page_size = mmu_prop->huge_page_size;
 	bool first = true, is_huge_page_opt = true;
 	int rc, i, j;
+	u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
 
 	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
 	if (!phys_pg_pack)
@@ -655,7 +656,7 @@ static int init_phys_pg_pack_from_userptr(u32 asid, struct hl_userptr *userptr,
 
 	phys_pg_pack->vm_type = userptr->vm_type;
 	phys_pg_pack->created_from_userptr = true;
-	phys_pg_pack->asid = asid;
+	phys_pg_pack->asid = ctx->asid;
 	atomic_set(&phys_pg_pack->mapping_cnt, 1);
 
 	/* Only if all dma_addrs are aligned to 2MB and their
@@ -670,14 +671,14 @@ static int init_phys_pg_pack_from_userptr(u32 asid, struct hl_userptr *userptr,
 
 		total_npages += npages;
 
-		if ((npages % PGS_IN_2MB_PAGE) ||
-					(dma_addr & (PAGE_SIZE_2MB - 1)))
+		if ((npages % pgs_in_huge_page) ||
+					(dma_addr & (huge_page_size - 1)))
 			is_huge_page_opt = false;
 	}
 
 	if (is_huge_page_opt) {
-		page_size = PAGE_SIZE_2MB;
-		total_npages /= PGS_IN_2MB_PAGE;
+		page_size = huge_page_size;
+		do_div(total_npages, pgs_in_huge_page);
 	}
 
 	page_mask = ~(((u64) page_size) - 1);
@@ -709,7 +710,7 @@ static int init_phys_pg_pack_from_userptr(u32 asid, struct hl_userptr *userptr,
 			dma_addr += page_size;
 
 			if (is_huge_page_opt)
-				npages -= PGS_IN_2MB_PAGE;
+				npages -= pgs_in_huge_page;
 			else
 				npages--;
 		}
@@ -872,7 +873,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 			return rc;
 		}
 
-		rc = init_phys_pg_pack_from_userptr(ctx->asid, userptr,
+		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
 				&phys_pg_pack);
 		if (rc) {
 			dev_err(hdev->dev,
@@ -1029,7 +1030,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 	if (*vm_type == VM_TYPE_USERPTR) {
 		is_userptr = true;
 		userptr = hnode->ptr;
-		rc = init_phys_pg_pack_from_userptr(ctx->asid, userptr,
+		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
 							&phys_pg_pack);
 		if (rc) {
 			dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c
index 21b4e3281b3e..3a7f8ff19eb2 100644
--- a/drivers/misc/habanalabs/mmu.c
+++ b/drivers/misc/habanalabs/mmu.c
@@ -171,29 +171,44 @@ static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
 			((virt_addr & mask) >> shift);
 }
 
-static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
 {
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP0_MASK, HOP0_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
+					mmu_prop->hop0_shift);
 }
 
-static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
 {
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP1_MASK, HOP1_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
+					mmu_prop->hop1_shift);
 }
 
-static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
 {
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP2_MASK, HOP2_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
+					mmu_prop->hop2_shift);
 }
 
-static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
 {
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP3_MASK, HOP3_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
+					mmu_prop->hop3_shift);
 }
 
-static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
 {
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP4_MASK, HOP4_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
+					mmu_prop->hop4_shift);
 }
 
 static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
@@ -513,24 +528,23 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx)
 	mutex_destroy(&ctx->mmu_lock);
 }
 
-static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
+static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr)
 {
 	struct hl_device *hdev = ctx->hdev;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	u64 hop0_addr = 0, hop0_pte_addr = 0,
 		hop1_addr = 0, hop1_pte_addr = 0,
 		hop2_addr = 0, hop2_pte_addr = 0,
 		hop3_addr = 0, hop3_pte_addr = 0,
 		hop4_addr = 0, hop4_pte_addr = 0,
 		curr_pte;
-	bool is_dram_addr, is_huge, clear_hop3 = true;
+	bool is_huge, clear_hop3 = true;
 
-	is_dram_addr = hl_mem_area_inside_range(virt_addr, PAGE_SIZE_2MB,
-				prop->va_space_dram_start_address,
-				prop->va_space_dram_end_address);
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
 
 	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
+	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
 
 	curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
 
@@ -539,7 +553,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
 	if (hop1_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
+	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
 
 	curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
 
@@ -548,7 +562,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
 	if (hop2_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
+	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
 
 	curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
 
@@ -557,7 +571,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
 	if (hop3_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
+	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
 
 	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
 
@@ -575,7 +589,8 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
 		if (hop4_addr == ULLONG_MAX)
 			goto not_mapped;
 
-		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
+		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
+							virt_addr);
 
 		curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
 
@@ -667,25 +682,36 @@ not_mapped:
 int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size)
 {
 	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	u64 real_virt_addr;
 	u32 real_page_size, npages;
 	int i, rc;
+	bool is_dram_addr;
 
 	if (!hdev->mmu_enable)
 		return 0;
 
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+				prop->va_space_dram_start_address,
+				prop->va_space_dram_end_address);
+
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
 	/*
-	 * The H/W handles mapping of 4KB/2MB page. Hence if the host page size
-	 * is bigger, we break it to sub-pages and unmap them separately.
+	 * The H/W handles mapping of specific page sizes. Hence if the page
+	 * size is bigger, we break it to sub-pages and unmap them separately.
 	 */
-	if ((page_size % PAGE_SIZE_2MB) == 0) {
-		real_page_size = PAGE_SIZE_2MB;
-	} else if ((page_size % PAGE_SIZE_4KB) == 0) {
-		real_page_size = PAGE_SIZE_4KB;
+	if ((page_size % mmu_prop->huge_page_size) == 0) {
+		real_page_size = mmu_prop->huge_page_size;
+	} else if ((page_size % mmu_prop->page_size) == 0) {
+		real_page_size = mmu_prop->page_size;
 	} else {
 		dev_err(hdev->dev,
-			"page size of %u is not 4KB nor 2MB aligned, can't unmap\n",
-				page_size);
+			"page size of %u is not %uKB nor %uMB aligned, can't unmap\n",
+			page_size,
+			mmu_prop->page_size >> 10,
+			mmu_prop->huge_page_size >> 20);
 
 		return -EFAULT;
 	}
@@ -694,7 +720,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size)
 	real_virt_addr = virt_addr;
 
 	for (i = 0 ; i < npages ; i++) {
-		rc = _hl_mmu_unmap(ctx, real_virt_addr);
+		rc = _hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr);
 		if (rc)
 			return rc;
 
@@ -705,10 +731,11 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size)
 }
 
 static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
-		u32 page_size)
+			u32 page_size, bool is_dram_addr)
 {
 	struct hl_device *hdev = ctx->hdev;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	u64 hop0_addr = 0, hop0_pte_addr = 0,
 		hop1_addr = 0, hop1_pte_addr = 0,
 		hop2_addr = 0, hop2_pte_addr = 0,
@@ -716,21 +743,19 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 		hop4_addr = 0, hop4_pte_addr = 0,
 		curr_pte = 0;
 	bool hop1_new = false, hop2_new = false, hop3_new = false,
-		hop4_new = false, is_huge, is_dram_addr;
+		hop4_new = false, is_huge;
 	int rc = -ENOMEM;
 
-	/*
-	 * This mapping function can map a 4KB/2MB page. For 2MB page there are
-	 * only 3 hops rather than 4. Currently the DRAM allocation uses 2MB
-	 * pages only but user memory could have been allocated with one of the
-	 * two page sizes. Since this is a common code for all the three cases,
-	 * we need this hugs page check.
-	 */
-	is_huge = page_size == PAGE_SIZE_2MB;
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
 
-	is_dram_addr = hl_mem_area_inside_range(virt_addr, page_size,
-				prop->va_space_dram_start_address,
-				prop->va_space_dram_end_address);
+	/*
+	 * This mapping function can map a page or a huge page. For huge page
+	 * there are only 3 hops rather than 4. Currently the DRAM allocation
+	 * uses huge pages only but user memory could have been allocated with
+	 * one of the two page sizes. Since this is a common code for all the
+	 * three cases, we need this hugs page check.
+	 */
+	is_huge = page_size == mmu_prop->huge_page_size;
 
 	if (is_dram_addr && !is_huge) {
 		dev_err(hdev->dev, "DRAM mapping should use huge pages only\n");
@@ -738,28 +763,28 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 	}
 
 	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
+	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
 	curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
 
 	hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
 	if (hop1_addr == ULLONG_MAX)
 		goto err;
 
-	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
+	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
 	curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
 
 	hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
 	if (hop2_addr == ULLONG_MAX)
 		goto err;
 
-	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
+	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
 	curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
 
 	hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
 	if (hop3_addr == ULLONG_MAX)
 		goto err;
 
-	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
+	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
 	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
 
 	if (!is_huge) {
@@ -767,7 +792,8 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 		if (hop4_addr == ULLONG_MAX)
 			goto err;
 
-		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
+		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
+							virt_addr);
 		curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
 	}
 
@@ -890,25 +916,36 @@ err:
 int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
 {
 	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	u64 real_virt_addr, real_phys_addr;
 	u32 real_page_size, npages;
 	int i, rc, mapped_cnt = 0;
+	bool is_dram_addr;
 
 	if (!hdev->mmu_enable)
 		return 0;
 
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+				prop->va_space_dram_start_address,
+				prop->va_space_dram_end_address);
+
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
 	/*
-	 * The H/W handles mapping of 4KB/2MB page. Hence if the host page size
-	 * is bigger, we break it to sub-pages and map them separately.
+	 * The H/W handles mapping of specific page sizes. Hence if the page
+	 * size is bigger, we break it to sub-pages and map them separately.
 	 */
-	if ((page_size % PAGE_SIZE_2MB) == 0) {
-		real_page_size = PAGE_SIZE_2MB;
-	} else if ((page_size % PAGE_SIZE_4KB) == 0) {
-		real_page_size = PAGE_SIZE_4KB;
+	if ((page_size % mmu_prop->huge_page_size) == 0) {
+		real_page_size = mmu_prop->huge_page_size;
+	} else if ((page_size % mmu_prop->page_size) == 0) {
+		real_page_size = mmu_prop->page_size;
 	} else {
 		dev_err(hdev->dev,
-			"page size of %u is not 4KB nor 2MB aligned, can't map\n",
-				page_size);
+			"page size of %u is not %dKB nor %dMB aligned, can't unmap\n",
+			page_size,
+			mmu_prop->page_size >> 10,
+			mmu_prop->huge_page_size >> 20);
 
 		return -EFAULT;
 	}
@@ -923,7 +960,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
 
 	for (i = 0 ; i < npages ; i++) {
 		rc = _hl_mmu_map(ctx, real_virt_addr, real_phys_addr,
-				real_page_size);
+				real_page_size, is_dram_addr);
 		if (rc)
 			goto err;
 
@@ -937,7 +974,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
 err:
 	real_virt_addr = virt_addr;
 	for (i = 0 ; i < mapped_cnt ; i++) {
-		if (_hl_mmu_unmap(ctx, real_virt_addr))
+		if (_hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr))
 			dev_warn_ratelimited(hdev->dev,
 				"failed to unmap va: 0x%llx\n", real_virt_addr);
 

From bc75d799f9531fe112f7d78a5547ad549293fb7e Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Thu, 14 Nov 2019 18:23:56 +0000
Subject: [PATCH 20/31] habanalabs: prevent read/write from/to the device
 during hard reset

During hard reset we should not access the device except of necessary
reset operations because the device might be stuck or unresponsive.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 3294a6a92f75..2935e84fe7d8 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4870,7 +4870,8 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 	u32 status, timeout_usec;
 	int rc;
 
-	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
+		hdev->hard_reset_pending)
 		return;
 
 	/* no need in L1 only invalidation in Goya */
@@ -4909,7 +4910,8 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
 	u32 status, timeout_usec, inv_data, pi;
 	int rc;
 
-	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
+		hdev->hard_reset_pending)
 		return;
 
 	/* no need in L1 only invalidation in Goya */

From 1b98d8b23f29e5ff82fe84ca0c8acffaa971b9d6 Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Thu, 14 Nov 2019 18:23:57 +0000
Subject: [PATCH 21/31] habanalabs: optimize MMU unmap

Reduce context close time by skipping hash table lookup if possible in
order to avoid hard reset with open contexts.
Reset with open contexts can potentially lead to a kernel crash as the
generic pool of the MMU hops is destroyed while it is not empty because
some unmap operations are not done.
This commit affect mainly when running on simulator.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/mmu.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c
index 3a7f8ff19eb2..6262b26e2086 100644
--- a/drivers/misc/habanalabs/mmu.c
+++ b/drivers/misc/habanalabs/mmu.c
@@ -25,10 +25,9 @@ static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
 	return pgt_info;
 }
 
-static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
+static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
 {
 	struct hl_device *hdev = ctx->hdev;
-	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
 
 	gen_pool_free(hdev->mmu_pgt_pool, pgt_info->phys_addr,
 			hdev->asic_prop.mmu_hop_table_size);
@@ -37,6 +36,13 @@ static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
 	kfree(pgt_info);
 }
 
+static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
+
+	_free_hop(ctx, pgt_info);
+}
+
 static u64 alloc_hop(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
@@ -159,7 +165,7 @@ static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
 	 */
 	num_of_ptes_left = pgt_info->num_of_ptes;
 	if (!num_of_ptes_left)
-		free_hop(ctx, hop_addr);
+		_free_hop(ctx, pgt_info);
 
 	return num_of_ptes_left;
 }
@@ -516,13 +522,14 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx)
 	dram_default_mapping_fini(ctx);
 
 	if (!hash_empty(ctx->mmu_shadow_hash))
-		dev_err(hdev->dev, "ctx is freed while it has pgts in use\n");
+		dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
+			ctx->asid);
 
 	hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
 			pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
-		free_hop(ctx, pgt_info->shadow_addr);
+		_free_hop(ctx, pgt_info);
 	}
 
 	mutex_destroy(&ctx->mmu_lock);

From 71c5e55e7c077fa17c42fbda91a8d14322825c44 Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Thu, 14 Nov 2019 18:23:57 +0000
Subject: [PATCH 22/31] habanalabs: skip VA block list update in reset flow

Reduce context close time by skipping the VA block free list update in
order to avoid hard reset with open contexts.
Reset with open contexts can potentially lead to a kernel crash as the
generic pool of the MMU hops is destroyed while it is not empty because
some unmap operations are not done.
The commit affect mainly when running on simulator.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/memory.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index cce6bdb6e655..e6412e13145e 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -993,17 +993,19 @@ init_page_pack_err:
  *
  * @ctx                 : current context
  * @vaddr               : device virtual address to unmap
+ * @ctx_free            : true if in context free flow, false otherwise.
  *
  * This function does the following:
  * - Unmap the physical pages related to the given virtual address
  * - return the device virtual block to the virtual block list
  */
-static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
+static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
 {
 	struct hl_device *hdev = ctx->hdev;
 	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
 	struct hl_vm_hash_node *hnode = NULL;
 	struct hl_userptr *userptr = NULL;
+	struct hl_va_range *va_range;
 	enum vm_type_t *vm_type;
 	bool is_userptr;
 	int rc;
@@ -1029,6 +1031,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 
 	if (*vm_type == VM_TYPE_USERPTR) {
 		is_userptr = true;
+		va_range = &ctx->host_va_range;
 		userptr = hnode->ptr;
 		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
 							&phys_pg_pack);
@@ -1040,6 +1043,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 		}
 	} else if (*vm_type == VM_TYPE_PHYS_PACK) {
 		is_userptr = false;
+		va_range = &ctx->dram_va_range;
 		phys_pg_pack = hnode->ptr;
 	} else {
 		dev_warn(hdev->dev,
@@ -1065,12 +1069,18 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
 
 	mutex_unlock(&ctx->mmu_lock);
 
-	if (add_va_block(hdev,
-			is_userptr ? &ctx->host_va_range : &ctx->dram_va_range,
-			vaddr,
-			vaddr + phys_pg_pack->total_size - 1))
-		dev_warn(hdev->dev, "add va block failed for vaddr: 0x%llx\n",
-				vaddr);
+	/*
+	 * No point in maintaining the free VA block list if the context is
+	 * closing as the list will be freed anyway
+	 */
+	if (!ctx_free) {
+		rc = add_va_block(hdev, va_range, vaddr,
+					vaddr + phys_pg_pack->total_size - 1);
+		if (rc)
+			dev_warn(hdev->dev,
+					"add va block failed for vaddr: 0x%llx\n",
+					vaddr);
+	}
 
 	atomic_dec(&phys_pg_pack->mapping_cnt);
 	kfree(hnode);
@@ -1202,8 +1212,8 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 		break;
 
 	case HL_MEM_OP_UNMAP:
-		rc = unmap_device_va(ctx,
-				args->in.unmap.device_virt_addr);
+		rc = unmap_device_va(ctx, args->in.unmap.device_virt_addr,
+					false);
 		break;
 
 	default:
@@ -1650,7 +1660,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 		dev_dbg(hdev->dev,
 			"hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
 			hnode->vaddr, ctx->asid);
-		unmap_device_va(ctx, hnode->vaddr);
+		unmap_device_va(ctx, hnode->vaddr, true);
 	}
 
 	spin_lock(&vm->idr_lock);

From bea84c4d67e5efd1078ef234ef1304a4d1788008 Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Thu, 14 Nov 2019 18:23:58 +0000
Subject: [PATCH 23/31] habanalabs: invalidate MMU cache only once

Reduce context close time by performing MMU cache invalidation once at the
end of the unmap loop rather in each iteration, in order to avoid hard
reset with open contexts.
Reset with open contexts can potentially lead to a kernel crash as the
generic pool of the MMU hops is destroyed while it is not empty because
some unmap operations are not done.
The commit affect mainly when running on simulator.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/memory.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index e6412e13145e..47e38c6f2d64 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -1065,7 +1065,13 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
 
 	unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, *vm_type);
+	/*
+	 * During context free this function is called in a loop to clean all
+	 * the context mappings. Hence the cache invalidation can be called once
+	 * at the loop end rather than for each iteration
+	 */
+	if (!ctx_free)
+		hdev->asic_funcs->mmu_invalidate_cache(hdev, true, *vm_type);
 
 	mutex_unlock(&ctx->mmu_lock);
 
@@ -1663,6 +1669,10 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 		unmap_device_va(ctx, hnode->vaddr, true);
 	}
 
+	/* invalidate the cache once after the unmapping loop */
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_PHYS_PACK);
+
 	spin_lock(&vm->idr_lock);
 	idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
 		if (phys_pg_list->asid == ctx->asid) {

From e604f551cdce07e45b6ca34eab58648185b3fba0 Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Thu, 14 Nov 2019 18:23:59 +0000
Subject: [PATCH 24/31] habanalabs: remove unnecessary checks

Now that the VA block free list is not updated on context close in order
to optimize this flow, no need in the sanity checks of the list contents
as these will fail for sure.
In addition, remove the "context closing with VA in use" print during hard
reset as this situation is a side effect of the failure that caused the
hard reset.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/memory.c | 40 +++++++-------------------------
 1 file changed, 9 insertions(+), 31 deletions(-)

diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index 47e38c6f2d64..6c72cb4eff54 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -544,7 +544,6 @@ static u64 get_va_block(struct hl_device *hdev,
 		/* calc the first possible aligned addr */
 		valid_start = va_block->start;
 
-
 		if (valid_start & (page_size - 1)) {
 			valid_start &= page_mask;
 			valid_start += page_size;
@@ -1588,43 +1587,16 @@ int hl_vm_ctx_init(struct hl_ctx *ctx)
  * @hdev                : pointer to the habanalabs structure
  * va_range             : pointer to virtual addresses range
  *
- * This function initializes the following:
- * - Checks that the given range contains the whole initial range
+ * This function does the following:
  * - Frees the virtual addresses block list and its lock
  */
 static void hl_va_range_fini(struct hl_device *hdev,
 		struct hl_va_range *va_range)
 {
-	struct hl_vm_va_block *va_block;
-
-	if (list_empty(&va_range->list)) {
-		dev_warn(hdev->dev,
-				"va list should not be empty on cleanup!\n");
-		goto out;
-	}
-
-	if (!list_is_singular(&va_range->list)) {
-		dev_warn(hdev->dev,
-			"va list should not contain multiple blocks on cleanup!\n");
-		goto free_va_list;
-	}
-
-	va_block = list_first_entry(&va_range->list, typeof(*va_block), node);
-
-	if (va_block->start != va_range->start_addr ||
-		va_block->end != va_range->end_addr) {
-		dev_warn(hdev->dev,
-			"wrong va block on cleanup, from 0x%llx to 0x%llx\n",
-				va_block->start, va_block->end);
-		goto free_va_list;
-	}
-
-free_va_list:
 	mutex_lock(&va_range->lock);
 	clear_va_list_locked(hdev, &va_range->list);
 	mutex_unlock(&va_range->lock);
 
-out:
 	mutex_destroy(&va_range->lock);
 }
 
@@ -1659,8 +1631,14 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 
 	hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
 
-	if (!hash_empty(ctx->mem_hash))
-		dev_notice(hdev->dev, "ctx is freed while it has va in use\n");
+	/*
+	 * Clearly something went wrong on hard reset so no point in printing
+	 * another side effect error
+	 */
+	if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash))
+		dev_notice(hdev->dev,
+				"ctx %d is freed while it has va in use\n",
+				ctx->asid);
 
 	hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
 		dev_dbg(hdev->dev,

From 7fbdc12b91110d21e15b84d5acf8402ae608d8c1 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 16 Nov 2019 12:24:19 +0200
Subject: [PATCH 25/31] habanalabs: remove prints on successful device
 initialization

Successful device initialization is mentioned in kernel log with the
message "Successfully added device to habanalabs driver". There is no point
of spamming the log with additional messages about successful queue
testing, which are implied by the above mentioned message.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/firmware_if.c | 5 +----
 drivers/misc/habanalabs/goya/goya.c   | 3 ---
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c
index ea2ca67fbfbf..f5bd03171dac 100644
--- a/drivers/misc/habanalabs/firmware_if.c
+++ b/drivers/misc/habanalabs/firmware_if.c
@@ -143,10 +143,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
 			sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
 
 	if (!rc) {
-		if (result == ARMCP_PACKET_FENCE_VAL)
-			dev_info(hdev->dev,
-				"queue test on CPU queue succeeded\n");
-		else
+		if (result != ARMCP_PACKET_FENCE_VAL)
 			dev_err(hdev->dev,
 				"CPU queue test failed (0x%08lX)\n", result);
 	} else {
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 2935e84fe7d8..70bdaeffb6ce 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -3006,9 +3006,6 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
 			"H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n",
 			hw_queue_id, (unsigned long long) fence_dma_addr, tmp);
 		rc = -EIO;
-	} else {
-		dev_info(hdev->dev, "queue test on H/W queue %d succeeded\n",
-			hw_queue_id);
 	}
 
 free_pkt:

From da1342a0eec038dc466742e662218f6be349d1b7 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 16 Nov 2019 12:26:30 +0200
Subject: [PATCH 26/31] habanalabs: use defines for F/W files

Make the code more concise and maintainable by using defines for the F/W
files.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/goya/goya.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 70bdaeffb6ce..c8d16aa4382c 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -72,6 +72,9 @@
  *
  */
 
+#define GOYA_UBOOT_FW_FILE	"habanalabs/goya/goya-u-boot.bin"
+#define GOYA_LINUX_FW_FILE	"habanalabs/goya/goya-fit.itb"
+
 #define GOYA_MMU_REGS_NUM		63
 
 #define GOYA_DMA_POOL_BLK_SIZE		0x100		/* 256 bytes */
@@ -2163,13 +2166,11 @@ static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
  */
 static int goya_push_uboot_to_device(struct hl_device *hdev)
 {
-	char fw_name[200];
 	void __iomem *dst;
 
-	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin");
 	dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET;
 
-	return hl_fw_push_fw_to_device(hdev, fw_name, dst);
+	return hl_fw_push_fw_to_device(hdev, GOYA_UBOOT_FW_FILE, dst);
 }
 
 /*
@@ -2182,13 +2183,11 @@ static int goya_push_uboot_to_device(struct hl_device *hdev)
  */
 static int goya_push_linux_to_device(struct hl_device *hdev)
 {
-	char fw_name[200];
 	void __iomem *dst;
 
-	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb");
 	dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET;
 
-	return hl_fw_push_fw_to_device(hdev, fw_name, dst);
+	return hl_fw_push_fw_to_device(hdev, GOYA_LINUX_FW_FILE, dst);
 }
 
 static int goya_pldm_init_cpu(struct hl_device *hdev)

From e16ee4103770acf365372886eac7c750017c918e Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 16 Nov 2019 12:29:50 +0200
Subject: [PATCH 27/31] habanalabs: make code more concise

Instead of doing if inside if, just write them with && operator.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>
---
 drivers/misc/habanalabs/habanalabs_ioctl.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index 02d7491fa28f..5d9c269d99db 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -60,7 +60,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask;
 	hw_ip.sram_size = prop->sram_size - sram_kmd_size;
 	hw_ip.dram_size = prop->dram_size - dram_kmd_size;
-	if (hw_ip.dram_size > 0)
+	if (hw_ip.dram_size > PAGE_SIZE)
 		hw_ip.dram_enabled = 1;
 	hw_ip.num_of_events = prop->num_of_events;
 
@@ -184,17 +184,14 @@ static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args)
 		goto out;
 	}
 
-	if (output) {
-		if (copy_to_user((void __user *) (uintptr_t) args->output_ptr,
-					output,
-					args->output_size)) {
-			dev_err(hdev->dev,
-				"copy to user failed in debug ioctl\n");
-			rc = -EFAULT;
-			goto out;
-		}
+	if (output && copy_to_user((void __user *) (uintptr_t) args->output_ptr,
+					output, args->output_size)) {
+		dev_err(hdev->dev, "copy to user failed in debug ioctl\n");
+		rc = -EFAULT;
+		goto out;
 	}
 
+
 out:
 	kfree(params);
 	kfree(output);
@@ -434,9 +431,8 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
 
 	retcode = func(hpriv, kdata);
 
-	if (cmd & IOC_OUT)
-		if (copy_to_user((void __user *)arg, kdata, usize))
-			retcode = -EFAULT;
+	if ((cmd & IOC_OUT) && copy_to_user((void __user *)arg, kdata, usize))
+		retcode = -EFAULT;
 
 out_err:
 	if (retcode)

From 52c01b0137193ab0c9282ec8d09c6338446e6e9f Mon Sep 17 00:00:00 2001
From: Moti Haimovski <mhaimovski@habana.ai>
Date: Sun, 3 Nov 2019 16:26:44 +0200
Subject: [PATCH 28/31] habanalabs: expose reset counters via existing INFO
 IOCTL

Expose both soft and hard reset counts via INFO IOCTL.
This will allow system management applications to easily check
if the device has undergone reset.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/habanalabs_ioctl.c | 19 +++++++++++++++++++
 include/uapi/misc/habanalabs.h             |  9 +++++++++
 2 files changed, 28 insertions(+)

diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index 5d9c269d99db..6474b868ef27 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -242,6 +242,22 @@ static int get_clk_rate(struct hl_device *hdev, struct hl_info_args *args)
 		min((size_t) max_size, sizeof(clk_rate))) ? -EFAULT : 0;
 }
 
+static int get_reset_count(struct hl_device *hdev, struct hl_info_args *args)
+{
+	struct hl_info_reset_count reset_count = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	reset_count.hard_reset_cnt = hdev->hard_reset_cnt;
+	reset_count.soft_reset_cnt = hdev->soft_reset_cnt;
+
+	return copy_to_user(out, &reset_count,
+		min((size_t) max_size, sizeof(reset_count))) ? -EFAULT : 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -260,6 +276,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_DEVICE_STATUS:
 		return device_status_info(hdev, args);
 
+	case HL_INFO_RESET_COUNT:
+		return get_reset_count(hdev, args);
+
 	default:
 		break;
 	}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 716e70750f23..4faa2c9767e5 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -98,6 +98,9 @@ enum hl_device_status {
  * HL_INFO_CLK_RATE            - Retrieve the current and maximum clock rate
  *                               of the device in MHz. The maximum clock rate is
  *                               configurable via sysfs parameter
+ * HL_INFO_RESET_COUNT   - Retrieve the counts of the soft and hard reset
+ *                         operations performed on the device since the last
+ *                         time the driver was loaded.
  */
 #define HL_INFO_HW_IP_INFO		0
 #define HL_INFO_HW_EVENTS		1
@@ -107,6 +110,7 @@ enum hl_device_status {
 #define HL_INFO_DEVICE_UTILIZATION	6
 #define HL_INFO_HW_EVENTS_AGGREGATE	7
 #define HL_INFO_CLK_RATE		8
+#define HL_INFO_RESET_COUNT		9
 
 #define HL_INFO_VERSION_MAX_LEN	128
 #define HL_INFO_CARD_NAME_MAX_LEN	16
@@ -160,6 +164,11 @@ struct hl_info_clk_rate {
 	__u32 max_clk_rate_mhz;
 };
 
+struct hl_info_reset_count {
+	__u32 hard_reset_cnt;
+	__u32 soft_reset_cnt;
+};
+
 struct hl_info_args {
 	/* Location of relevant struct in userspace */
 	__u64 return_pointer;

From 1af69d30c41d0b0f15d8be80c100cefaa909816c Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sun, 17 Nov 2019 17:35:49 +0200
Subject: [PATCH 29/31] habanalabs: make the reset code more consistent

In the hl_device_reset we ask about the hard_reset argument when we want to
differentiate between soft and hard reset, except for three places where
we use "from_hard_reset_thread". Replace one of those locations with the
hard_reset argument as it is guaranteed that if we reached to that
line in the code during hard_reset, it is from a kernel thread.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
---
 drivers/misc/habanalabs/device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 2f5a4da707e7..80205d8584ce 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -891,7 +891,7 @@ again:
 	 * can't really exit until all its CSs are done, which is what we
 	 * do in cs rollback
 	 */
-	if (from_hard_reset_thread)
+	if (hard_reset)
 		device_kill_open_processes(hdev);
 
 	/* Release kernel context */

From 55f6d680970ea922d4ee23d5ac88d3a8046221fb Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sun, 17 Nov 2019 17:41:57 +0200
Subject: [PATCH 30/31] habanalabs: flush EQ workers in hard reset

During hard-reset, there can be multiple events received from the H/W. For
each event, the driver opens a worker thread to handle it. For some of the
events, the driver will read/write registers in the code that handles the
event.

In case of hard-reset, we must prevent reads/writes to the registers during
the reset operation because the device might get stuck if that happens.

Therefore, flush the EQ workers before resetting the device (in hard-reset
only). Additional events won't arrive as we synced and disabled the
interrupts.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
---
 drivers/misc/habanalabs/device.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 80205d8584ce..b155e9549076 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -887,13 +887,19 @@ again:
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
-	/* Kill processes here after CS rollback. This is because the process
-	 * can't really exit until all its CSs are done, which is what we
-	 * do in cs rollback
-	 */
-	if (hard_reset)
+	if (hard_reset) {
+		/* Kill processes here after CS rollback. This is because the
+		 * process can't really exit until all its CSs are done, which
+		 * is what we do in cs rollback
+		 */
 		device_kill_open_processes(hdev);
 
+		/* Flush the Event queue workers to make sure no other thread is
+		 * reading or writing to registers during the reset
+		 */
+		flush_workqueue(hdev->eq_wq);
+	}
+
 	/* Release kernel context */
 	if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
 		hdev->kernel_ctx = NULL;

From 5feccddcf9922ee3c25587d5e609bf58503ad93e Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Mon, 18 Nov 2019 09:41:08 +0200
Subject: [PATCH 31/31] habanalabs: add more protection of device during reset

Prevent accesses to the device (register read/write) from debugfs entries
during reset as that can cause the device to get stuck.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
---
 drivers/misc/habanalabs/debugfs.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index 1cf75010a379..20413e350343 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -528,6 +528,12 @@ static int engines_show(struct seq_file *s, void *data)
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev,
+				"Can't check device idle during reset\n");
+		return 0;
+	}
+
 	hdev->asic_funcs->is_device_idle(hdev, NULL, s);
 
 	return 0;
@@ -640,6 +646,11 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
 	u32 val;
 	ssize_t rc;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
+		return 0;
+	}
+
 	if (*ppos)
 		return 0;
 
@@ -669,6 +680,11 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 	u32 value;
 	ssize_t rc;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
+		return 0;
+	}
+
 	rc = kstrtouint_from_user(buf, count, 16, &value);
 	if (rc)
 		return rc;