Path: drivers/accel/habanalabs/common/memory.c
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2022 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include <uapi/drm/habanalabs_accel.h>
#include "habanalabs.h"
#include "../include/hw_ip/mmu/mmu_general.h"

#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/pci-p2pdma.h>

MODULE_IMPORT_NS("DMA_BUF");

#define HL_MMU_DEBUG	0

/* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
#define DRAM_POOL_PAGE_SIZE	SZ_8M

#define MEM_HANDLE_INVALID	ULONG_MAX

static int allocate_timestamps_buffers(struct hl_fpriv *hpriv,
			struct hl_mem_in *args, u64 *handle);

static int set_alloc_page_size(struct hl_device *hdev, struct hl_mem_in *args, u32 *page_size)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u64 psize;

	/*
	 * For ASICs that support setting the allocation page size by the user, honor the
	 * user's choice only if it is not 0 (as 0 means taking the default page size).
	 */
	if (prop->supports_user_set_page_size && args->alloc.page_size) {
		psize = args->alloc.page_size;

		if (!is_power_of_2(psize)) {
			dev_err(hdev->dev, "user page size (%#llx) is not power of 2\n", psize);
			return -EINVAL;
		}
	} else {
		psize = prop->device_mem_alloc_default_page_size;
	}

	*page_size = psize;

	return 0;
}

/*
 * The va ranges in the context object contain a list with the available chunks of
 * device virtual memory.
 * There is one range for host allocations and one for DRAM allocations.
 *
 * On initialization each range contains one chunk of all of its available
 * virtual range, which is half of the total device virtual range.
 *
 * On each mapping of physical pages, a suitable virtual range chunk (with a
 * minimum size) is selected from the list. If the chunk size equals the
 * requested size, the chunk is returned. Otherwise, the chunk is split into
 * two chunks - one to return as the result and a remainder to stay in the list.
 *
 * On each unmapping of a virtual address, the relevant virtual chunk is
 * returned to the list. If its edges match the edges of the adjacent chunks
 * (meaning a contiguous chunk can be created), the chunks are merged.
 *
 * On finish, the list is checked to have only one chunk of all the relevant
 * virtual range (which is half of the device's total virtual range).
 * If not (meaning not all mappings were unmapped), a warning is printed.
 */

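/*
 * Illustrative example (hypothetical numbers): if the list holds a single
 * chunk [0x1000000, 0x1ffffff] and a 0x200000-byte mapping is requested,
 * the chunk is split into the returned range [0x1000000, 0x11fffff] and a
 * remainder [0x1200000, 0x1ffffff] that stays in the list. When that mapping
 * is later released, [0x1000000, 0x11fffff] is re-inserted, and since its
 * end + 1 equals the start of the remaining chunk, the two are merged back
 * into one chunk covering the original range.
 */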
/*
 * alloc_device_memory() - allocate device memory.
 * @ctx: pointer to the context structure.
 * @args: host parameters containing the requested size.
 * @ret_handle: result handle.
 *
 * This function does the following:
 * - Allocate the requested size rounded up to 'dram_page_size' pages.
 * - Return unique handle for later map/unmap/free.
 */
static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
				u32 *ret_handle)
{
	struct hl_device *hdev = ctx->hdev;
	struct hl_vm *vm = &hdev->vm;
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	u64 paddr = 0, total_size, num_pgs, i;
	u32 num_curr_pgs, page_size;
	bool contiguous;
	int handle, rc;

	num_curr_pgs = 0;

	rc = set_alloc_page_size(hdev, args, &page_size);
	if (rc)
		return rc;

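	/*
	 * Round the requested size up to a whole number of allocation pages.
	 * For example (assuming an 8MB allocation page size), a 33MB request
	 * yields num_pgs = 5 and total_size = 40MB.
	 */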
	num_pgs = DIV_ROUND_UP_ULL(args->alloc.mem_size, page_size);
	total_size = num_pgs * page_size;

	if (!total_size) {
		dev_err(hdev->dev, "Cannot allocate 0 bytes\n");
		return -EINVAL;
	}

	contiguous = args->flags & HL_MEM_CONTIGUOUS;

	if (contiguous) {
		if (is_power_of_2(page_size))
			paddr = (uintptr_t) gen_pool_dma_alloc_align(vm->dram_pg_pool,
								total_size, NULL, page_size);
		else
			paddr = gen_pool_alloc(vm->dram_pg_pool, total_size);
		if (!paddr) {
			dev_err(hdev->dev,
				"Cannot allocate %llu contiguous pages with total size of %llu\n",
				num_pgs, total_size);
			return -ENOMEM;
		}
	}

	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
	if (!phys_pg_pack) {
		rc = -ENOMEM;
		goto pages_pack_err;
	}

	phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK;
	phys_pg_pack->asid = ctx->asid;
	phys_pg_pack->npages = num_pgs;
	phys_pg_pack->page_size = page_size;
	phys_pg_pack->total_size = total_size;
	phys_pg_pack->flags = args->flags;
	phys_pg_pack->contiguous = contiguous;

	phys_pg_pack->pages = kvmalloc_array(num_pgs, sizeof(u64), GFP_KERNEL);
	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
		rc = -ENOMEM;
		goto pages_arr_err;
	}

	if (phys_pg_pack->contiguous) {
		for (i = 0 ; i < num_pgs ; i++)
			phys_pg_pack->pages[i] = paddr + i * page_size;
	} else {
		for (i = 0 ; i < num_pgs ; i++) {
			if (is_power_of_2(page_size))
				phys_pg_pack->pages[i] =
					(uintptr_t)gen_pool_dma_alloc_align(vm->dram_pg_pool,
									page_size, NULL,
									page_size);
			else
				phys_pg_pack->pages[i] = gen_pool_alloc(vm->dram_pg_pool,
									page_size);

			if (!phys_pg_pack->pages[i]) {
				dev_err(hdev->dev,
					"Cannot allocate device memory (out of memory)\n");
				rc = -ENOMEM;
				goto page_err;
			}

			num_curr_pgs++;
		}
	}

	spin_lock(&vm->idr_lock);
	handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0,
				GFP_ATOMIC);
	spin_unlock(&vm->idr_lock);

	if (handle < 0) {
		dev_err(hdev->dev, "Failed to get handle for page\n");
		rc = -EFAULT;
		goto idr_err;
	}

	for (i = 0 ; i < num_pgs ; i++)
		kref_get(&vm->dram_pg_pool_refcount);

	phys_pg_pack->handle = handle;

	atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem);
	atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem);

	*ret_handle = handle;

	return 0;

idr_err:
page_err:
	if (!phys_pg_pack->contiguous)
		for (i = 0 ; i < num_curr_pgs ; i++)
			gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i],
					page_size);

	kvfree(phys_pg_pack->pages);
pages_arr_err:
	kfree(phys_pg_pack);
pages_pack_err:
	if (contiguous)
		gen_pool_free(vm->dram_pg_pool, paddr, total_size);

	return rc;
}

/**
 * dma_map_host_va() - DMA mapping of the given host virtual address.
 * @hdev: habanalabs device structure.
 * @addr: the host virtual address of the memory area.
 * @size: the size of the memory area.
 * @p_userptr: pointer to result userptr structure.
 *
 * This function does the following:
 * - Allocate userptr structure.
 * - Pin the given host memory using the userptr structure.
 * - Perform DMA mapping to have the DMA addresses of the pages.
 */
static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
				struct hl_userptr **p_userptr)
{
	struct hl_userptr *userptr;
	int rc;

	userptr = kzalloc(sizeof(*userptr), GFP_KERNEL);
	if (!userptr) {
		rc = -ENOMEM;
		goto userptr_err;
	}

	rc = hl_pin_host_memory(hdev, addr, size, userptr);
	if (rc)
		goto pin_err;

	userptr->dma_mapped = true;
	userptr->dir = DMA_BIDIRECTIONAL;
	userptr->vm_type = VM_TYPE_USERPTR;

	*p_userptr = userptr;

	rc = hl_dma_map_sgtable(hdev, userptr->sgt, DMA_BIDIRECTIONAL);
	if (rc) {
		dev_err(hdev->dev, "failed to map sgt with DMA region\n");
		goto dma_map_err;
	}

	return 0;

dma_map_err:
	hl_unpin_host_memory(hdev, userptr);
pin_err:
	kfree(userptr);
userptr_err:

	return rc;
}

/**
 * dma_unmap_host_va() - DMA unmapping of the given host virtual address.
 * @hdev: habanalabs device structure.
 * @userptr: userptr to free.
 *
 * This function does the following:
 * - Unpins the physical pages.
 * - Frees the userptr structure.
 */
static void dma_unmap_host_va(struct hl_device *hdev,
				struct hl_userptr *userptr)
{
	hl_unpin_host_memory(hdev, userptr);
	kfree(userptr);
}

/**
 * dram_pg_pool_do_release() - free DRAM pages pool
 * @ref: pointer to reference object.
 *
 * This function does the following:
 * - Frees the idr structure of physical pages handles.
 * - Frees the generic pool of DRAM physical pages.
 */
static void dram_pg_pool_do_release(struct kref *ref)
{
	struct hl_vm *vm = container_of(ref, struct hl_vm,
			dram_pg_pool_refcount);

	/*
	 * free the idr here as only here we know for sure that there are no
	 * allocated physical pages and hence there are no handles in use
	 */
	idr_destroy(&vm->phys_pg_pack_handles);
	gen_pool_destroy(vm->dram_pg_pool);
}

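/*
 * Note: every DRAM page handed out by alloc_device_memory() takes a reference
 * on dram_pg_pool_refcount, and free_phys_pg_pack() below drops one reference
 * per page. dram_pg_pool_do_release() therefore cannot run while any allocated
 * page is still outstanding, which is why it is safe to destroy both the
 * handles IDR and the gen_pool there.
 */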
/**
 * free_phys_pg_pack() - free physical page pack.
 * @hdev: habanalabs device structure.
 * @phys_pg_pack: physical page pack to free.
 *
 * This function does the following:
 * - For DRAM memory only
 *   - iterate over the pack, free each physical block structure by
 *     returning it to the general pool.
 * - Free the hl_vm_phys_pg_pack structure.
 */
static void free_phys_pg_pack(struct hl_device *hdev,
				struct hl_vm_phys_pg_pack *phys_pg_pack)
{
	struct hl_vm *vm = &hdev->vm;
	u64 i;

	if (phys_pg_pack->created_from_userptr)
		goto end;

	if (phys_pg_pack->contiguous) {
		gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0],
			phys_pg_pack->total_size);

		for (i = 0; i < phys_pg_pack->npages ; i++)
			kref_put(&vm->dram_pg_pool_refcount,
				dram_pg_pool_do_release);
	} else {
		for (i = 0 ; i < phys_pg_pack->npages ; i++) {
			gen_pool_free(vm->dram_pg_pool,
				phys_pg_pack->pages[i],
				phys_pg_pack->page_size);
			kref_put(&vm->dram_pg_pool_refcount,
				dram_pg_pool_do_release);
		}
	}

end:
	kvfree(phys_pg_pack->pages);
	kfree(phys_pg_pack);

	return;
}

/**
 * free_device_memory() - free device memory.
 * @ctx: pointer to the context structure.
 * @args: host parameters containing the requested size.
 *
 * This function does the following:
 * - Free the device memory related to the given handle.
 */
static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
{
	struct hl_device *hdev = ctx->hdev;
	struct hl_vm *vm = &hdev->vm;
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	u32 handle = args->free.handle;

	spin_lock(&vm->idr_lock);
	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
	if (!phys_pg_pack) {
		spin_unlock(&vm->idr_lock);
		dev_err(hdev->dev, "free device memory failed, no match for handle %u\n", handle);
		return -EINVAL;
	}

	if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) {
		spin_unlock(&vm->idr_lock);
		dev_err(hdev->dev, "handle %u is mapped, cannot free\n", handle);
		return -EINVAL;
	}

	/* must remove from idr before the freeing of the physical pages as the refcount of the
	 * pool is also the trigger of the idr destroy
	 */
	idr_remove(&vm->phys_pg_pack_handles, handle);
	spin_unlock(&vm->idr_lock);

	atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem);
	atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem);

	free_phys_pg_pack(hdev, phys_pg_pack);

	return 0;
}

/**
 * clear_va_list_locked() - free virtual addresses list.
 * @hdev: habanalabs device structure.
 * @va_list: list of virtual addresses to free.
 *
 * This function does the following:
 * - Iterate over the list and free each virtual addresses block.
 *
 * This function should be called only when va_list lock is taken.
 */
static void clear_va_list_locked(struct hl_device *hdev,
		struct list_head *va_list)
{
	struct hl_vm_va_block *va_block, *tmp;

	list_for_each_entry_safe(va_block, tmp, va_list, node) {
		list_del(&va_block->node);
		kfree(va_block);
	}
}

/**
 * print_va_list_locked() - print virtual addresses list.
 * @hdev: habanalabs device structure.
 * @va_list: list of virtual addresses to print.
 *
 * This function does the following:
 * - Iterate over the list and print each virtual addresses block.
 *
 * This function should be called only when va_list lock is taken.
 */
static void print_va_list_locked(struct hl_device *hdev,
		struct list_head *va_list)
{
#if HL_MMU_DEBUG
	struct hl_vm_va_block *va_block;

	dev_dbg(hdev->dev, "print va list:\n");

	list_for_each_entry(va_block, va_list, node)
		dev_dbg(hdev->dev,
			"va block, start: 0x%llx, end: 0x%llx, size: %llu\n",
			va_block->start, va_block->end, va_block->size);
#endif
}

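/*
 * The helpers below keep each va_list sorted by ascending start address:
 * add_va_block_locked() inserts a freed block after the last block that ends
 * before it, and merge_va_blocks_locked() coalesces it with its neighbors
 * when the ranges are back-to-back, so the list converges toward a single
 * large chunk as mappings are released.
 */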
/**
 * merge_va_blocks_locked() - merge a virtual block if possible.
 * @hdev: pointer to the habanalabs device structure.
 * @va_list: pointer to the virtual addresses block list.
 * @va_block: virtual block to merge with adjacent blocks.
 *
 * This function does the following:
 * - Merge the given block with the adjacent blocks if their virtual ranges
 *   create a contiguous virtual range.
 *
 * This function should be called only when va_list lock is taken.
 */
static void merge_va_blocks_locked(struct hl_device *hdev,
		struct list_head *va_list, struct hl_vm_va_block *va_block)
{
	struct hl_vm_va_block *prev, *next;

	prev = list_prev_entry(va_block, node);
	if (&prev->node != va_list && prev->end + 1 == va_block->start) {
		prev->end = va_block->end;
		prev->size = prev->end - prev->start + 1;
		list_del(&va_block->node);
		kfree(va_block);
		va_block = prev;
	}

	next = list_next_entry(va_block, node);
	if (&next->node != va_list && va_block->end + 1 == next->start) {
		next->start = va_block->start;
		next->size = next->end - next->start + 1;
		list_del(&va_block->node);
		kfree(va_block);
	}
}

/**
 * add_va_block_locked() - add a virtual block to the virtual addresses list.
 * @hdev: pointer to the habanalabs device structure.
 * @va_list: pointer to the virtual addresses block list.
 * @start: start virtual address.
 * @end: end virtual address.
 *
 * This function does the following:
 * - Add the given block to the virtual blocks list and merge with other blocks
 *   if a contiguous virtual block can be created.
 *
 * This function should be called only when va_list lock is taken.
 */
static int add_va_block_locked(struct hl_device *hdev,
		struct list_head *va_list, u64 start, u64 end)
{
	struct hl_vm_va_block *va_block, *res = NULL;
	u64 size = end - start + 1;

	print_va_list_locked(hdev, va_list);

	list_for_each_entry(va_block, va_list, node) {
		/* TODO: remove upon matureness */
		if (hl_mem_area_crosses_range(start, size, va_block->start,
				va_block->end)) {
			dev_err(hdev->dev,
				"block crossing ranges at start 0x%llx, end 0x%llx\n",
				va_block->start, va_block->end);
			return -EINVAL;
		}

		if (va_block->end < start)
			res = va_block;
	}

	va_block = kmalloc(sizeof(*va_block), GFP_KERNEL);
	if (!va_block)
		return -ENOMEM;

	va_block->start = start;
	va_block->end = end;
	va_block->size = size;

	if (!res)
		list_add(&va_block->node, va_list);
	else
		list_add(&va_block->node, &res->node);

	merge_va_blocks_locked(hdev, va_list, va_block);

	print_va_list_locked(hdev, va_list);

	return 0;
}

/**
 * add_va_block() - wrapper for add_va_block_locked.
 * @hdev: pointer to the habanalabs device structure.
 * @va_range: pointer to the virtual addresses range object.
 * @start: start virtual address.
 * @end: end virtual address.
 *
 * This function does the following:
 * - Takes the list lock and calls add_va_block_locked.
 */
static inline int add_va_block(struct hl_device *hdev,
		struct hl_va_range *va_range, u64 start, u64 end)
{
	int rc;

	mutex_lock(&va_range->lock);
	rc = add_va_block_locked(hdev, &va_range->list, start, end);
	mutex_unlock(&va_range->lock);

	return rc;
}

/**
 * is_hint_crossing_range() - check if a hint address crosses the specified reserved range.
 * @range_type: virtual space range type.
 * @start_addr: start virtual address.
 * @size: block size.
 * @prop: asic properties structure to retrieve reserved ranges from.
 */
static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
		u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
	bool range_cross;

	if (range_type == HL_VA_RANGE_TYPE_DRAM)
		range_cross =
			hl_mem_area_crosses_range(start_addr, size,
				prop->hints_dram_reserved_va_range.start_addr,
				prop->hints_dram_reserved_va_range.end_addr);
	else if (range_type == HL_VA_RANGE_TYPE_HOST)
		range_cross =
			hl_mem_area_crosses_range(start_addr, size,
				prop->hints_host_reserved_va_range.start_addr,
				prop->hints_host_reserved_va_range.end_addr);
	else
		range_cross =
			hl_mem_area_crosses_range(start_addr,
size,569prop->hints_host_hpage_reserved_va_range.start_addr,570prop->hints_host_hpage_reserved_va_range.end_addr);571572return range_cross;573}574575/**576* get_va_block() - get a virtual block for the given size and alignment.577*578* @hdev: pointer to the habanalabs device structure.579* @va_range: pointer to the virtual addresses range.580* @size: requested block size.581* @hint_addr: hint for requested address by the user.582* @va_block_align: required alignment of the virtual block start address.583* @range_type: va range type (host, dram)584* @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT585*586* This function does the following:587* - Iterate on the virtual block list to find a suitable virtual block for the588* given size, hint address and alignment.589* - Reserve the requested block and update the list.590* - Return the start address of the virtual block.591*/592static u64 get_va_block(struct hl_device *hdev,593struct hl_va_range *va_range,594u64 size, u64 hint_addr, u32 va_block_align,595enum hl_va_range_type range_type,596u32 flags)597{598struct hl_vm_va_block *va_block, *new_va_block = NULL;599struct asic_fixed_properties *prop = &hdev->asic_prop;600u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end,601align_mask, reserved_valid_start = 0, reserved_valid_size = 0,602dram_hint_mask = prop->dram_hints_align_mask;603bool add_prev = false;604bool is_align_pow_2 = is_power_of_2(va_range->page_size);605bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);606bool force_hint = flags & HL_MEM_FORCE_HINT;607int rc;608609if (is_align_pow_2)610align_mask = ~((u64)va_block_align - 1);611else612/*613* with non-power-of-2 range we work only with page granularity614* and the start address is page aligned,615* so no need for alignment checking.616*/617size = DIV_ROUND_UP_ULL(size, va_range->page_size) *618va_range->page_size;619620tmp_hint_addr = hint_addr & ~dram_hint_mask;621622/* Check if we need to ignore hint address */623if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) ||624(!is_align_pow_2 && is_hint_dram_addr &&625do_div(tmp_hint_addr, va_range->page_size))) {626627if (force_hint) {628/* Hint must be respected, so here we just fail */629dev_err(hdev->dev,630"Hint address 0x%llx is not page aligned - cannot be respected\n",631hint_addr);632return 0;633}634635dev_dbg(hdev->dev,636"Hint address 0x%llx will be ignored because it is not aligned\n",637hint_addr);638hint_addr = 0;639}640641mutex_lock(&va_range->lock);642643print_va_list_locked(hdev, &va_range->list);644645list_for_each_entry(va_block, &va_range->list, node) {646/* Calc the first possible aligned addr */647valid_start = va_block->start;648649if (is_align_pow_2 && (valid_start & (va_block_align - 1))) {650valid_start &= align_mask;651valid_start += va_block_align;652if (valid_start > va_block->end)653continue;654}655656valid_size = va_block->end - valid_start + 1;657if (valid_size < size)658continue;659660/*661* In case hint address is 0, and hints_range_reservation662* property enabled, then avoid allocating va blocks from the663* range reserved for hint addresses664*/665if (prop->hints_range_reservation && !hint_addr)666if (is_hint_crossing_range(range_type, valid_start,667size, prop))668continue;669670/* Pick the minimal length block which has the required size */671if (!new_va_block || (valid_size < reserved_valid_size)) {672new_va_block = va_block;673reserved_valid_start = valid_start;674reserved_valid_size = valid_size;675}676677if (hint_addr && hint_addr >= valid_start 
&&678(hint_addr + size) <= va_block->end) {679new_va_block = va_block;680reserved_valid_start = hint_addr;681reserved_valid_size = valid_size;682break;683}684}685686if (!new_va_block) {687dev_err(hdev->dev, "no available va block for size %llu\n",688size);689goto out;690}691692if (force_hint && reserved_valid_start != hint_addr) {693/* Hint address must be respected. If we are here - this means694* we could not respect it.695*/696dev_err(hdev->dev,697"Hint address 0x%llx could not be respected\n",698hint_addr);699reserved_valid_start = 0;700goto out;701}702703/*704* Check if there is some leftover range due to reserving the new705* va block, then return it to the main virtual addresses list.706*/707if (reserved_valid_start > new_va_block->start) {708prev_start = new_va_block->start;709prev_end = reserved_valid_start - 1;710711new_va_block->start = reserved_valid_start;712new_va_block->size = reserved_valid_size;713714add_prev = true;715}716717if (new_va_block->size > size) {718new_va_block->start += size;719new_va_block->size = new_va_block->end - new_va_block->start + 1;720} else {721list_del(&new_va_block->node);722kfree(new_va_block);723}724725if (add_prev) {726rc = add_va_block_locked(hdev, &va_range->list, prev_start, prev_end);727if (rc) {728reserved_valid_start = 0;729goto out;730}731}732733print_va_list_locked(hdev, &va_range->list);734out:735mutex_unlock(&va_range->lock);736737return reserved_valid_start;738}739740/*741* hl_reserve_va_block() - reserve a virtual block of a given size.742* @hdev: pointer to the habanalabs device structure.743* @ctx: current context744* @type: virtual addresses range type.745* @size: requested block size.746* @alignment: required alignment in bytes of the virtual block start address,747* 0 means no alignment.748*749* This function does the following:750* - Iterate on the virtual block list to find a suitable virtual block for the751* given size and alignment.752* - Reserve the requested block and update the list.753* - Return the start address of the virtual block.754*/755u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,756enum hl_va_range_type type, u64 size, u32 alignment)757{758return get_va_block(hdev, ctx->va_range[type], size, 0,759max(alignment, ctx->va_range[type]->page_size),760type, 0);761}762763/**764* hl_get_va_range_type() - get va_range type for the given address and size.765* @ctx: context to fetch va_range from.766* @address: the start address of the area we want to validate.767* @size: the size in bytes of the area we want to validate.768* @type: returned va_range type.769*770* Return: true if the area is inside a valid range, false otherwise.771*/772static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size,773enum hl_va_range_type *type)774{775int i;776777for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX; i++) {778if (hl_mem_area_inside_range(address, size,779ctx->va_range[i]->start_addr,780ctx->va_range[i]->end_addr)) {781*type = i;782return 0;783}784}785786return -EINVAL;787}788789/**790* hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block.791* @hdev: pointer to the habanalabs device structure792* @ctx: pointer to the context structure.793* @start_addr: start virtual address.794* @size: number of bytes to unreserve.795*796* This function does the following:797* - Takes the list lock and calls add_va_block_locked.798*/799int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,800u64 start_addr, u64 size)801{802enum hl_va_range_type type;803int rc;804805rc = 
hl_get_va_range_type(ctx, start_addr, size, &type);806if (rc) {807dev_err(hdev->dev,808"cannot find va_range for va %#llx size %llu",809start_addr, size);810return rc;811}812813rc = add_va_block(hdev, ctx->va_range[type], start_addr,814start_addr + size - 1);815if (rc)816dev_warn(hdev->dev,817"add va block failed for vaddr: 0x%llx\n", start_addr);818819return rc;820}821822/**823* init_phys_pg_pack_from_userptr() - initialize physical page pack from host824* memory825* @ctx: pointer to the context structure.826* @userptr: userptr to initialize from.827* @pphys_pg_pack: result pointer.828* @force_regular_page: tell the function to ignore huge page optimization,829* even if possible. Needed for cases where the device VA830* is allocated before we know the composition of the831* physical pages832*833* This function does the following:834* - Create a physical page pack from the physical pages related to the given835* virtual block.836*/837static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,838struct hl_userptr *userptr,839struct hl_vm_phys_pg_pack **pphys_pg_pack,840bool force_regular_page)841{842u32 npages, page_size = PAGE_SIZE,843huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;844u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);845struct hl_vm_phys_pg_pack *phys_pg_pack;846bool first = true, is_huge_page_opt;847u64 page_mask, total_npages;848struct scatterlist *sg;849dma_addr_t dma_addr;850int rc, i, j;851852phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);853if (!phys_pg_pack)854return -ENOMEM;855856phys_pg_pack->vm_type = userptr->vm_type;857phys_pg_pack->created_from_userptr = true;858phys_pg_pack->asid = ctx->asid;859atomic_set(&phys_pg_pack->mapping_cnt, 1);860861is_huge_page_opt = (force_regular_page ? false : true);862863/* Only if all dma_addrs are aligned to 2MB and their864* sizes is at least 2MB, we can use huge page mapping.865* We limit the 2MB optimization to this condition,866* since later on we acquire the related VA range as one867* consecutive block.868*/869total_npages = 0;870for_each_sgtable_dma_sg(userptr->sgt, sg, i) {871npages = hl_get_sg_info(sg, &dma_addr);872873total_npages += npages;874875if ((npages % pgs_in_huge_page) ||876(dma_addr & (huge_page_size - 1)))877is_huge_page_opt = false;878}879880if (is_huge_page_opt) {881page_size = huge_page_size;882do_div(total_npages, pgs_in_huge_page);883}884885page_mask = ~(((u64) page_size) - 1);886887phys_pg_pack->pages = kvmalloc_array(total_npages, sizeof(u64),888GFP_KERNEL);889if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {890rc = -ENOMEM;891goto page_pack_arr_mem_err;892}893894phys_pg_pack->npages = total_npages;895phys_pg_pack->page_size = page_size;896phys_pg_pack->total_size = total_npages * page_size;897898j = 0;899for_each_sgtable_dma_sg(userptr->sgt, sg, i) {900npages = hl_get_sg_info(sg, &dma_addr);901902/* align down to physical page size and save the offset */903if (first) {904first = false;905phys_pg_pack->offset = dma_addr & (page_size - 1);906dma_addr &= page_mask;907}908909while (npages) {910phys_pg_pack->pages[j++] = dma_addr;911dma_addr += page_size;912913if (is_huge_page_opt)914npages -= pgs_in_huge_page;915else916npages--;917}918}919920*pphys_pg_pack = phys_pg_pack;921922return 0;923924page_pack_arr_mem_err:925kfree(phys_pg_pack);926927return rc;928}929930/**931* map_phys_pg_pack() - maps the physical page pack..932* @ctx: pointer to the context structure.933* @vaddr: start address of the virtual area to map from.934* @phys_pg_pack: the pack of physical pages to map 
to.935*936* This function does the following:937* - Maps each chunk of virtual memory to matching physical chunk.938* - Stores number of successful mappings in the given argument.939* - Returns 0 on success, error code otherwise.940*/941static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,942struct hl_vm_phys_pg_pack *phys_pg_pack)943{944struct hl_device *hdev = ctx->hdev;945u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i;946u32 page_size = phys_pg_pack->page_size;947int rc = 0;948bool is_host_addr;949950for (i = 0 ; i < phys_pg_pack->npages ; i++) {951paddr = phys_pg_pack->pages[i];952953rc = hl_mmu_map_page(ctx, next_vaddr, paddr, page_size,954(i + 1) == phys_pg_pack->npages);955if (rc) {956dev_err(hdev->dev,957"map failed (%d) for handle %u, npages: %llu, mapped: %llu\n",958rc, phys_pg_pack->handle, phys_pg_pack->npages,959mapped_pg_cnt);960goto err;961}962963mapped_pg_cnt++;964next_vaddr += page_size;965}966967return 0;968969err:970is_host_addr = !hl_is_dram_va(hdev, vaddr);971972next_vaddr = vaddr;973for (i = 0 ; i < mapped_pg_cnt ; i++) {974if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,975(i + 1) == mapped_pg_cnt))976dev_warn_ratelimited(hdev->dev,977"failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",978phys_pg_pack->handle, next_vaddr,979phys_pg_pack->pages[i], page_size);980981next_vaddr += page_size;982983/*984* unmapping on Palladium can be really long, so avoid a CPU985* soft lockup bug by sleeping a little between unmapping pages986*987* In addition, on host num of pages could be huge,988* because page size could be 4KB, so when unmapping host989* pages sleep every 32K pages to avoid soft lockup990*/991if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))992usleep_range(50, 200);993}994995return rc;996}997998/**999* unmap_phys_pg_pack() - unmaps the physical page pack.1000* @ctx: pointer to the context structure.1001* @vaddr: start address of the virtual area to unmap.1002* @phys_pg_pack: the pack of physical pages to unmap.1003*/1004static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,1005struct hl_vm_phys_pg_pack *phys_pg_pack)1006{1007struct hl_device *hdev = ctx->hdev;1008u64 next_vaddr, i;1009bool is_host_addr;1010u32 page_size;10111012is_host_addr = !hl_is_dram_va(hdev, vaddr);1013page_size = phys_pg_pack->page_size;1014next_vaddr = vaddr;10151016for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {1017if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,1018(i + 1) == phys_pg_pack->npages))1019dev_warn_ratelimited(hdev->dev,1020"unmap failed for vaddr: 0x%llx\n", next_vaddr);10211022/*1023* unmapping on Palladium can be really long, so avoid a CPU1024* soft lockup bug by sleeping a little between unmapping pages1025*1026* In addition, on host num of pages could be huge,1027* because page size could be 4KB, so when unmapping host1028* pages sleep every 32K pages to avoid soft lockup1029*/1030if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))1031usleep_range(50, 200);1032}1033}10341035/**1036* map_device_va() - map the given memory.1037* @ctx: pointer to the context structure.1038* @args: host parameters with handle/host virtual address.1039* @device_addr: pointer to result device virtual address.1040*1041* This function does the following:1042* - If given a physical device memory handle, map to a device virtual block1043* and return the start address of this block.1044* - If given a host virtual address and size, find the related physical pages,1045* map a device virtual block to this pages and return the start 
address of1046* this block.1047*/1048static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device_addr)1049{1050struct hl_vm_phys_pg_pack *phys_pg_pack;1051enum hl_va_range_type va_range_type = 0;1052struct hl_device *hdev = ctx->hdev;1053struct hl_userptr *userptr = NULL;1054u32 handle = 0, va_block_align;1055struct hl_vm_hash_node *hnode;1056struct hl_vm *vm = &hdev->vm;1057struct hl_va_range *va_range;1058bool is_userptr, do_prefetch;1059u64 ret_vaddr, hint_addr;1060enum vm_type *vm_type;1061int rc;10621063/* set map flags */1064is_userptr = args->flags & HL_MEM_USERPTR;1065do_prefetch = hdev->supports_mmu_prefetch && (args->flags & HL_MEM_PREFETCH);10661067/* Assume failure */1068*device_addr = 0;10691070if (is_userptr) {1071u64 addr = args->map_host.host_virt_addr,1072size = args->map_host.mem_size;1073u32 page_size = hdev->asic_prop.pmmu.page_size,1074huge_page_size = hdev->asic_prop.pmmu_huge.page_size;10751076rc = dma_map_host_va(hdev, addr, size, &userptr);1077if (rc)1078return rc;10791080rc = init_phys_pg_pack_from_userptr(ctx, userptr,1081&phys_pg_pack, false);1082if (rc) {1083dev_err(hdev->dev,1084"unable to init page pack for vaddr 0x%llx\n",1085addr);1086goto init_page_pack_err;1087}10881089vm_type = (enum vm_type *) userptr;1090hint_addr = args->map_host.hint_addr;1091handle = phys_pg_pack->handle;10921093/* get required alignment */1094if (phys_pg_pack->page_size == page_size) {1095va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];1096va_range_type = HL_VA_RANGE_TYPE_HOST;1097/*1098* huge page alignment may be needed in case of regular1099* page mapping, depending on the host VA alignment1100*/1101if (addr & (huge_page_size - 1))1102va_block_align = page_size;1103else1104va_block_align = huge_page_size;1105} else {1106/*1107* huge page alignment is needed in case of huge page1108* mapping1109*/1110va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];1111va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE;1112va_block_align = huge_page_size;1113}1114} else {1115handle = lower_32_bits(args->map_device.handle);11161117spin_lock(&vm->idr_lock);1118phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);1119if (!phys_pg_pack) {1120spin_unlock(&vm->idr_lock);1121dev_err(hdev->dev,1122"no match for handle %u\n", handle);1123return -EINVAL;1124}11251126/* increment now to avoid freeing device memory while mapping */1127atomic_inc(&phys_pg_pack->mapping_cnt);11281129spin_unlock(&vm->idr_lock);11301131vm_type = (enum vm_type *) phys_pg_pack;11321133hint_addr = args->map_device.hint_addr;11341135/* DRAM VA alignment is the same as the MMU page size */1136va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];1137va_range_type = HL_VA_RANGE_TYPE_DRAM;1138va_block_align = hdev->asic_prop.dmmu.page_size;1139}11401141/*1142* relevant for mapping device physical memory only, as host memory is1143* implicitly shared1144*/1145if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) &&1146phys_pg_pack->asid != ctx->asid) {1147dev_err(hdev->dev,1148"Failed to map memory, handle %u is not shared\n",1149handle);1150rc = -EPERM;1151goto shared_err;1152}11531154hnode = kzalloc(sizeof(*hnode), GFP_KERNEL);1155if (!hnode) {1156rc = -ENOMEM;1157goto hnode_err;1158}11591160if (hint_addr && phys_pg_pack->offset) {1161if (args->flags & HL_MEM_FORCE_HINT) {1162/* Fail if hint must be respected but it can't be */1163dev_err(hdev->dev,1164"Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",1165hint_addr, phys_pg_pack->offset);1166rc = -EINVAL;1167goto 
va_block_err;1168}1169dev_dbg(hdev->dev,1170"Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n",1171hint_addr, phys_pg_pack->offset);1172}11731174ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,1175hint_addr, va_block_align,1176va_range_type, args->flags);1177if (!ret_vaddr) {1178dev_err(hdev->dev, "no available va block for handle %u\n",1179handle);1180rc = -ENOMEM;1181goto va_block_err;1182}11831184mutex_lock(&hdev->mmu_lock);11851186rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);1187if (rc) {1188dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n",1189rc, handle);1190mutex_unlock(&hdev->mmu_lock);1191goto map_err;1192}11931194rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV,1195ctx->asid, ret_vaddr, phys_pg_pack->total_size);1196mutex_unlock(&hdev->mmu_lock);1197if (rc)1198goto map_err;11991200/*1201* prefetch is done upon user's request. it is performed in WQ as and so can1202* be outside the MMU lock. the operation itself is already protected by the mmu lock1203*/1204if (do_prefetch) {1205rc = hl_mmu_prefetch_cache_range(ctx, *vm_type, ctx->asid, ret_vaddr,1206phys_pg_pack->total_size);1207if (rc)1208goto map_err;1209}12101211ret_vaddr += phys_pg_pack->offset;12121213hnode->ptr = vm_type;1214hnode->vaddr = ret_vaddr;1215hnode->handle = is_userptr ? MEM_HANDLE_INVALID : handle;12161217mutex_lock(&ctx->mem_hash_lock);1218hash_add(ctx->mem_hash, &hnode->node, ret_vaddr);1219mutex_unlock(&ctx->mem_hash_lock);12201221*device_addr = ret_vaddr;12221223if (is_userptr)1224free_phys_pg_pack(hdev, phys_pg_pack);12251226return rc;12271228map_err:1229if (add_va_block(hdev, va_range, ret_vaddr,1230ret_vaddr + phys_pg_pack->total_size - 1))1231dev_warn(hdev->dev,1232"release va block failed for handle 0x%x, vaddr: 0x%llx\n",1233handle, ret_vaddr);12341235va_block_err:1236kfree(hnode);1237hnode_err:1238shared_err:1239atomic_dec(&phys_pg_pack->mapping_cnt);1240if (is_userptr)1241free_phys_pg_pack(hdev, phys_pg_pack);1242init_page_pack_err:1243if (is_userptr)1244dma_unmap_host_va(hdev, userptr);12451246return rc;1247}12481249/* Should be called while the context's mem_hash_lock is taken */1250static struct hl_vm_hash_node *get_vm_hash_node_locked(struct hl_ctx *ctx, u64 vaddr)1251{1252struct hl_vm_hash_node *hnode;12531254hash_for_each_possible(ctx->mem_hash, hnode, node, vaddr)1255if (vaddr == hnode->vaddr)1256return hnode;12571258return NULL;1259}12601261/**1262* unmap_device_va() - unmap the given device virtual address.1263* @ctx: pointer to the context structure.1264* @args: host parameters with device virtual address to unmap.1265* @ctx_free: true if in context free flow, false otherwise.1266*1267* This function does the following:1268* - unmap the physical pages related to the given virtual address.1269* - return the device virtual block to the virtual block list.1270*/1271static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,1272bool ctx_free)1273{1274struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;1275u64 vaddr = args->unmap.device_virt_addr;1276struct asic_fixed_properties *prop;1277struct hl_device *hdev = ctx->hdev;1278struct hl_userptr *userptr = NULL;1279struct hl_vm_hash_node *hnode;1280struct hl_va_range *va_range;1281enum vm_type *vm_type;1282bool is_userptr;1283int rc = 0;12841285prop = &hdev->asic_prop;12861287/* protect from double entrance */1288mutex_lock(&ctx->mem_hash_lock);1289hnode = get_vm_hash_node_locked(ctx, vaddr);1290if (!hnode) 
{1291mutex_unlock(&ctx->mem_hash_lock);1292dev_err(hdev->dev, "unmap failed, no mem hnode for vaddr 0x%llx\n", vaddr);1293return -EINVAL;1294}12951296if (hnode->export_cnt) {1297mutex_unlock(&ctx->mem_hash_lock);1298dev_err(hdev->dev, "failed to unmap %#llx, memory is exported\n", vaddr);1299return -EINVAL;1300}13011302hash_del(&hnode->node);1303mutex_unlock(&ctx->mem_hash_lock);13041305vm_type = hnode->ptr;13061307if (*vm_type == VM_TYPE_USERPTR) {1308is_userptr = true;1309userptr = hnode->ptr;13101311rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack,1312false);1313if (rc) {1314dev_err(hdev->dev,1315"unable to init page pack for vaddr 0x%llx\n",1316vaddr);1317goto vm_type_err;1318}13191320if (phys_pg_pack->page_size ==1321hdev->asic_prop.pmmu.page_size)1322va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];1323else1324va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];1325} else if (*vm_type == VM_TYPE_PHYS_PACK) {1326is_userptr = false;1327va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];1328phys_pg_pack = hnode->ptr;1329} else {1330dev_warn(hdev->dev,1331"unmap failed, unknown vm desc for vaddr 0x%llx\n",1332vaddr);1333rc = -EFAULT;1334goto vm_type_err;1335}13361337if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) {1338dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr);1339rc = -EINVAL;1340goto mapping_cnt_err;1341}13421343if (!is_userptr && !is_power_of_2(phys_pg_pack->page_size))1344vaddr = prop->dram_base_address +1345DIV_ROUND_DOWN_ULL(vaddr - prop->dram_base_address,1346phys_pg_pack->page_size) *1347phys_pg_pack->page_size;1348else1349vaddr &= ~(((u64) phys_pg_pack->page_size) - 1);13501351mutex_lock(&hdev->mmu_lock);13521353unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);13541355/*1356* During context free this function is called in a loop to clean all1357* the context mappings. 
Hence the cache invalidation can be called once1358* at the loop end rather than for each iteration1359*/1360if (!ctx_free)1361rc = hl_mmu_invalidate_cache_range(hdev, true, *vm_type, ctx->asid, vaddr,1362phys_pg_pack->total_size);13631364mutex_unlock(&hdev->mmu_lock);13651366/*1367* If the context is closing we don't need to check for the MMU cache1368* invalidation return code and update the VA free list as in this flow1369* we invalidate the MMU cache outside of this unmap function and the VA1370* free list will be freed anyway.1371*/1372if (!ctx_free) {1373int tmp_rc;13741375tmp_rc = add_va_block(hdev, va_range, vaddr,1376vaddr + phys_pg_pack->total_size - 1);1377if (tmp_rc) {1378dev_warn(hdev->dev,1379"add va block failed for vaddr: 0x%llx\n",1380vaddr);1381if (!rc)1382rc = tmp_rc;1383}1384}13851386atomic_dec(&phys_pg_pack->mapping_cnt);1387kfree(hnode);13881389if (is_userptr) {1390free_phys_pg_pack(hdev, phys_pg_pack);1391dma_unmap_host_va(hdev, userptr);1392}13931394return rc;13951396mapping_cnt_err:1397if (is_userptr)1398free_phys_pg_pack(hdev, phys_pg_pack);1399vm_type_err:1400mutex_lock(&ctx->mem_hash_lock);1401hash_add(ctx->mem_hash, &hnode->node, vaddr);1402mutex_unlock(&ctx->mem_hash_lock);14031404return rc;1405}14061407static int map_block(struct hl_device *hdev, u64 address, u64 *handle, u32 *size)1408{1409u32 block_id;1410int rc;14111412*handle = 0;1413if (size)1414*size = 0;14151416rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id);1417if (rc)1418return rc;14191420*handle = block_id | HL_MMAP_TYPE_BLOCK;1421*handle <<= PAGE_SHIFT;14221423return 0;1424}14251426static void hw_block_vm_close(struct vm_area_struct *vma)1427{1428struct hl_vm_hw_block_list_node *lnode =1429(struct hl_vm_hw_block_list_node *) vma->vm_private_data;1430struct hl_ctx *ctx = lnode->ctx;1431long new_mmap_size;14321433new_mmap_size = lnode->mapped_size - (vma->vm_end - vma->vm_start);1434if (new_mmap_size > 0) {1435lnode->mapped_size = new_mmap_size;1436return;1437}14381439mutex_lock(&ctx->hw_block_list_lock);1440list_del(&lnode->node);1441mutex_unlock(&ctx->hw_block_list_lock);1442hl_ctx_put(ctx);1443kfree(lnode);1444vma->vm_private_data = NULL;1445}14461447static const struct vm_operations_struct hw_block_vm_ops = {1448.close = hw_block_vm_close1449};14501451/**1452* hl_hw_block_mmap() - mmap a hw block to user.1453* @hpriv: pointer to the private data of the fd1454* @vma: pointer to vm_area_struct of the process1455*1456* Driver increments context reference for every HW block mapped in order1457* to prevent user from closing FD without unmapping first1458*/1459int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)1460{1461struct hl_vm_hw_block_list_node *lnode;1462struct hl_device *hdev = hpriv->hdev;1463struct hl_ctx *ctx = hpriv->ctx;1464u32 block_id, block_size;1465int rc;14661467/* We use the page offset to hold the block id and thus we need to clear1468* it before doing the mmap itself1469*/1470block_id = vma->vm_pgoff;1471vma->vm_pgoff = 0;14721473/* Driver only allows mapping of a complete HW block */1474block_size = vma->vm_end - vma->vm_start;14751476if (!access_ok((void __user *) (uintptr_t) vma->vm_start, block_size)) {1477dev_err(hdev->dev,1478"user pointer is invalid - 0x%lx\n",1479vma->vm_start);14801481return -EINVAL;1482}14831484lnode = kzalloc(sizeof(*lnode), GFP_KERNEL);1485if (!lnode)1486return -ENOMEM;14871488rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size);1489if (rc) {1490kfree(lnode);1491return 
rc;1492}14931494hl_ctx_get(ctx);14951496lnode->ctx = ctx;1497lnode->vaddr = vma->vm_start;1498lnode->block_size = block_size;1499lnode->mapped_size = lnode->block_size;1500lnode->id = block_id;15011502vma->vm_private_data = lnode;1503vma->vm_ops = &hw_block_vm_ops;15041505mutex_lock(&ctx->hw_block_list_lock);1506list_add_tail(&lnode->node, &ctx->hw_block_mem_list);1507mutex_unlock(&ctx->hw_block_list_lock);15081509vma->vm_pgoff = block_id;15101511return 0;1512}15131514static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size,1515struct device *dev, enum dma_data_direction dir)1516{1517dma_addr_t addr;1518int rc;15191520addr = dma_map_resource(dev, bar_address, chunk_size, dir,1521DMA_ATTR_SKIP_CPU_SYNC);1522rc = dma_mapping_error(dev, addr);1523if (rc)1524return rc;15251526sg_set_page(sg, NULL, chunk_size, 0);1527sg_dma_address(sg) = addr;1528sg_dma_len(sg) = chunk_size;15291530return 0;1531}15321533static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev, u64 *pages, u64 npages,1534u64 page_size, u64 exported_size, u64 offset,1535struct device *dev, enum dma_data_direction dir)1536{1537u64 dma_max_seg_size, curr_page, size, chunk_size, left_size_to_export, left_size_in_page,1538left_size_in_dma_seg, device_address, bar_address, start_page;1539struct asic_fixed_properties *prop = &hdev->asic_prop;1540struct scatterlist *sg;1541unsigned int nents, i;1542struct sg_table *sgt;1543bool next_sg_entry;1544int rc;15451546/* Align max segment size to PAGE_SIZE to fit the minimal IOMMU mapping granularity */1547dma_max_seg_size = ALIGN_DOWN(dma_get_max_seg_size(dev), PAGE_SIZE);1548if (dma_max_seg_size < PAGE_SIZE) {1549dev_err_ratelimited(hdev->dev,1550"dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",1551dma_max_seg_size);1552return ERR_PTR(-EINVAL);1553}15541555sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);1556if (!sgt)1557return ERR_PTR(-ENOMEM);15581559/* Use the offset to move to the actual first page that is exported */1560for (start_page = 0 ; start_page < npages ; ++start_page) {1561if (offset < page_size)1562break;15631564/* The offset value was validated so there can't be an underflow */1565offset -= page_size;1566}15671568/* Calculate the required number of entries for the SG table */1569curr_page = start_page;1570nents = 1;1571left_size_to_export = exported_size;1572left_size_in_page = page_size - offset;1573left_size_in_dma_seg = dma_max_seg_size;1574next_sg_entry = false;15751576while (true) {1577size = min3(left_size_to_export, left_size_in_page, left_size_in_dma_seg);1578left_size_to_export -= size;1579left_size_in_page -= size;1580left_size_in_dma_seg -= size;15811582if (!left_size_to_export)1583break;15841585if (!left_size_in_page) {1586/* left_size_to_export is not zero so there must be another page */1587if (pages[curr_page] + page_size != pages[curr_page + 1])1588next_sg_entry = true;15891590++curr_page;1591left_size_in_page = page_size;1592}15931594if (!left_size_in_dma_seg) {1595next_sg_entry = true;1596left_size_in_dma_seg = dma_max_seg_size;1597}15981599if (next_sg_entry) {1600++nents;1601next_sg_entry = false;1602}1603}16041605rc = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);1606if (rc)1607goto err_free_sgt;16081609/* Prepare the SG table entries */1610curr_page = start_page;1611device_address = pages[curr_page] + offset;1612left_size_to_export = exported_size;1613left_size_in_page = page_size - offset;1614left_size_in_dma_seg = dma_max_seg_size;1615next_sg_entry = false;16161617for_each_sgtable_dma_sg(sgt, sg, i) 
{1618bar_address = hdev->dram_pci_bar_start + (device_address - prop->dram_base_address);1619chunk_size = 0;16201621for ( ; curr_page < npages ; ++curr_page) {1622size = min3(left_size_to_export, left_size_in_page, left_size_in_dma_seg);1623chunk_size += size;1624left_size_to_export -= size;1625left_size_in_page -= size;1626left_size_in_dma_seg -= size;16271628if (!left_size_to_export)1629break;16301631if (!left_size_in_page) {1632/* left_size_to_export is not zero so there must be another page */1633if (pages[curr_page] + page_size != pages[curr_page + 1]) {1634device_address = pages[curr_page + 1];1635next_sg_entry = true;1636}16371638left_size_in_page = page_size;1639}16401641if (!left_size_in_dma_seg) {1642/*1643* Skip setting a new device address if already moving to a page1644* which is not contiguous with the current page.1645*/1646if (!next_sg_entry) {1647device_address += chunk_size;1648next_sg_entry = true;1649}16501651left_size_in_dma_seg = dma_max_seg_size;1652}16531654if (next_sg_entry) {1655next_sg_entry = false;1656break;1657}1658}16591660rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);1661if (rc)1662goto err_unmap;1663}16641665/* There should be nothing left to export exactly after looping over all SG elements */1666if (left_size_to_export) {1667dev_err(hdev->dev,1668"left size to export %#llx after initializing %u SG elements\n",1669left_size_to_export, sgt->nents);1670rc = -ENOMEM;1671goto err_unmap;1672}16731674/*1675* Because we are not going to include a CPU list, we want to have some chance that other1676* users will detect this when going over SG table, by setting the orig_nents to 0 and using1677* only nents (length of DMA list).1678*/1679sgt->orig_nents = 0;16801681dev_dbg(hdev->dev, "prepared SG table with %u entries for importer %s\n",1682nents, dev_name(dev));1683for_each_sgtable_dma_sg(sgt, sg, i)1684dev_dbg(hdev->dev,1685"SG entry %d: address %#llx, length %#x\n",1686i, sg_dma_address(sg), sg_dma_len(sg));16871688return sgt;16891690err_unmap:1691for_each_sgtable_dma_sg(sgt, sg, i) {1692if (!sg_dma_len(sg))1693continue;16941695dma_unmap_resource(dev, sg_dma_address(sg), sg_dma_len(sg), dir,1696DMA_ATTR_SKIP_CPU_SYNC);1697}16981699sg_free_table(sgt);17001701err_free_sgt:1702kfree(sgt);1703return ERR_PTR(rc);1704}17051706static int hl_dmabuf_attach(struct dma_buf *dmabuf,1707struct dma_buf_attachment *attachment)1708{1709struct hl_dmabuf_priv *hl_dmabuf;1710struct hl_device *hdev;1711int rc;17121713hl_dmabuf = dmabuf->priv;1714hdev = hl_dmabuf->ctx->hdev;17151716rc = pci_p2pdma_distance(hdev->pdev, attachment->dev, true);17171718if (rc < 0)1719attachment->peer2peer = false;1720return 0;1721}17221723static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,1724enum dma_data_direction dir)1725{1726u64 *pages, npages, page_size, exported_size, offset;1727struct dma_buf *dma_buf = attachment->dmabuf;1728struct hl_vm_phys_pg_pack *phys_pg_pack;1729struct hl_dmabuf_priv *hl_dmabuf;1730struct hl_device *hdev;1731struct sg_table *sgt;17321733hl_dmabuf = dma_buf->priv;1734hdev = hl_dmabuf->ctx->hdev;17351736if (!attachment->peer2peer) {1737dev_dbg(hdev->dev, "Failed to map dmabuf because p2p is disabled\n");1738return ERR_PTR(-EPERM);1739}17401741exported_size = hl_dmabuf->dmabuf->size;1742offset = hl_dmabuf->offset;1743phys_pg_pack = hl_dmabuf->phys_pg_pack;17441745if (phys_pg_pack) {1746pages = phys_pg_pack->pages;1747npages = phys_pg_pack->npages;1748page_size = phys_pg_pack->page_size;1749} else {1750pages = 
&hl_dmabuf->device_phys_addr;1751npages = 1;1752page_size = hl_dmabuf->dmabuf->size;1753}17541755sgt = alloc_sgt_from_device_pages(hdev, pages, npages, page_size, exported_size, offset,1756attachment->dev, dir);1757if (IS_ERR(sgt))1758dev_err(hdev->dev, "failed (%ld) to initialize sgt for dmabuf\n", PTR_ERR(sgt));17591760return sgt;1761}17621763static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,1764struct sg_table *sgt,1765enum dma_data_direction dir)1766{1767struct scatterlist *sg;1768int i;17691770/* The memory behind the dma-buf has *always* resided on the device itself, i.e. it lives1771* only in the 'device' domain (after all, it maps a PCI bar address which points to the1772* device memory).1773*1774* Therefore, it was never in the 'CPU' domain and hence, there is no need to perform1775* a sync of the memory to the CPU's cache, as it never resided inside that cache.1776*/1777for_each_sgtable_dma_sg(sgt, sg, i)1778dma_unmap_resource(attachment->dev, sg_dma_address(sg),1779sg_dma_len(sg), dir,1780DMA_ATTR_SKIP_CPU_SYNC);17811782/* Need to restore orig_nents because sg_free_table use that field */1783sgt->orig_nents = sgt->nents;1784sg_free_table(sgt);1785kfree(sgt);1786}17871788static struct hl_vm_hash_node *memhash_node_export_get(struct hl_ctx *ctx, u64 addr)1789{1790struct hl_device *hdev = ctx->hdev;1791struct hl_vm_hash_node *hnode;17921793/* get the memory handle */1794mutex_lock(&ctx->mem_hash_lock);1795hnode = get_vm_hash_node_locked(ctx, addr);1796if (!hnode) {1797mutex_unlock(&ctx->mem_hash_lock);1798dev_dbg(hdev->dev, "map address %#llx not found\n", addr);1799return ERR_PTR(-EINVAL);1800}18011802if (upper_32_bits(hnode->handle)) {1803mutex_unlock(&ctx->mem_hash_lock);1804dev_dbg(hdev->dev, "invalid handle %#llx for map address %#llx\n",1805hnode->handle, addr);1806return ERR_PTR(-EINVAL);1807}18081809/*1810* node found, increase export count so this memory cannot be unmapped1811* and the hash node cannot be deleted.1812*/1813hnode->export_cnt++;1814mutex_unlock(&ctx->mem_hash_lock);18151816return hnode;1817}18181819static void memhash_node_export_put(struct hl_ctx *ctx, struct hl_vm_hash_node *hnode)1820{1821mutex_lock(&ctx->mem_hash_lock);1822hnode->export_cnt--;1823mutex_unlock(&ctx->mem_hash_lock);1824}18251826static void hl_release_dmabuf(struct dma_buf *dmabuf)1827{1828struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;1829struct hl_ctx *ctx;18301831ctx = hl_dmabuf->ctx;18321833if (hl_dmabuf->memhash_hnode)1834memhash_node_export_put(ctx, hl_dmabuf->memhash_hnode);18351836atomic_dec(&ctx->hdev->dmabuf_export_cnt);1837hl_ctx_put(ctx);18381839/*1840* Paired with get_file() in export_dmabuf().1841* 'ctx' can be still used here to get the file pointer, even after hl_ctx_put() was called,1842* because releasing the compute device file involves another reference decrement, and it1843* would be possible only after calling fput().1844*/1845fput(ctx->hpriv->file_priv->filp);18461847kfree(hl_dmabuf);1848}18491850static const struct dma_buf_ops habanalabs_dmabuf_ops = {1851.attach = hl_dmabuf_attach,1852.map_dma_buf = hl_map_dmabuf,1853.unmap_dma_buf = hl_unmap_dmabuf,1854.release = hl_release_dmabuf,1855};18561857static int export_dmabuf(struct hl_ctx *ctx,1858struct hl_dmabuf_priv *hl_dmabuf,1859u64 total_size, int flags, int *dmabuf_fd)1860{1861DEFINE_DMA_BUF_EXPORT_INFO(exp_info);1862struct hl_device *hdev = ctx->hdev;1863CLASS(get_unused_fd, fd)(flags);18641865if (fd < 0) {1866dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", 
fd);1867return fd;1868}18691870exp_info.ops = &habanalabs_dmabuf_ops;1871exp_info.size = total_size;1872exp_info.flags = flags;1873exp_info.priv = hl_dmabuf;18741875hl_dmabuf->dmabuf = dma_buf_export(&exp_info);1876if (IS_ERR(hl_dmabuf->dmabuf)) {1877dev_err(hdev->dev, "failed to export dma-buf\n");1878return PTR_ERR(hl_dmabuf->dmabuf);1879}18801881hl_dmabuf->ctx = ctx;1882hl_ctx_get(hl_dmabuf->ctx);1883atomic_inc(&ctx->hdev->dmabuf_export_cnt);18841885/* Get compute device file to enforce release order, such that all exported dma-buf will be1886* released first and only then the compute device.1887* Paired with fput() in hl_release_dmabuf().1888*/1889get_file(ctx->hpriv->file_priv->filp);18901891*dmabuf_fd = fd;1892fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);18931894return 0;1895}18961897static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)1898{1899if (!PAGE_ALIGNED(addr)) {1900dev_dbg(hdev->dev,1901"exported device memory address 0x%llx should be aligned to PAGE_SIZE 0x%lx\n",1902addr, PAGE_SIZE);1903return -EINVAL;1904}19051906if (!size || !PAGE_ALIGNED(size)) {1907dev_dbg(hdev->dev,1908"exported device memory size %llu should be a multiple of PAGE_SIZE %lu\n",1909size, PAGE_SIZE);1910return -EINVAL;1911}19121913if (!PAGE_ALIGNED(offset)) {1914dev_dbg(hdev->dev,1915"exported device memory offset %llu should be a multiple of PAGE_SIZE %lu\n",1916offset, PAGE_SIZE);1917return -EINVAL;1918}19191920return 0;1921}19221923static int validate_export_params_no_mmu(struct hl_device *hdev, u64 device_addr, u64 size)1924{1925struct asic_fixed_properties *prop = &hdev->asic_prop;1926u64 bar_address;1927int rc;19281929rc = validate_export_params_common(hdev, device_addr, size, 0);1930if (rc)1931return rc;19321933if (device_addr < prop->dram_user_base_address ||1934(device_addr + size) > prop->dram_end_address ||1935(device_addr + size) < device_addr) {1936dev_dbg(hdev->dev,1937"DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n",1938device_addr, size);1939return -EINVAL;1940}19411942bar_address = hdev->dram_pci_bar_start + (device_addr - prop->dram_base_address);19431944if ((bar_address + size) > (hdev->dram_pci_bar_start + prop->dram_pci_bar_size) ||1945(bar_address + size) < bar_address) {1946dev_dbg(hdev->dev,1947"DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n",1948device_addr, size);1949return -EINVAL;1950}19511952return 0;1953}19541955static int validate_export_params(struct hl_device *hdev, u64 device_addr, u64 size, u64 offset,1956struct hl_vm_phys_pg_pack *phys_pg_pack)1957{1958struct asic_fixed_properties *prop = &hdev->asic_prop;1959u64 bar_address;1960int i, rc;19611962rc = validate_export_params_common(hdev, device_addr, size, offset);1963if (rc)1964return rc;19651966if ((offset + size) > phys_pg_pack->total_size) {1967dev_dbg(hdev->dev, "offset %#llx and size %#llx exceed total map size %#llx\n",1968offset, size, phys_pg_pack->total_size);1969return -EINVAL;1970}19711972for (i = 0 ; i < phys_pg_pack->npages ; i++) {1973bar_address = hdev->dram_pci_bar_start +1974(phys_pg_pack->pages[i] - prop->dram_base_address);19751976if ((bar_address + phys_pg_pack->page_size) >1977(hdev->dram_pci_bar_start + prop->dram_pci_bar_size) ||1978(bar_address + phys_pg_pack->page_size) < bar_address) {1979dev_dbg(hdev->dev,1980"DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n",1981phys_pg_pack->pages[i], phys_pg_pack->page_size);1982return -EINVAL;1983}1984}19851986return 0;1987}19881989static struct 
static struct hl_vm_phys_pg_pack *get_phys_pg_pack_from_hash_node(struct hl_device *hdev,
							struct hl_vm_hash_node *hnode)
{
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	struct hl_vm *vm = &hdev->vm;

	spin_lock(&vm->idr_lock);
	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, (u32) hnode->handle);
	if (!phys_pg_pack) {
		spin_unlock(&vm->idr_lock);
		dev_dbg(hdev->dev, "no match for handle 0x%x\n", (u32) hnode->handle);
		return ERR_PTR(-EINVAL);
	}

	spin_unlock(&vm->idr_lock);

	if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) {
		dev_dbg(hdev->dev, "handle 0x%llx does not represent DRAM memory\n", hnode->handle);
		return ERR_PTR(-EINVAL);
	}

	return phys_pg_pack;
}

/**
 * export_dmabuf_from_addr() - export a dma-buf object for the given memory
 *                             address and size.
 * @ctx: pointer to the context structure.
 * @addr: device address.
 * @size: size of device memory to export.
 * @offset: the offset into the buffer from which to start exporting.
 * @flags: DMA-BUF file/FD flags.
 * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
 *
 * Create and export a dma-buf object for an existing memory allocation inside
 * the device memory, and return a FD which is associated with the dma-buf
 * object.
 *
 * Return: 0 on success, non-zero for failure.
 */
static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 addr, u64 size, u64 offset,
					int flags, int *dmabuf_fd)
{
	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
	struct hl_vm_hash_node *hnode = NULL;
	struct asic_fixed_properties *prop;
	struct hl_dmabuf_priv *hl_dmabuf;
	struct hl_device *hdev;
	int rc;

	hdev = ctx->hdev;
	prop = &hdev->asic_prop;

	/* offset must be 0 in devices without virtual memory support */
	if (!prop->dram_supports_virtual_memory && offset) {
		dev_dbg(hdev->dev, "offset is not allowed in device without virtual memory\n");
		return -EINVAL;
	}

	hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
	if (!hl_dmabuf)
		return -ENOMEM;

	if (prop->dram_supports_virtual_memory) {
		hnode = memhash_node_export_get(ctx, addr);
		if (IS_ERR(hnode)) {
			rc = PTR_ERR(hnode);
			goto err_free_dmabuf_wrapper;
		}
		phys_pg_pack = get_phys_pg_pack_from_hash_node(hdev, hnode);
		if (IS_ERR(phys_pg_pack)) {
			rc = PTR_ERR(phys_pg_pack);
			goto dec_memhash_export_cnt;
		}
		rc = validate_export_params(hdev, addr, size, offset, phys_pg_pack);
		if (rc)
			goto dec_memhash_export_cnt;

		hl_dmabuf->phys_pg_pack = phys_pg_pack;
		hl_dmabuf->memhash_hnode = hnode;
		hl_dmabuf->offset = offset;
	} else {
		rc = validate_export_params_no_mmu(hdev, addr, size);
		if (rc)
			goto err_free_dmabuf_wrapper;

		hl_dmabuf->device_phys_addr = addr;
	}

	rc = export_dmabuf(ctx, hl_dmabuf, size, flags, dmabuf_fd);
	if (rc)
		goto dec_memhash_export_cnt;

	return 0;

dec_memhash_export_cnt:
	if (prop->dram_supports_virtual_memory)
		memhash_node_export_put(ctx, hnode);
err_free_dmabuf_wrapper:
	kfree(hl_dmabuf);
	return rc;
}
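/*
 * Illustrative user-space sketch of the export flow handled above (assuming the uAPI names from
 * habanalabs_accel.h and a DRM render-node fd named drm_fd; device_va and size are placeholders
 * and error handling is omitted):
 *
 *	union hl_mem_args args = {};
 *
 *	args.in.op = HL_MEM_OP_EXPORT_DMABUF_FD;
 *	args.in.export_dmabuf_fd.addr = device_va;	// mapped device VA (or phys addr w/o MMU)
 *	args.in.export_dmabuf_fd.mem_size = size;	// must be PAGE_SIZE aligned
 *	args.in.export_dmabuf_fd.offset = 0;
 *	args.in.flags = O_RDWR | O_CLOEXEC;
 *	ioctl(drm_fd, DRM_IOCTL_HL_MEMORY, &args);
 *	int dmabuf_fd = args.out.fd;
 *
 * The returned fd can then be handed to any dma-buf importer, e.g. an RDMA or another accel
 * driver, which will reach the device memory through peer-to-peer DMA.
 */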
static void ts_buff_release(struct hl_mmap_mem_buf *buf)
{
	struct hl_ts_buff *ts_buff = buf->private;

	vfree(ts_buff->kernel_buff_address);
	vfree(ts_buff->user_buff_address);
	kfree(ts_buff);
}

static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args)
{
	struct hl_ts_buff *ts_buff = buf->private;

	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE);
	return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0);
}

static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args)
{
	struct hl_ts_buff *ts_buff = NULL;
	u32 num_elements;
	size_t size;
	void *p;

	num_elements = *(u32 *)args;

	ts_buff = kzalloc(sizeof(*ts_buff), gfp);
	if (!ts_buff)
		return -ENOMEM;

	/* Allocate the user buffer */
	size = num_elements * sizeof(u64);
	p = vmalloc_user(size);
	if (!p)
		goto free_mem;

	ts_buff->user_buff_address = p;
	buf->mappable_size = size;

	/* Allocate the internal kernel buffer */
	size = num_elements * sizeof(struct hl_user_pending_interrupt);
	p = vzalloc(size);
	if (!p)
		goto free_user_buff;

	ts_buff->kernel_buff_address = p;
	ts_buff->kernel_buff_size = size;

	buf->private = ts_buff;

	return 0;

free_user_buff:
	vfree(ts_buff->user_buff_address);
free_mem:
	kfree(ts_buff);
	return -ENOMEM;
}

static struct hl_mmap_mem_buf_behavior hl_ts_behavior = {
	.topic = "TS",
	.mem_id = HL_MMAP_TYPE_TS_BUFF,
	.mmap = hl_ts_mmap,
	.alloc = hl_ts_alloc_buf,
	.release = ts_buff_release,
};

/**
 * allocate_timestamps_buffers() - allocate timestamps buffers
 * @hpriv: pointer to the private data of the fd
 * @args: ioctl input
 * @handle: user timestamp buffer handle as an output
 *
 * This function allocates a timestamps buffer that will later be mapped to the user, so the user
 * can read the timestamps. In addition, it allocates an extra buffer for registration management:
 * since registration must not fail due to an out-of-memory situation, a pool of user interrupt
 * nodes is prepared up front and nodes are picked from this pool instead of being allocated
 * dynamically at registration time. The node is also added to the mapping hash, which is used to
 * map the user timestamps buffer to the internal kernel timestamps buffer.
 */
static int allocate_timestamps_buffers(struct hl_fpriv *hpriv, struct hl_mem_in *args, u64 *handle)
{
	struct hl_mem_mgr *mmg = &hpriv->mem_mgr;
	struct hl_mmap_mem_buf *buf;

	if (args->num_of_elements > TS_MAX_ELEMENTS_NUM) {
		dev_err(mmg->dev, "Num of elements exceeds Max allowed number (0x%x > 0x%x)\n",
				args->num_of_elements, TS_MAX_ELEMENTS_NUM);
		return -EINVAL;
	}

	buf = hl_mmap_mem_buf_alloc(mmg, &hl_ts_behavior, GFP_KERNEL, &args->num_of_elements);
	if (!buf)
		return -ENOMEM;

	*handle = buf->handle;

	return 0;
}
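/*
 * Illustrative user-space sketch of the ALLOC + MAP path of the ioctl below (assuming the uAPI
 * names from habanalabs_accel.h; the map_device.handle field name and the DRM_IOCTL_HL_MEMORY
 * ioctl name are assumptions here, and error handling is omitted):
 *
 *	union hl_mem_args args = {};
 *
 *	args.in.op = HL_MEM_OP_ALLOC;
 *	args.in.alloc.mem_size = 2 * 1024 * 1024;
 *	ioctl(drm_fd, DRM_IOCTL_HL_MEMORY, &args);
 *	uint64_t handle = args.out.handle;
 *
 *	memset(&args, 0, sizeof(args));
 *	args.in.op = HL_MEM_OP_MAP;
 *	args.in.map_device.handle = handle;
 *	ioctl(drm_fd, DRM_IOCTL_HL_MEMORY, &args);
 *	uint64_t device_va = args.out.device_virt_addr;
 */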
int hl_mem_ioctl(struct drm_device *ddev, void *data, struct drm_file *file_priv)
{
	struct hl_fpriv *hpriv = file_priv->driver_priv;
	enum hl_device_status status;
	union hl_mem_args *args = data;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_ctx *ctx = hpriv->ctx;
	u64 block_handle, device_addr = 0;
	u32 handle = 0, block_size;
	int rc, dmabuf_fd = -EBADF;

	if (!hl_device_operational(hdev, &status)) {
		dev_dbg_ratelimited(hdev->dev,
			"Device is %s. Can't execute MEMORY IOCTL\n",
			hdev->status[status]);
		return -EBUSY;
	}

	switch (args->in.op) {
	case HL_MEM_OP_ALLOC:
		if (args->in.alloc.mem_size == 0) {
			dev_err(hdev->dev,
				"alloc size must be larger than 0\n");
			rc = -EINVAL;
			goto out;
		}

		/* If DRAM does not support virtual memory the driver won't
		 * handle the allocation/freeing of that memory. However, for
		 * system administration/monitoring purposes, the driver will
		 * keep track of the amount of DRAM memory that is allocated
		 * and freed by the user. Because this code totally relies on
		 * the user's input, the driver can't ensure the validity
		 * of this accounting.
		 */
		if (!hdev->asic_prop.dram_supports_virtual_memory) {
			atomic64_add(args->in.alloc.mem_size,
					&ctx->dram_phys_mem);
			atomic64_add(args->in.alloc.mem_size,
					&hdev->dram_used_mem);

			dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
			rc = 0;

			memset(args, 0, sizeof(*args));
			args->out.handle = 0;
			goto out;
		}

		rc = alloc_device_memory(ctx, &args->in, &handle);

		memset(args, 0, sizeof(*args));
		args->out.handle = (__u64) handle;
		break;

	case HL_MEM_OP_FREE:
		/* If DRAM does not support virtual memory the driver won't
		 * handle the allocation/freeing of that memory. However, for
		 * system administration/monitoring purposes, the driver will
		 * keep track of the amount of DRAM memory that is allocated
		 * and freed by the user. Because this code totally relies on
		 * the user's input, the driver can't ensure the validity
		 * of this accounting.
		 */
		if (!hdev->asic_prop.dram_supports_virtual_memory) {
			atomic64_sub(args->in.alloc.mem_size,
					&ctx->dram_phys_mem);
			atomic64_sub(args->in.alloc.mem_size,
					&hdev->dram_used_mem);

			dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
			rc = 0;

			goto out;
		}

		rc = free_device_memory(ctx, &args->in);
		break;

	case HL_MEM_OP_MAP:
		rc = map_device_va(ctx, &args->in, &device_addr);

		memset(args, 0, sizeof(*args));
		args->out.device_virt_addr = device_addr;
		break;

	case HL_MEM_OP_UNMAP:
		rc = unmap_device_va(ctx, &args->in, false);
		break;

	case HL_MEM_OP_MAP_BLOCK:
		rc = map_block(hdev, args->in.map_block.block_addr,
				&block_handle, &block_size);
		args->out.block_handle = block_handle;
		args->out.block_size = block_size;
		break;

	case HL_MEM_OP_EXPORT_DMABUF_FD:
		rc = export_dmabuf_from_addr(ctx,
				args->in.export_dmabuf_fd.addr,
				args->in.export_dmabuf_fd.mem_size,
				args->in.export_dmabuf_fd.offset,
				args->in.flags,
				&dmabuf_fd);
		memset(args, 0, sizeof(*args));
		args->out.fd = dmabuf_fd;
		break;

	case HL_MEM_OP_TS_ALLOC:
		rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle);
		break;
	default:
		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
		rc = -EINVAL;
		break;
	}

out:
	return rc;
}
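/*
 * Host memory pinning helpers. The caller passes an arbitrary user virtual address and size and
 * the range is pinned page by page; for example, with a 4 KB PAGE_SIZE, addr = 0x1003 and
 * size = 0x2000 become start = 0x1000, offset = 0x3 and npages = 3, and the resulting SG table
 * describes those three pages starting at offset 0x3 of the first one.
 */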
"failed to create SG table from pages\n");2349goto put_pages;2350}23512352return 0;23532354put_pages:2355unpin_user_pages(userptr->pages, npages);2356destroy_pages:2357kvfree(userptr->pages);2358return rc;2359}23602361/**2362* hl_pin_host_memory() - pins a chunk of host memory.2363* @hdev: pointer to the habanalabs device structure.2364* @addr: the host virtual address of the memory area.2365* @size: the size of the memory area.2366* @userptr: pointer to hl_userptr structure.2367*2368* This function does the following:2369* - Pins the physical pages.2370* - Create an SG list from those pages.2371*/2372int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,2373struct hl_userptr *userptr)2374{2375u64 start, end;2376u32 npages, offset;2377int rc;23782379if (!size) {2380dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);2381return -EINVAL;2382}23832384/*2385* If the combination of the address and size requested for this memory2386* region causes an integer overflow, return error.2387*/2388if (((addr + size) < addr) ||2389PAGE_ALIGN(addr + size) < (addr + size)) {2390dev_err(hdev->dev,2391"user pointer 0x%llx + %llu causes integer overflow\n",2392addr, size);2393return -EINVAL;2394}23952396userptr->pid = current->pid;2397userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL);2398if (!userptr->sgt)2399return -ENOMEM;24002401start = addr & PAGE_MASK;2402offset = addr & ~PAGE_MASK;2403end = PAGE_ALIGN(addr + size);2404npages = (end - start) >> PAGE_SHIFT;24052406userptr->size = size;2407userptr->addr = addr;2408userptr->dma_mapped = false;2409INIT_LIST_HEAD(&userptr->job_node);24102411rc = get_user_memory(hdev, addr, size, npages, start, offset,2412userptr);2413if (rc) {2414dev_err(hdev->dev,2415"failed to get user memory for address 0x%llx\n",2416addr);2417goto free_sgt;2418}24192420hl_debugfs_add_userptr(hdev, userptr);24212422return 0;24232424free_sgt:2425kfree(userptr->sgt);2426return rc;2427}24282429/*2430* hl_unpin_host_memory - unpins a chunk of host memory.2431* @hdev: pointer to the habanalabs device structure2432* @userptr: pointer to hl_userptr structure2433*2434* This function does the following:2435* - Unpins the physical pages related to the host memory2436* - Free the SG list2437*/2438void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)2439{2440hl_debugfs_remove_userptr(hdev, userptr);24412442if (userptr->dma_mapped)2443hl_dma_unmap_sgtable(hdev, userptr->sgt, userptr->dir);24442445unpin_user_pages_dirty_lock(userptr->pages, userptr->npages, true);2446kvfree(userptr->pages);24472448list_del(&userptr->job_node);24492450sg_free_table(userptr->sgt);2451kfree(userptr->sgt);2452}24532454/**2455* hl_userptr_delete_list() - clear userptr list.2456* @hdev: pointer to the habanalabs device structure.2457* @userptr_list: pointer to the list to clear.2458*2459* This function does the following:2460* - Iterates over the list and unpins the host memory and frees the userptr2461* structure.2462*/2463void hl_userptr_delete_list(struct hl_device *hdev,2464struct list_head *userptr_list)2465{2466struct hl_userptr *userptr, *tmp;24672468list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {2469hl_unpin_host_memory(hdev, userptr);2470kfree(userptr);2471}24722473INIT_LIST_HEAD(userptr_list);2474}24752476/**2477* hl_userptr_is_pinned() - returns whether the given userptr is pinned.2478* @hdev: pointer to the habanalabs device structure.2479* @addr: user address to check.2480* @size: user block size to check.2481* @userptr_list: pointer to the 
/**
 * hl_userptr_is_pinned() - returns whether the given userptr is pinned.
 * @hdev: pointer to the habanalabs device structure.
 * @addr: user address to check.
 * @size: user block size to check.
 * @userptr_list: pointer to the list to search in.
 * @userptr: pointer to userptr to check.
 *
 * This function does the following:
 * - Iterates over the list and checks if the given userptr is in it, which
 *   means it is pinned. If so, returns true, otherwise returns false.
 */
bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
				u32 size, struct list_head *userptr_list,
				struct hl_userptr **userptr)
{
	list_for_each_entry((*userptr), userptr_list, job_node) {
		if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
			return true;
	}

	return false;
}

/**
 * va_range_init() - initialize virtual addresses range.
 * @hdev: pointer to the habanalabs device structure.
 * @va_ranges: pointer to va_ranges array.
 * @range_type: virtual address range type.
 * @start: range start address, inclusive.
 * @end: range end address, inclusive.
 * @page_size: page size for this va_range.
 *
 * This function does the following:
 * - Initializes the virtual addresses list of the given range with the given
 *   addresses.
 */
static int va_range_init(struct hl_device *hdev, struct hl_va_range **va_ranges,
				enum hl_va_range_type range_type, u64 start,
				u64 end, u32 page_size)
{
	struct hl_va_range *va_range = va_ranges[range_type];
	int rc;

	INIT_LIST_HEAD(&va_range->list);

	/*
	 * PAGE_SIZE alignment
	 * it is the caller's responsibility to align the addresses if the
	 * page size is not a power of 2
	 */

	if (is_power_of_2(page_size)) {
		start = round_up(start, page_size);

		/*
		 * The end of the range is inclusive, hence we need to align it
		 * to the end of the last full page in the range. For example if
		 * end = 0x3ff5 with page size 0x1000, we need to align it to
		 * 0x2fff. The remaining 0xff5 bytes do not form a full page.
		 */
		end = round_down(end + 1, page_size) - 1;
	}

	if (start >= end) {
		dev_err(hdev->dev, "too small vm range for va list\n");
		return -EFAULT;
	}

	rc = add_va_block(hdev, va_range, start, end);

	if (rc) {
		dev_err(hdev->dev, "Failed to init host va list\n");
		return rc;
	}

	va_range->start_addr = start;
	va_range->end_addr = end;
	va_range->page_size = page_size;

	return 0;
}

/**
 * va_range_fini() - clear a virtual addresses range.
 * @hdev: pointer to the habanalabs structure.
 * @va_range: pointer to virtual addresses range.
 *
 * This function does the following:
 * - Frees the virtual addresses block list and its lock.
 */
static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range)
{
	mutex_lock(&va_range->lock);
	clear_va_list_locked(hdev, &va_range->list);
	mutex_unlock(&va_range->lock);

	mutex_destroy(&va_range->lock);
	kfree(va_range);
}
/**
 * vm_ctx_init_with_ranges() - initialize virtual memory for context.
 * @ctx: pointer to the habanalabs context structure.
 * @host_range_start: host virtual addresses range start.
 * @host_range_end: host virtual addresses range end.
 * @host_page_size: host page size.
 * @host_huge_range_start: host virtual addresses range start for memory
 *                         allocated with huge pages.
 * @host_huge_range_end: host virtual addresses range end for memory allocated
 *                       with huge pages.
 * @host_huge_page_size: host huge page size.
 * @dram_range_start: dram virtual addresses range start.
 * @dram_range_end: dram virtual addresses range end.
 * @dram_page_size: dram page size.
 *
 * This function initializes the following:
 * - MMU for context.
 * - Virtual address to area descriptor hashtable.
 * - Virtual block list of available virtual memory.
 */
static int vm_ctx_init_with_ranges(struct hl_ctx *ctx,
					u64 host_range_start,
					u64 host_range_end,
					u32 host_page_size,
					u64 host_huge_range_start,
					u64 host_huge_range_end,
					u32 host_huge_page_size,
					u64 dram_range_start,
					u64 dram_range_end,
					u32 dram_page_size)
{
	struct hl_device *hdev = ctx->hdev;
	int i, rc;

	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++) {
		ctx->va_range[i] =
			kzalloc(sizeof(struct hl_va_range), GFP_KERNEL);
		if (!ctx->va_range[i]) {
			rc = -ENOMEM;
			goto free_va_range;
		}
	}

	rc = hl_mmu_ctx_init(ctx);
	if (rc) {
		dev_err(hdev->dev, "failed to init context %d\n", ctx->asid);
		goto free_va_range;
	}

	mutex_init(&ctx->mem_hash_lock);
	hash_init(ctx->mem_hash);

	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);

	rc = va_range_init(hdev, ctx->va_range, HL_VA_RANGE_TYPE_HOST,
				host_range_start, host_range_end, host_page_size);
	if (rc) {
		dev_err(hdev->dev, "failed to init host vm range\n");
		goto mmu_ctx_fini;
	}

	if (hdev->pmmu_huge_range) {
		mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);

		rc = va_range_init(hdev,
					ctx->va_range, HL_VA_RANGE_TYPE_HOST_HUGE,
					host_huge_range_start, host_huge_range_end,
					host_huge_page_size);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init host huge vm range\n");
			goto clear_host_va_range;
		}
	} else {
		kfree(ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
		ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE] =
				ctx->va_range[HL_VA_RANGE_TYPE_HOST];
	}

	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);

	rc = va_range_init(hdev, ctx->va_range, HL_VA_RANGE_TYPE_DRAM,
				dram_range_start, dram_range_end, dram_page_size);
	if (rc) {
		dev_err(hdev->dev, "failed to init dram vm range\n");
		goto clear_host_huge_va_range;
	}

	hl_debugfs_add_ctx_mem_hash(hdev, ctx);

	return 0;

clear_host_huge_va_range:
	mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);

	if (hdev->pmmu_huge_range) {
		mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
		clear_va_list_locked(hdev,
					&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->list);
		mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
	}
clear_host_va_range:
	if (hdev->pmmu_huge_range)
		mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
	mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
	clear_va_list_locked(hdev, &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->list);
	mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
mmu_ctx_fini:
	mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
	mutex_destroy(&ctx->mem_hash_lock);
	hl_mmu_ctx_fini(ctx);
free_va_range:
	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++)
		kfree(ctx->va_range[i]);

	return rc;
}
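/*
 * Note: the pmmu/pmmu_huge/dmmu properties appear to describe their ranges with an exclusive
 * end_addr, while va_range_init() above takes an inclusive end, hence the "end_addr - 1" used
 * below. This reading is inferred from the code itself, not from ASIC documentation.
 */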
int hl_vm_ctx_init(struct hl_ctx *ctx)
{
	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
	u64 host_range_start, host_range_end, host_huge_range_start,
		host_huge_range_end, dram_range_start, dram_range_end;
	u32 host_page_size, host_huge_page_size, dram_page_size;

	atomic64_set(&ctx->dram_phys_mem, 0);

	/*
	 * In case of DRAM mapping, the returned address is the physical
	 * address of the memory related to the given handle.
	 */
	if (ctx->hdev->mmu_disable)
		return 0;

	dram_range_start = prop->dmmu.start_addr;
	dram_range_end = prop->dmmu.end_addr - 1;
	dram_page_size = prop->dram_page_size ?
				prop->dram_page_size : prop->dmmu.page_size;
	host_range_start = prop->pmmu.start_addr;
	host_range_end = prop->pmmu.end_addr - 1;
	host_page_size = prop->pmmu.page_size;
	host_huge_range_start = prop->pmmu_huge.start_addr;
	host_huge_range_end = prop->pmmu_huge.end_addr - 1;
	host_huge_page_size = prop->pmmu_huge.page_size;

	return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
					host_page_size, host_huge_range_start,
					host_huge_range_end, host_huge_page_size,
					dram_range_start, dram_range_end, dram_page_size);
}

/**
 * hl_vm_ctx_fini() - virtual memory teardown of context.
 * @ctx: pointer to the habanalabs context structure.
 *
 * This function performs teardown of the following:
 * - Virtual block list of available virtual memory.
 * - Virtual address to area descriptor hashtable.
 * - MMU for context.
 *
 * In addition this function does the following:
 * - Unmaps the existing hashtable nodes if the hashtable is not empty. The
 *   hashtable should be empty as no valid mappings should exist at this
 *   point.
 * - Frees any existing physical page list from the idr which relates to the
 *   current context asid.
 * - This function checks the virtual block list for correctness. At this point
 *   the list should contain one element which describes the whole virtual
 *   memory range of the context. Otherwise, a warning is printed.
 */
void hl_vm_ctx_fini(struct hl_ctx *ctx)
{
	struct hl_vm_phys_pg_pack *phys_pg_list, *tmp_phys_node;
	struct hl_device *hdev = ctx->hdev;
	struct hl_vm_hash_node *hnode;
	struct hl_vm *vm = &hdev->vm;
	struct hlist_node *tmp_node;
	struct list_head free_list;
	struct hl_mem_in args;
	int i;

	if (hdev->mmu_disable)
		return;

	hl_debugfs_remove_ctx_mem_hash(hdev, ctx);

	/*
	 * Clearly something went wrong on hard reset so no point in printing
	 * another side effect error
	 */
	if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash))
		dev_dbg(hdev->dev,
			"user released device without removing its memory mappings\n");

	hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
		dev_dbg(hdev->dev,
			"hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
			hnode->vaddr, ctx->asid);
		args.unmap.device_virt_addr = hnode->vaddr;
		unmap_device_va(ctx, &args, true);
	}

	mutex_lock(&hdev->mmu_lock);

	/* invalidate the cache once after the unmapping loop */
	hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
	hl_mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK);

	mutex_unlock(&hdev->mmu_lock);

	INIT_LIST_HEAD(&free_list);

	spin_lock(&vm->idr_lock);
	idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
		if (phys_pg_list->asid == ctx->asid) {
			dev_dbg(hdev->dev,
				"page list 0x%px of asid %d is still alive\n",
				phys_pg_list, ctx->asid);

			atomic64_sub(phys_pg_list->total_size, &hdev->dram_used_mem);
			idr_remove(&vm->phys_pg_pack_handles, i);
			list_add(&phys_pg_list->node, &free_list);
		}
	spin_unlock(&vm->idr_lock);

	list_for_each_entry_safe(phys_pg_list, tmp_phys_node, &free_list, node)
		free_phys_pg_pack(hdev, phys_pg_list);

	va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM]);
	va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST]);

	if (hdev->pmmu_huge_range)
		va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);

	mutex_destroy(&ctx->mem_hash_lock);
	hl_mmu_ctx_fini(ctx);

	/* In this case we need to clear the global accounting of DRAM usage
	 * because the user notifies us on allocations. If the user is no more,
	 * all DRAM is available
	 */
	if (ctx->asid != HL_KERNEL_ASID_ID &&
			!hdev->asic_prop.dram_supports_virtual_memory)
		atomic64_set(&hdev->dram_used_mem, 0);
}
/**
 * hl_vm_init() - initialize virtual memory module.
 * @hdev: pointer to the habanalabs device structure.
 *
 * This function initializes the following:
 * - MMU module.
 * - DRAM physical pages pool.
 * - Idr for device memory allocation handles.
 */
int hl_vm_init(struct hl_device *hdev)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct hl_vm *vm = &hdev->vm;
	int rc;

	if (is_power_of_2(prop->dram_page_size))
		vm->dram_pg_pool =
			gen_pool_create(__ffs(prop->dram_page_size), -1);
	else
		vm->dram_pg_pool =
			gen_pool_create(__ffs(DRAM_POOL_PAGE_SIZE), -1);

	if (!vm->dram_pg_pool) {
		dev_err(hdev->dev, "Failed to create dram page pool\n");
		return -ENOMEM;
	}

	kref_init(&vm->dram_pg_pool_refcount);

	rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address,
			prop->dram_end_address - prop->dram_user_base_address,
			-1);

	if (rc) {
		dev_err(hdev->dev,
			"Failed to add memory to dram page pool %d\n", rc);
		goto pool_add_err;
	}

	spin_lock_init(&vm->idr_lock);
	idr_init(&vm->phys_pg_pack_handles);

	atomic64_set(&hdev->dram_used_mem, 0);

	vm->init_done = true;

	return 0;

pool_add_err:
	gen_pool_destroy(vm->dram_pg_pool);

	return rc;
}
/**
 * hl_vm_fini() - virtual memory module teardown.
 * @hdev: pointer to the habanalabs device structure.
 *
 * This function performs teardown of the following:
 * - Idr for device memory allocation handles.
 * - DRAM physical pages pool.
 * - MMU module.
 */
void hl_vm_fini(struct hl_device *hdev)
{
	struct hl_vm *vm = &hdev->vm;

	if (!vm->init_done)
		return;

	/*
	 * At this point all the contexts should be freed and hence no DRAM
	 * memory should be in use, so the DRAM pool should be freed here.
	 */
	if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1)
		dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
				__func__);

	vm->init_done = false;
}

/**
 * hl_hw_block_mem_init() - HW block memory initialization.
 * @ctx: pointer to the habanalabs context structure.
 *
 * This function initializes the HW block virtual mapped addresses list and
 * its lock.
 */
void hl_hw_block_mem_init(struct hl_ctx *ctx)
{
	mutex_init(&ctx->hw_block_list_lock);
	INIT_LIST_HEAD(&ctx->hw_block_mem_list);
}

/**
 * hl_hw_block_mem_fini() - HW block memory teardown.
 * @ctx: pointer to the habanalabs context structure.
 *
 * This function clears the HW block virtual mapped addresses list and destroys
 * its lock.
 */
void hl_hw_block_mem_fini(struct hl_ctx *ctx)
{
	struct hl_vm_hw_block_list_node *lnode, *tmp;

	if (!list_empty(&ctx->hw_block_mem_list))
		dev_crit(ctx->hdev->dev, "HW block mem list isn't empty\n");

	list_for_each_entry_safe(lnode, tmp, &ctx->hw_block_mem_list, node) {
		list_del(&lnode->node);
		kfree(lnode);
	}

	mutex_destroy(&ctx->hw_block_list_lock);
}