dma-mapping-fast: reduce TLBI during map
Fastmap relies on minimizing TLB invalidations (TLBI) to improve map performance. At present only a single marker is used to indicate that stale TLB entries might exist. Instead, use a clean bitmap for mapping to further reduce TLB invalidations.

Change-Id: I5a8dffde31f8804811357a83893b6ce0d863e8a9
Signed-off-by: Prakash Gupta <guptap@codeaurora.org>
parent 1526c9c655
commit ad5f02c192
2 changed files with 62 additions and 179 deletions
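For context, here is a minimal userspace model of the allocation policy this patch switches to. It is a hypothetical sketch, not the driver's code: the names NPAGES, live, clean, find_zero, map_one and unmap_one are illustrative stand-ins for mapping->bitmap, mapping->clean_bitmap and the fastmap alloc/free paths. Unmap only clears the "live" bitmap, allocation searches the "clean" bitmap so recently freed (possibly TLB-stale) IOVAs are skipped, and a full invalidate happens only once the clean bitmap is exhausted, at which point it is resynchronised from the live bitmap.

/*
 * Toy userspace model of the two-bitmap scheme (not the kernel code).
 * 'live' plays the role of mapping->bitmap (currently mapped pages);
 * 'clean' plays the role of mapping->clean_bitmap (mapped pages plus
 * pages that were unmapped but may still have stale TLB entries).
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NPAGES 8

static bool live[NPAGES];
static bool clean[NPAGES];
static bool have_stale_tlbs;

static int find_zero(const bool *map)
{
        for (int i = 0; i < NPAGES; i++)
                if (!map[i])
                        return i;
        return -1;
}

static int map_one(void)
{
        int bit = find_zero(clean);     /* skip stale-but-free pages */

        if (bit < 0) {
                if (!have_stale_tlbs)
                        return -1;      /* genuinely out of IOVA space */
                /* Only now pay for a full invalidate, then drop the stale marks. */
                printf("TLBIALL\n");
                memcpy(clean, live, sizeof(clean));
                have_stale_tlbs = false;
                bit = find_zero(clean);
                if (bit < 0)
                        return -1;
        }
        live[bit] = true;
        clean[bit] = true;
        return bit;
}

static void unmap_one(int bit)
{
        /* Unmap never invalidates: clear 'live' only, leave 'clean' set. */
        live[bit] = false;
        have_stale_tlbs = true;
}

int main(void)
{
        for (int i = 0; i < NPAGES; i++)
                map_one();
        unmap_one(2);
        unmap_one(5);
        /* The two allocations below share a single simulated TLBIALL. */
        printf("got %d\n", map_one());
        printf("got %d\n", map_one());
        return 0;
}

In the old scheme a full invalidate fired as soon as an allocation stepped over the single "upcoming stale" marker; with the clean bitmap it fires only when no never-stale IOVA space is left, which is the reduction the commit message describes.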
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
  */
 
 #include <linux/dma-contiguous.h>
@@ -117,197 +117,69 @@ static struct dma_fast_smmu_mapping *dev_get_mapping(struct device *dev)
 	return domain->iova_cookie;
 }
 
-/*
- * Checks if the allocated range (ending at @end) covered the upcoming
- * stale bit. We don't need to know exactly where the range starts since
- * we already know where the candidate search range started. If, starting
- * from the beginning of the candidate search range, we had to step over
- * (or landed directly on top of) the upcoming stale bit, then we return
- * true.
- *
- * Due to wrapping, there are two scenarios we'll need to check: (1) if the
- * range [search_start, upcoming_stale] spans 0 (i.e. search_start >
- * upcoming_stale), and, (2) if the range: [search_start, upcoming_stale]
- * does *not* span 0 (i.e. search_start <= upcoming_stale). And for each
- * of those two scenarios we need to handle three cases: (1) the bit was
- * found before wrapping or
- */
-static bool __bit_covered_stale(unsigned long upcoming_stale,
-				unsigned long search_start,
-				unsigned long end)
-{
-	if (search_start > upcoming_stale) {
-		if (end >= search_start) {
-			/*
-			 * We started searching above upcoming_stale and we
-			 * didn't wrap, so we couldn't have crossed
-			 * upcoming_stale.
-			 */
-			return false;
-		}
-		/*
-		 * We wrapped. Did we cross (or land on top of)
-		 * upcoming_stale?
-		 */
-		return end >= upcoming_stale;
-	}
-
-	if (search_start <= upcoming_stale) {
-		if (end >= search_start) {
-			/*
-			 * We didn't wrap. Did we cross (or land on top
-			 * of) upcoming_stale?
-			 */
-			return end >= upcoming_stale;
-		}
-		/*
-		 * We wrapped. So we must have crossed upcoming_stale
-		 * (since we started searching below it).
-		 */
-		return true;
-	}
-
-	/* we should have covered all logical combinations... */
-	WARN_ON(1);
-	return true;
-}
-
 static dma_addr_t __fast_smmu_alloc_iova(struct dma_fast_smmu_mapping *mapping,
 					 unsigned long attrs,
 					 size_t size)
 {
-	unsigned long bit, prev_search_start, nbits = size >> FAST_PAGE_SHIFT;
+	unsigned long bit, nbits = size >> FAST_PAGE_SHIFT;
 	unsigned long align = (1 << get_order(size)) - 1;
 
-	bit = bitmap_find_next_zero_area(
-		mapping->bitmap, mapping->num_4k_pages, mapping->next_start,
-		nbits, align);
+	bit = bitmap_find_next_zero_area(mapping->clean_bitmap,
+					 mapping->num_4k_pages,
+					 mapping->next_start, nbits, align);
 	if (unlikely(bit > mapping->num_4k_pages)) {
 		/* try wrapping */
 		bit = bitmap_find_next_zero_area(
-			mapping->bitmap, mapping->num_4k_pages, 0, nbits,
+			mapping->clean_bitmap, mapping->num_4k_pages, 0, nbits,
 			align);
-		if (unlikely(bit > mapping->num_4k_pages))
-			return DMA_ERROR_CODE;
+		if (unlikely(bit > mapping->num_4k_pages)) {
+			/*
+			 * If we just re-allocated a VA whose TLB hasn't been
+			 * invalidated since it was last used and unmapped, we
+			 * need to invalidate it here. We actually invalidate
+			 * the entire TLB so that we don't have to invalidate
+			 * the TLB again until we wrap back around.
+			 */
+			if (mapping->have_stale_tlbs) {
+				bool skip_sync = (attrs &
+						  DMA_ATTR_SKIP_CPU_SYNC);
+				struct iommu_domain_geometry *geometry =
+						&(mapping->domain->geometry);
+
+				iommu_tlbiall(mapping->domain);
+				bitmap_copy(mapping->clean_bitmap,
+					    mapping->bitmap,
+					    mapping->num_4k_pages);
+				mapping->have_stale_tlbs = false;
+				av8l_fast_clear_stale_ptes(mapping->pgtbl_ops,
+						geometry->aperture_start,
+						mapping->base,
+						mapping->base +
+							mapping->size - 1,
+						skip_sync);
+				bit = bitmap_find_next_zero_area(
+						mapping->clean_bitmap,
+						mapping->num_4k_pages,
+						0, nbits,
+						align);
+				if (unlikely(bit > mapping->num_4k_pages))
+					return DMA_ERROR_CODE;
+
+			} else {
+				return DMA_ERROR_CODE;
+			}
+		}
 	}
 
 	bitmap_set(mapping->bitmap, bit, nbits);
-	prev_search_start = mapping->next_start;
+	bitmap_set(mapping->clean_bitmap, bit, nbits);
 	mapping->next_start = bit + nbits;
 	if (unlikely(mapping->next_start >= mapping->num_4k_pages))
 		mapping->next_start = 0;
 
-	/*
-	 * If we just re-allocated a VA whose TLB hasn't been invalidated
-	 * since it was last used and unmapped, we need to invalidate it
-	 * here. We actually invalidate the entire TLB so that we don't
-	 * have to invalidate the TLB again until we wrap back around.
-	 */
-	if (mapping->have_stale_tlbs &&
-	    __bit_covered_stale(mapping->upcoming_stale_bit,
-				prev_search_start,
-				bit + nbits - 1)) {
-		bool skip_sync = (attrs & DMA_ATTR_SKIP_CPU_SYNC);
-
-		iommu_tlbiall(mapping->domain);
-		mapping->have_stale_tlbs = false;
-		av8l_fast_clear_stale_ptes(mapping->pgtbl_ops,
-				mapping->domain->geometry.aperture_start,
-				mapping->base,
-				mapping->base + mapping->size - 1,
-				skip_sync);
-	}
-
 	return (bit << FAST_PAGE_SHIFT) + mapping->base;
 }
 
-/*
- * Checks whether the candidate bit will be allocated sooner than the
- * current upcoming stale bit. We can say candidate will be upcoming
- * sooner than the current upcoming stale bit if it lies between the
- * starting bit of the next search range and the upcoming stale bit
- * (allowing for wrap-around).
- *
- * Stated differently, we're checking the relative ordering of three
- * unsigned numbers. So we need to check all 6 (i.e. 3!) permutations,
- * namely:
- *
- *     0 |---A---B---C---| TOP (Case 1)
- *     0 |---A---C---B---| TOP (Case 2)
- *     0 |---B---A---C---| TOP (Case 3)
- *     0 |---B---C---A---| TOP (Case 4)
- *     0 |---C---A---B---| TOP (Case 5)
- *     0 |---C---B---A---| TOP (Case 6)
- *
- * Note that since we're allowing numbers to wrap, the following three
- * scenarios are all equivalent for Case 1:
- *
- *     0 |---A---B---C---| TOP
- *     0 |---C---A---B---| TOP (C has wrapped. This is Case 5.)
- *     0 |---B---C---A---| TOP (C and B have wrapped. This is Case 4.)
- *
- * In any of these cases, if we start searching from A, we will find B
- * before we find C.
- *
- * We can also find two equivalent cases for Case 2:
- *
- *     0 |---A---C---B---| TOP
- *     0 |---B---A---C---| TOP (B has wrapped. This is Case 3.)
- *     0 |---C---B---A---| TOP (B and C have wrapped. This is Case 6.)
- *
- * In any of these cases, if we start searching from A, we will find C
- * before we find B.
- */
-static bool __bit_is_sooner(unsigned long candidate,
-			    struct dma_fast_smmu_mapping *mapping)
-{
-	unsigned long A = mapping->next_start;
-	unsigned long B = candidate;
-	unsigned long C = mapping->upcoming_stale_bit;
-
-	if ((A < B && B < C) ||	/* Case 1 */
-	    (C < A && A < B) ||	/* Case 5 */
-	    (B < C && C < A))	/* Case 4 */
-		return true;
-
-	if ((A < C && C < B) ||	/* Case 2 */
-	    (B < A && A < C) ||	/* Case 3 */
-	    (C < B && B < A))	/* Case 6 */
-		return false;
-
-	/*
-	 * For simplicity, we've been ignoring the possibility of any of
-	 * our three numbers being equal. Handle those cases here (they
-	 * shouldn't happen very often, (I think?)).
-	 */
-
-	/*
-	 * If candidate is the next bit to be searched then it's definitely
-	 * sooner.
-	 */
-	if (A == B)
-		return true;
-
-	/*
-	 * If candidate is the next upcoming stale bit we'll return false
-	 * to avoid doing `upcoming = candidate' in the caller (which would
-	 * be useless since they're already equal)
-	 */
-	if (B == C)
-		return false;
-
-	/*
-	 * If next start is the upcoming stale bit then candidate can't
-	 * possibly be sooner. The "soonest" bit is already selected.
-	 */
-	if (A == C)
-		return false;
-
-	/* We should have covered all logical combinations. */
-	WARN(1, "Well, that's awkward. A=%ld, B=%ld, C=%ld\n", A, B, C);
-	return true;
-}
-
 #ifdef CONFIG_ARM64
 static int __init atomic_pool_init(void)
 {
@@ -381,12 +253,8 @@ static void __fast_smmu_free_iova(struct dma_fast_smmu_mapping *mapping,
 	/*
 	 * We don't invalidate TLBs on unmap. We invalidate TLBs on map
 	 * when we're about to re-allocate a VA that was previously
-	 * unmapped but hasn't yet been invalidated. So we need to keep
-	 * track of which bit is the closest to being re-allocated here.
+	 * unmapped but hasn't yet been invalidated.
 	 */
-	if (__bit_is_sooner(start_bit, mapping))
-		mapping->upcoming_stale_bit = start_bit;
-
 	bitmap_clear(mapping->bitmap, start_bit, nbits);
 	mapping->have_stale_tlbs = true;
 }
@@ -1107,6 +975,14 @@ static struct dma_fast_smmu_mapping *__fast_smmu_create_mapping_sized(
 	if (!fast->bitmap)
 		goto err2;
 
+	fast->clean_bitmap = kzalloc(fast->bitmap_size, GFP_KERNEL |
+					__GFP_NOWARN | __GFP_NORETRY);
+	if (!fast->clean_bitmap)
+		fast->clean_bitmap = vzalloc(fast->bitmap_size);
+
+	if (!fast->clean_bitmap)
+		goto err3;
+
 	spin_lock_init(&fast->lock);
 
 	fast->iovad = kzalloc(sizeof(*fast->iovad), GFP_KERNEL);
@@ -1118,6 +994,8 @@ static struct dma_fast_smmu_mapping *__fast_smmu_create_mapping_sized(
 	return fast;
 
 err_free_bitmap:
+	kvfree(fast->clean_bitmap);
+err3:
 	kvfree(fast->bitmap);
 err2:
 	kfree(fast);
@@ -1184,6 +1062,9 @@ void fast_smmu_put_dma_cookie(struct iommu_domain *domain)
 	if (fast->bitmap)
 		kvfree(fast->bitmap);
 
+	if (fast->clean_bitmap)
+		kvfree(fast->clean_bitmap);
+
 	kfree(fast);
 	domain->iova_cookie = NULL;
 }
 
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
  */
 
 #ifndef __LINUX_DMA_MAPPING_FAST_H
@@ -23,9 +23,11 @@ struct dma_fast_smmu_mapping {
 	size_t		num_4k_pages;
 
 	unsigned int	bitmap_size;
+	/* bitmap has 1s marked only valid mappings */
 	unsigned long	*bitmap;
+	/* clean_bitmap has 1s marked for both valid and stale tlb mappings */
+	unsigned long	*clean_bitmap;
 	unsigned long	next_start;
-	unsigned long	upcoming_stale_bit;
 	bool		have_stale_tlbs;
 
 	dma_addr_t	pgtbl_dma_handle;