dma-mapping-fast: reduce TLBI during map

Fastmap relies on minimizing TLB invalidations (TLBI) to improve map
performance. At present a single marker is used to indicate whether
stale TLB entries might exist. Instead, maintain a second "clean"
bitmap that also tracks stale (unmapped but not yet invalidated)
pages, so that map flushes the TLB only when no clean IOVA space is
left. This further reduces TLB invalidations.

Change-Id: I5a8dffde31f8804811357a83893b6ce0d863e8a9
Signed-off-by: Prakash Gupta <guptap@codeaurora.org>
Prakash Gupta committed 2019-12-24 16:12:45 +05:30
parent 1526c9c655
commit ad5f02c192
2 changed files with 62 additions and 179 deletions
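The change is easier to see outside the driver. Below is a minimal userspace C model of the two-bitmap scheme the diff implements; the names bitmap, clean_bitmap and have_stale_tlbs mirror the patch, while find_zero_area() and tlbiall() are simplified stand-ins for the kernel's bitmap_find_next_zero_area() and iommu_tlbiall() (alignment, next_start and av8l_fast_clear_stale_ptes() are left out):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NPAGES 8

    static bool bitmap[NPAGES];        /* 1 = page currently mapped */
    static bool clean_bitmap[NPAGES];  /* 1 = mapped OR unmapped-but-TLB-stale */
    static bool have_stale_tlbs;
    static int tlb_flushes;

    static void tlbiall(void) { tlb_flushes++; }  /* stand-in for iommu_tlbiall() */

    /* Stand-in for bitmap_find_next_zero_area(); -1 models "bit > num_4k_pages". */
    static int find_zero_area(const bool *map, int nbits)
    {
        for (int bit = 0; bit + nbits <= NPAGES; bit++) {
            int n = 0;

            while (n < nbits && !map[bit + n])
                n++;
            if (n == nbits)
                return bit;
        }
        return -1;
    }

    static int alloc_iova(int nbits)
    {
        int bit = find_zero_area(clean_bitmap, nbits);

        if (bit < 0 && have_stale_tlbs) {
            /* Clean space exhausted: one full flush reclaims every stale
             * page, then clean_bitmap is resynced from bitmap. */
            tlbiall();
            for (int i = 0; i < NPAGES; i++)
                clean_bitmap[i] = bitmap[i];  /* models bitmap_copy() */
            have_stale_tlbs = false;
            bit = find_zero_area(clean_bitmap, nbits);
        }
        if (bit < 0)
            return -1;  /* DMA_ERROR_CODE in the driver */
        for (int i = 0; i < nbits; i++)
            bitmap[bit + i] = clean_bitmap[bit + i] = true;
        return bit;
    }

    static void free_iova(int bit, int nbits)
    {
        /* No TLBI on unmap: only the live bit is cleared; the page stays
         * "dirty" in clean_bitmap until the next full flush. */
        for (int i = 0; i < nbits; i++)
            bitmap[bit + i] = false;
        have_stale_tlbs = true;
    }

    int main(void)
    {
        for (int round = 0; round < 5; round++) {
            int a = alloc_iova(2);

            assert(a >= 0);
            free_iova(a, 2);
        }
        /* 5 rounds x 2 pages over an 8-page space: exactly one flush,
         * triggered when the clean bitmap filled up, not one per reuse. */
        printf("flushes: %d\n", tlb_flushes);
        return 0;
    }

Pre-patch, the allocator tracked a single upcoming_stale_bit and flushed as soon as an allocation stepped over it (see __bit_covered_stale() below), even though plenty of never-used IOVA space might remain; post-patch the flush happens only when clean_bitmap has no free run left.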


@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
  */
 
 #include <linux/dma-contiguous.h>
@@ -117,197 +117,69 @@ static struct dma_fast_smmu_mapping *dev_get_mapping(struct device *dev)
 	return domain->iova_cookie;
 }
 
-/*
- * Checks if the allocated range (ending at @end) covered the upcoming
- * stale bit. We don't need to know exactly where the range starts since
- * we already know where the candidate search range started. If, starting
- * from the beginning of the candidate search range, we had to step over
- * (or landed directly on top of) the upcoming stale bit, then we return
- * true.
- *
- * Due to wrapping, there are two scenarios we'll need to check: (1) if the
- * range [search_start, upcoming_stale] spans 0 (i.e. search_start >
- * upcoming_stale), and, (2) if the range: [search_start, upcoming_stale]
- * does *not* span 0 (i.e. search_start <= upcoming_stale). And for each
- * of those two scenarios we need to handle three cases: (1) the bit was
- * found before wrapping or
- */
-static bool __bit_covered_stale(unsigned long upcoming_stale,
-				unsigned long search_start,
-				unsigned long end)
-{
-	if (search_start > upcoming_stale) {
-		if (end >= search_start) {
-			/*
-			 * We started searching above upcoming_stale and we
-			 * didn't wrap, so we couldn't have crossed
-			 * upcoming_stale.
-			 */
-			return false;
-		}
-		/*
-		 * We wrapped. Did we cross (or land on top of)
-		 * upcoming_stale?
-		 */
-		return end >= upcoming_stale;
-	}
-
-	if (search_start <= upcoming_stale) {
-		if (end >= search_start) {
-			/*
-			 * We didn't wrap. Did we cross (or land on top
-			 * of) upcoming_stale?
-			 */
-			return end >= upcoming_stale;
-		}
-		/*
-		 * We wrapped. So we must have crossed upcoming_stale
-		 * (since we started searching below it).
-		 */
-		return true;
-	}
-
-	/* we should have covered all logical combinations... */
-	WARN_ON(1);
-	return true;
-}
-
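__bit_covered_stale() is being removed, but its case analysis is worth sanity-checking. In unsigned modular arithmetic the branches above collapse to a single distance comparison; the small harness below (a reformulation for checking, not code from the driver) confirms it on both wrapping and non-wrapping examples:

    #include <assert.h>
    #include <stdbool.h>

    /* Equivalent predicate: walking upward from search_start (with
     * wrap-around), we reach upcoming_stale no later than end exactly when
     * end's unsigned distance from search_start is at least stale's. */
    static bool covered(unsigned long stale, unsigned long start, unsigned long end)
    {
        return end - start >= stale - start;  /* unsigned math wraps mod 2^N */
    }

    int main(void)
    {
        /* Scenario 1: search started at 90, stale bit at 10 (wraps past 0). */
        assert(!covered(10, 90, 95)); /* stopped before wrapping */
        assert(!covered(10, 90, 5));  /* wrapped, but stopped short of bit 10 */
        assert(covered(10, 90, 12));  /* wrapped and stepped over bit 10 */

        /* Scenario 2: search started at 20, stale bit at 50 (no wrap). */
        assert(!covered(50, 20, 45)); /* stopped short of bit 50 */
        assert(covered(50, 20, 50));  /* landed directly on top of it */
        assert(covered(50, 20, 3));   /* wrapped, so it must have crossed */
        return 0;
    }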
 static dma_addr_t __fast_smmu_alloc_iova(struct dma_fast_smmu_mapping *mapping,
 					 unsigned long attrs,
 					 size_t size)
 {
-	unsigned long bit, prev_search_start, nbits = size >> FAST_PAGE_SHIFT;
+	unsigned long bit, nbits = size >> FAST_PAGE_SHIFT;
 	unsigned long align = (1 << get_order(size)) - 1;
 
-	bit = bitmap_find_next_zero_area(
-		mapping->bitmap, mapping->num_4k_pages, mapping->next_start,
-		nbits, align);
+	bit = bitmap_find_next_zero_area(mapping->clean_bitmap,
+					 mapping->num_4k_pages,
+					 mapping->next_start, nbits, align);
 	if (unlikely(bit > mapping->num_4k_pages)) {
 		/* try wrapping */
 		bit = bitmap_find_next_zero_area(
-			mapping->bitmap, mapping->num_4k_pages, 0, nbits,
+			mapping->clean_bitmap, mapping->num_4k_pages, 0, nbits,
 			align);
-		if (unlikely(bit > mapping->num_4k_pages))
-			return DMA_ERROR_CODE;
+		if (unlikely(bit > mapping->num_4k_pages)) {
+			/*
+			 * If we just re-allocated a VA whose TLB hasn't been
+			 * invalidated since it was last used and unmapped, we
+			 * need to invalidate it here. We actually invalidate
+			 * the entire TLB so that we don't have to invalidate
+			 * the TLB again until we wrap back around.
+			 */
+			if (mapping->have_stale_tlbs) {
+				bool skip_sync = (attrs &
+						  DMA_ATTR_SKIP_CPU_SYNC);
+				struct iommu_domain_geometry *geometry =
+					&(mapping->domain->geometry);
+
+				iommu_tlbiall(mapping->domain);
+				bitmap_copy(mapping->clean_bitmap,
+					    mapping->bitmap,
+					    mapping->num_4k_pages);
+				mapping->have_stale_tlbs = false;
+				av8l_fast_clear_stale_ptes(mapping->pgtbl_ops,
+						geometry->aperture_start,
+						mapping->base,
+						mapping->base +
+						mapping->size - 1,
+						skip_sync);
+				bit = bitmap_find_next_zero_area(
+						mapping->clean_bitmap,
+						mapping->num_4k_pages,
+						0, nbits,
+						align);
+				if (unlikely(bit > mapping->num_4k_pages))
+					return DMA_ERROR_CODE;
+			} else {
+				return DMA_ERROR_CODE;
+			}
+		}
 	}
 
 	bitmap_set(mapping->bitmap, bit, nbits);
-	prev_search_start = mapping->next_start;
+	bitmap_set(mapping->clean_bitmap, bit, nbits);
 	mapping->next_start = bit + nbits;
 	if (unlikely(mapping->next_start >= mapping->num_4k_pages))
 		mapping->next_start = 0;
 
-	/*
-	 * If we just re-allocated a VA whose TLB hasn't been invalidated
-	 * since it was last used and unmapped, we need to invalidate it
-	 * here. We actually invalidate the entire TLB so that we don't
-	 * have to invalidate the TLB again until we wrap back around.
-	 */
-	if (mapping->have_stale_tlbs &&
-	    __bit_covered_stale(mapping->upcoming_stale_bit,
-				prev_search_start,
-				bit + nbits - 1)) {
-		bool skip_sync = (attrs & DMA_ATTR_SKIP_CPU_SYNC);
-
-		iommu_tlbiall(mapping->domain);
-		mapping->have_stale_tlbs = false;
-		av8l_fast_clear_stale_ptes(mapping->pgtbl_ops,
-				mapping->domain->geometry.aperture_start,
-				mapping->base,
-				mapping->base + mapping->size - 1,
-				skip_sync);
-	}
-
 	return (bit << FAST_PAGE_SHIFT) + mapping->base;
 }
-
-/*
- * Checks whether the candidate bit will be allocated sooner than the
- * current upcoming stale bit. We can say candidate will be upcoming
- * sooner than the current upcoming stale bit if it lies between the
- * starting bit of the next search range and the upcoming stale bit
- * (allowing for wrap-around).
- *
- * Stated differently, we're checking the relative ordering of three
- * unsigned numbers. So we need to check all 6 (i.e. 3!) permutations,
- * namely:
- *
- *     0 |---A---B---C---| TOP (Case 1)
- *     0 |---A---C---B---| TOP (Case 2)
- *     0 |---B---A---C---| TOP (Case 3)
- *     0 |---B---C---A---| TOP (Case 4)
- *     0 |---C---A---B---| TOP (Case 5)
- *     0 |---C---B---A---| TOP (Case 6)
- *
- * Note that since we're allowing numbers to wrap, the following three
- * scenarios are all equivalent for Case 1:
- *
- *     0 |---A---B---C---| TOP
- *     0 |---C---A---B---| TOP (C has wrapped. This is Case 5.)
- *     0 |---B---C---A---| TOP (C and B have wrapped. This is Case 4.)
- *
- * In any of these cases, if we start searching from A, we will find B
- * before we find C.
- *
- * We can also find two equivalent cases for Case 2:
- *
- *     0 |---A---C---B---| TOP
- *     0 |---B---A---C---| TOP (B has wrapped. This is Case 3.)
- *     0 |---C---B---A---| TOP (B and C have wrapped. This is Case 6.)
- *
- * In any of these cases, if we start searching from A, we will find C
- * before we find B.
- */
-static bool __bit_is_sooner(unsigned long candidate,
-			    struct dma_fast_smmu_mapping *mapping)
-{
-	unsigned long A = mapping->next_start;
-	unsigned long B = candidate;
-	unsigned long C = mapping->upcoming_stale_bit;
-
-	if ((A < B && B < C) ||	/* Case 1 */
-	    (C < A && A < B) ||	/* Case 5 */
-	    (B < C && C < A))	/* Case 4 */
-		return true;
-
-	if ((A < C && C < B) ||	/* Case 2 */
-	    (B < A && A < C) ||	/* Case 3 */
-	    (C < B && B < A))	/* Case 6 */
-		return false;
-
-	/*
-	 * For simplicity, we've been ignoring the possibility of any of
-	 * our three numbers being equal. Handle those cases here (they
-	 * shouldn't happen very often, (I think?)).
-	 */
-
-	/*
-	 * If candidate is the next bit to be searched then it's definitely
-	 * sooner.
-	 */
-	if (A == B)
-		return true;
-
-	/*
-	 * If candidate is the next upcoming stale bit we'll return false
-	 * to avoid doing `upcoming = candidate' in the caller (which would
-	 * be useless since they're already equal)
-	 */
-	if (B == C)
-		return false;
-
-	/*
-	 * If next start is the upcoming stale bit then candidate can't
-	 * possibly be sooner. The "soonest" bit is already selected.
-	 */
-	if (A == C)
-		return false;
-
-	/* We should have covered all logical combinations. */
-	WARN(1, "Well, that's awkward. A=%ld, B=%ld, C=%ld\n", A, B, C);
-	return true;
-}
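__bit_is_sooner() (also removed) reduces the same way: B is allocated sooner than C exactly when B's unsigned distance from A = next_start is smaller. A quick harness covering all six permutations plus the tie rules (again a reformulation for checking, not driver code; the degenerate A == B == C case, where the original returns true, is the one divergence):

    #include <assert.h>
    #include <stdbool.h>

    static bool sooner(unsigned long A, unsigned long B, unsigned long C)
    {
        return B - A < C - A;  /* unsigned distances from next_start */
    }

    int main(void)
    {
        assert(sooner(10, 20, 30));   /* Case 1: A < B < C */
        assert(sooner(20, 30, 10));   /* Case 5: C < A < B */
        assert(sooner(30, 10, 20));   /* Case 4: B < C < A (both wrapped) */
        assert(!sooner(10, 30, 20));  /* Case 2: A < C < B */
        assert(!sooner(20, 10, 30));  /* Case 3: B < A < C */
        assert(!sooner(30, 20, 10));  /* Case 6: C < B < A */
        assert(sooner(10, 10, 20));   /* A == B: candidate is searched next */
        assert(!sooner(10, 20, 20));  /* B == C: already the upcoming bit */
        return 0;
    }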
 
 #ifdef CONFIG_ARM64
 static int __init atomic_pool_init(void)
 {
@@ -381,12 +253,8 @@ static void __fast_smmu_free_iova(struct dma_fast_smmu_mapping *mapping,
 	/*
 	 * We don't invalidate TLBs on unmap. We invalidate TLBs on map
 	 * when we're about to re-allocate a VA that was previously
-	 * unmapped but hasn't yet been invalidated. So we need to keep
-	 * track of which bit is the closest to being re-allocated here.
+	 * unmapped but hasn't yet been invalidated.
 	 */
-	if (__bit_is_sooner(start_bit, mapping))
-		mapping->upcoming_stale_bit = start_bit;
-
 	bitmap_clear(mapping->bitmap, start_bit, nbits);
 	mapping->have_stale_tlbs = true;
 }
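In the model from the first sketch, the post-patch unmap contract looks like this (illustration only; it reuses bitmap, clean_bitmap, alloc_iova() and free_iova() from that sketch):

    static void unmap_leaves_page_dirty(void)
    {
        int bit = alloc_iova(1);

        free_iova(bit, 1);
        assert(!bitmap[bit]);       /* no longer a valid mapping */
        assert(clean_bitmap[bit]);  /* still unusable: its TLB entry is stale */
        assert(have_stale_tlbs);    /* next exhaustion triggers tlbiall() */
    }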
@@ -1107,6 +975,14 @@ static struct dma_fast_smmu_mapping *__fast_smmu_create_mapping_sized(
 	if (!fast->bitmap)
 		goto err2;
 
+	fast->clean_bitmap = kzalloc(fast->bitmap_size, GFP_KERNEL |
+				     __GFP_NOWARN | __GFP_NORETRY);
+	if (!fast->clean_bitmap)
+		fast->clean_bitmap = vzalloc(fast->bitmap_size);
+
+	if (!fast->clean_bitmap)
+		goto err3;
+
 	spin_lock_init(&fast->lock);
 
 	fast->iovad = kzalloc(sizeof(*fast->iovad), GFP_KERNEL);
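The kzalloc()-with-__GFP_NOWARN | __GFP_NORETRY-then-vzalloc() fallback mirrors how fast->bitmap itself is allocated, and both outcomes pair correctly with the kvfree() calls in the error and teardown paths. On kernels that provide kvzalloc(), the same allocation could plausibly be written as (a suggestion, not part of the patch):

    fast->clean_bitmap = kvzalloc(fast->bitmap_size, GFP_KERNEL);
    if (!fast->clean_bitmap)
        goto err3;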
@@ -1118,6 +994,8 @@ static struct dma_fast_smmu_mapping *__fast_smmu_create_mapping_sized(
 	return fast;
 
 err_free_bitmap:
+	kvfree(fast->clean_bitmap);
+err3:
 	kvfree(fast->bitmap);
 err2:
 	kfree(fast);
@@ -1184,6 +1062,9 @@ void fast_smmu_put_dma_cookie(struct iommu_domain *domain)
 	if (fast->bitmap)
 		kvfree(fast->bitmap);
 
+	if (fast->clean_bitmap)
+		kvfree(fast->clean_bitmap);
+
 	kfree(fast);
 	domain->iova_cookie = NULL;
 }


@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
  */
 
 #ifndef __LINUX_DMA_MAPPING_FAST_H
@@ -23,9 +23,11 @@ struct dma_fast_smmu_mapping {
 	size_t num_4k_pages;
 	unsigned int bitmap_size;
+	/* bitmap has 1s marked for valid mappings only */
 	unsigned long *bitmap;
+	/* clean_bitmap has 1s marked for both valid and stale TLB mappings */
+	unsigned long *clean_bitmap;
 	unsigned long next_start;
-	unsigned long upcoming_stale_bit;
 	bool have_stale_tlbs;
 	dma_addr_t pgtbl_dma_handle;
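The two fields obey the invariant the comments imply: every bit set in bitmap must also be set in clean_bitmap, and the difference between the two is exactly the stale set that the next iommu_tlbiall() plus bitmap_copy() reclaims. A hypothetical debug helper (not in the patch) could assert this with the kernel's bitmap_subset():

    static inline void fast_smmu_assert_bitmaps(struct dma_fast_smmu_mapping *f)
    {
        /* valid mappings must be a subset of valid-or-stale */
        WARN_ON(!bitmap_subset(f->bitmap, f->clean_bitmap, f->num_4k_pages));
    }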