Discussion:
[PATCH v7 01/12] iommu/amd: Detect pre enabled translation
Baoquan He
2016-11-25 05:13:08 UTC
Permalink
Add functions to check whether translation is already enabled in IOMMU.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu_init.c | 25 +++++++++++++++++++++++++
drivers/iommu/amd_iommu_proto.h | 1 +
drivers/iommu/amd_iommu_types.h | 4 ++++
3 files changed, 30 insertions(+)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 157e934..5ad1e023 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -251,6 +251,26 @@ static int amd_iommu_enable_interrupts(void);
static int __init iommu_go_to_state(enum iommu_init_state state);
static void init_device_table_dma(void);

+
+bool translation_pre_enabled(struct amd_iommu *iommu)
+{
+ return (iommu->flags & AMD_IOMMU_FLAG_TRANS_PRE_ENABLED);
+}
+
+static void clear_translation_pre_enabled(struct amd_iommu *iommu)
+{
+ iommu->flags &= ~AMD_IOMMU_FLAG_TRANS_PRE_ENABLED;
+}
+
+static void init_translation_status(struct amd_iommu *iommu)
+{
+ u32 ctrl;
+
+ ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ if (ctrl & (1<<CONTROL_IOMMU_EN))
+ iommu->flags |= AMD_IOMMU_FLAG_TRANS_PRE_ENABLED;
+}
+
static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
u8 bank, u8 cntr, u8 fxn,
u64 *value, bool is_write);
@@ -1389,6 +1409,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)

iommu->int_enabled = false;

+ init_translation_status(iommu);
+
+ if (translation_pre_enabled(iommu))
+ pr_warn("Translation is already enabled - trying to copy translation structures\n");
+
ret = init_iommu_from_acpi(iommu, h);
if (ret)
return ret;
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 7eb60c1..9560183 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -93,4 +93,5 @@ static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
return !!(iommu->features & f);
}

+extern bool translation_pre_enabled(struct amd_iommu *iommu);
#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 0d91785..2bbc19d 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -416,6 +416,7 @@ extern struct kmem_cache *amd_iommu_irq_cache;
#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)


+
/*
* This struct is used to pass information about
* incoming PPR faults around.
@@ -434,6 +435,8 @@ struct iommu_domain;
struct irq_domain;
struct amd_irte_ops;

+#define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0)
+
/*
* This structure contains generic data for IOMMU protection domains
* independent of their use.
@@ -566,6 +569,7 @@ struct amd_iommu {
struct amd_irte_ops *irte_ops;
#endif

+ u32 flags;
volatile u64 __aligned(8) cmd_sem;
};
--
2.5.5
Baoquan He
2016-11-25 05:13:09 UTC
Permalink
Move per iommu enabling code into a wrapper function early_enable_iommu().
This can make later kdump change easier.

And also add iommu_disable_command_buffer and iommu_disable_event_buffer
for later usage.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu_init.c | 42 +++++++++++++++++++++++++++++++-----------
1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 5ad1e023..9458f7c 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -625,6 +625,14 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
amd_iommu_reset_cmd_buffer(iommu);
}

+/*
+ * This function disables the command buffer
+ */
+static void iommu_disable_command_buffer(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+}
+
static void __init free_command_buffer(struct amd_iommu *iommu)
{
free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
@@ -657,6 +665,14 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
}

+/*
+ * This function disables the command buffer
+ */
+static void iommu_disable_event_buffer(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+}
+
static void __init free_event_buffer(struct amd_iommu *iommu)
{
free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
@@ -2026,6 +2042,19 @@ static void iommu_enable_ga(struct amd_iommu *iommu)
#endif
}

+static void early_enable_iommu(struct amd_iommu *iommu)
+{
+ iommu_disable(iommu);
+ iommu_init_flags(iommu);
+ iommu_set_device_table(iommu);
+ iommu_enable_command_buffer(iommu);
+ iommu_enable_event_buffer(iommu);
+ iommu_set_exclusion_range(iommu);
+ iommu_enable_ga(iommu);
+ iommu_enable(iommu);
+ iommu_flush_all_caches(iommu);
+}
+
/*
* This function finally enables all IOMMUs found in the system after
* they have been initialized
@@ -2034,17 +2063,8 @@ static void early_enable_iommus(void)
{
struct amd_iommu *iommu;

- for_each_iommu(iommu) {
- iommu_disable(iommu);
- iommu_init_flags(iommu);
- iommu_set_device_table(iommu);
- iommu_enable_command_buffer(iommu);
- iommu_enable_event_buffer(iommu);
- iommu_set_exclusion_range(iommu);
- iommu_enable_ga(iommu);
- iommu_enable(iommu);
- iommu_flush_all_caches(iommu);
- }
+ for_each_iommu(iommu)
+ early_enable_iommu(iommu);

#ifdef CONFIG_IRQ_REMAP
if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
--
2.5.5
Baoquan He
2016-11-25 05:13:10 UTC
Permalink
In amd-vi spec several bits of IO PTE fields and DTE fields are similar
so that both of them can share the same MACRO definition. However
defining their respecitve bit fields can make code more read-able. So
do it in this patch.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu.c | 8 ++++----
drivers/iommu/amd_iommu_types.h | 18 ++++++++++++++----
2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 754595e..0b0e50e 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1476,9 +1476,9 @@ static int iommu_map_page(struct protection_domain *dom,

if (count > 1) {
__pte = PAGE_SIZE_PTE(phys_addr, page_size);
- __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+ __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
} else
- __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
+ __pte = phys_addr | IOMMU_PTE_PR | IOMMU_PTE_FC;

if (prot & IOMMU_PROT_IR)
__pte |= IOMMU_PTE_IR;
@@ -1805,7 +1805,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)

pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
+ pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV;

flags = amd_iommu_dev_table[devid].data[1];

@@ -1848,7 +1848,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
static void clear_dte_entry(u16 devid)
{
/* remove entry from the device table seen by the hardware */
- amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
+ amd_iommu_dev_table[devid].data[0] = DTE_FLAG_V | DTE_FLAG_TV;
amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;

amd_iommu_apply_erratum_63(devid);
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 2bbc19d..6a4378f 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -265,7 +265,7 @@
#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
- IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
+ IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)

#define PM_MAP_4k 0
@@ -314,13 +314,23 @@
#define PTE_LEVEL_PAGE_SIZE(level) \
(1ULL << (12 + (9 * (level))))

-#define IOMMU_PTE_P (1ULL << 0)
-#define IOMMU_PTE_TV (1ULL << 1)
+/*
+ * Bit value definition for I/O PTE fields
+ */
+#define IOMMU_PTE_PR (1ULL << 0)
#define IOMMU_PTE_U (1ULL << 59)
#define IOMMU_PTE_FC (1ULL << 60)
#define IOMMU_PTE_IR (1ULL << 61)
#define IOMMU_PTE_IW (1ULL << 62)

+/*
+ * Bit value definition for DTE fields
+ */
+#define DTE_FLAG_V (1ULL << 0)
+#define DTE_FLAG_TV (1ULL << 1)
+#define DTE_FLAG_IR (1ULL << 61)
+#define DTE_FLAG_IW (1ULL << 62)
+
#define DTE_FLAG_IOTLB (1ULL << 32)
#define DTE_FLAG_GV (1ULL << 55)
#define DTE_FLAG_MASK (0x3ffULL << 32)
@@ -342,7 +352,7 @@
#define GCR3_VALID 0x01ULL

#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
+#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
--
2.5.5
Baoquan He
2016-11-25 05:13:11 UTC
Permalink
Add function copy_dev_tables to copy the old DEV table entries of the panicked
kernel to the new allocated DEV table. Since all iommus share the same DTE table
the copy only need be done once as long as the physical address of old DEV table
is retrieved from iommu reg. Besides, we also need to:

- Check whether all IOMMUs actually use the same device table with the same size

- Verify that the size of the old device table is the expected size.

- Reserve the old domain id occupied in 1st kernel to avoid touching the old
io-page tables. Then on-flight DMA can continue looking it up.

And define MACRO DEV_DOMID_MASK to replace magic number 0xffffULL because
it need be reused in copy_dev_tables.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu.c | 2 +-
drivers/iommu/amd_iommu_init.c | 55 +++++++++++++++++++++++++++++++++++++++++
drivers/iommu/amd_iommu_types.h | 1 +
3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 0b0e50e..d5aef72 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1838,7 +1838,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
flags |= tmp;
}

- flags &= ~(0xffffUL);
+ flags &= ~DEV_DOMID_MASK;
flags |= domain->id;

amd_iommu_dev_table[devid].data[1] = flags;
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 9458f7c..8fc9840 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -834,6 +834,61 @@ static int get_dev_entry_bit(u16 devid, u8 bit)
}


+static int copy_dev_tables(void)
+{
+ struct dev_table_entry *old_devtb = NULL;
+ u32 lo, hi, devid, old_devtb_size;
+ phys_addr_t old_devtb_phys;
+ u64 entry, last_entry = 0;
+ struct amd_iommu *iommu;
+ u16 dom_id, dte_v;
+ static int copied;
+
+ for_each_iommu(iommu) {
+ if (!translation_pre_enabled(iommu)) {
+ pr_err("IOMMU:%d is not pre-enabled!/n",
+ iommu->index);
+ return -1;
+ }
+
+ /* All IOMMUs should use the same device table with the same size */
+ lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
+ hi = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET + 4);
+ entry = (((u64) hi) << 32) + lo;
+ if (last_entry && last_entry != entry) {
+ pr_err("IOMMU:%d should use the same dev table as others!/n",
+ iommu->index);
+ return -1;
+ }
+ last_entry = entry;
+
+ old_devtb_size = ((entry & ~PAGE_MASK) + 1) << 12;
+ if (old_devtb_size != dev_table_size) {
+ pr_err("The device table size of IOMMU:%d is not expected!/n",
+ iommu->index);
+ return -1;
+ }
+
+ old_devtb_phys = entry & PAGE_MASK;
+ old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+ if (!old_devtb)
+ return -1;
+
+ if (copied)
+ continue;
+ for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+ amd_iommu_dev_table[devid] = old_devtb[devid];
+ dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
+ dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
+ if (dte_v && dom_id)
+ __set_bit(dom_id, amd_iommu_pd_alloc_bitmap);
+ }
+ memunmap(old_devtb);
+ copied = 1;
+ }
+ return 0;
+}
+
void amd_iommu_apply_erratum_63(u16 devid)
{
int sysmgt;
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 6a4378f..79ec841 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -336,6 +336,7 @@
#define DTE_FLAG_MASK (0x3ffULL << 32)
#define DTE_GLX_SHIFT (56)
#define DTE_GLX_MASK (3)
+#define DEV_DOMID_MASK 0xffffULL

#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL)
#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL)
--
2.5.5
Baoquan He
2016-11-25 05:13:13 UTC
Permalink
This new call-back will be used to check if the domain attach need be
deferred for now. If yes, the domain attach/detach will return directly.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/iommu.c | 8 ++++++++
include/linux/iommu.h | 1 +
2 files changed, 9 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 9a2f196..0262eee 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1083,6 +1083,10 @@ static int __iommu_attach_device(struct iommu_domain *domain,
struct device *dev)
{
int ret;
+ if ((domain->ops->is_attach_deferred != NULL) &&
+ domain->ops->is_attach_deferred(domain, dev))
+ return 0;
+
if (unlikely(domain->ops->attach_dev == NULL))
return -ENODEV;

@@ -1124,6 +1128,10 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
static void __iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
{
+ if ((domain->ops->is_attach_deferred != NULL) &&
+ domain->ops->is_attach_deferred(domain, dev))
+ return;
+
if (unlikely(domain->ops->detach_dev == NULL))
return;

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 436dc21..e179313 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -200,6 +200,7 @@ struct iommu_ops {
u32 (*domain_get_windows)(struct iommu_domain *domain);

int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
+ bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev);

unsigned long pgsize_bitmap;
};
--
2.5.5
Baoquan He
2016-11-25 05:13:12 UTC
Permalink
Here several things need be done:
- If iommu is pre-enabled in a normal kernel, just disable it and print
warning.

- If failed to copy dev table of old kernel, continue to proceed as
it does in normal kernel.

- Disable and Re-enable event/cmd buffer, install the copied DTE table
to reg, and detect and enable guest vapic.

- Flush all caches

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu_init.c | 51 ++++++++++++++++++++++++++++++++++++------
1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 8fc9840..4233f26 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -36,6 +36,7 @@
#include <asm/io_apic.h>
#include <asm/irq_remapping.h>

+#include <linux/crash_dump.h>
#include "amd_iommu_proto.h"
#include "amd_iommu_types.h"
#include "irq_remapping.h"
@@ -1481,9 +1482,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
iommu->int_enabled = false;

init_translation_status(iommu);
-
- if (translation_pre_enabled(iommu))
- pr_warn("Translation is already enabled - trying to copy translation structures\n");
+ if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
+ iommu_disable(iommu);
+ clear_translation_pre_enabled(iommu);
+ pr_warn("Translation was enabled for IOMMU:%d but we are not in kdump mode\n",
+ iommu->index);
+ }

ret = init_iommu_from_acpi(iommu, h);
if (ret)
@@ -1975,8 +1979,7 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
}

/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
+ * Init the device table to not allow DMA access for devices
*/
static void init_device_table_dma(void)
{
@@ -2117,9 +2120,43 @@ static void early_enable_iommu(struct amd_iommu *iommu)
static void early_enable_iommus(void)
{
struct amd_iommu *iommu;
+ bool is_pre_enabled = false;

- for_each_iommu(iommu)
- early_enable_iommu(iommu);
+ for_each_iommu(iommu) {
+ if (translation_pre_enabled(iommu)) {
+ is_pre_enabled = true;
+ break;
+ }
+ }
+
+ if (!is_pre_enabled) {
+ for_each_iommu(iommu)
+ early_enable_iommu(iommu);
+ } else {
+ pr_warn("Translation is already enabled - trying to copy translation structures\n");
+ if (copy_dev_tables()) {
+ pr_err("Failed to copy DEV table from previous kernel.\n");
+ /*
+ * If failed to copy dev tables from old kernel, continue to proceed
+ * as it does in normal kernel.
+ */
+ for_each_iommu(iommu) {
+ clear_translation_pre_enabled(iommu);
+ early_enable_iommu(iommu);
+ }
+ } else {
+ pr_info("Copied DEV table from previous kernel.\n");
+ for_each_iommu(iommu) {
+ iommu_disable_command_buffer(iommu);
+ iommu_disable_event_buffer(iommu);
+ iommu_enable_command_buffer(iommu);
+ iommu_enable_event_buffer(iommu);
+ iommu_enable_ga(iommu);
+ iommu_set_device_table(iommu);
+ iommu_flush_all_caches(iommu);
+ }
+ }
+ }

#ifdef CONFIG_IRQ_REMAP
if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
--
2.5.5
Baoquan He
2016-11-25 05:13:14 UTC
Permalink
Implement call-back is_attach_deferred and use it to defer the
domain attach from iommu driver init to device driver init when
iommu is pre-enabled in kdump kernel.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu.c | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index d5aef72..fc8ecfb 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -138,6 +138,7 @@ struct iommu_dev_data {
PPR completions */
u32 errata; /* Bitmap for errata to apply */
bool use_vapic; /* Enable device to use vapic mode */
+ bool defer_attach;
};

/*
@@ -340,12 +341,17 @@ static u16 get_alias(struct device *dev)
static struct iommu_dev_data *find_dev_data(u16 devid)
{
struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];

dev_data = search_dev_data(devid);

- if (dev_data == NULL)
+ if (dev_data == NULL) {
dev_data = alloc_dev_data(devid);

+ if (translation_pre_enabled(iommu))
+ dev_data->defer_attach = true;
+ }
+
return dev_data;
}

@@ -2315,11 +2321,18 @@ static void queue_add(struct dma_ops_domain *dma_dom,
static struct protection_domain *get_domain(struct device *dev)
{
struct protection_domain *domain;
+ struct iommu_domain *io_domain;

if (!check_device(dev))
return ERR_PTR(-EINVAL);

domain = get_dev_data(dev)->domain;
+ if (domain == NULL && get_dev_data(dev)->defer_attach) {
+ get_dev_data(dev)->defer_attach = false;
+ io_domain = iommu_get_domain_for_dev(dev);
+ domain = to_pdomain(io_domain);
+ attach_device(dev, domain);
+ }
if (!dma_ops_domain(domain))
return ERR_PTR(-EBUSY);

@@ -3215,6 +3228,13 @@ static void amd_iommu_apply_dm_region(struct device *dev,
WARN_ON_ONCE(reserve_iova(&dma_dom->iovad, start, end) == NULL);
}

+static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct iommu_dev_data *dev_data = dev->archdata.iommu;
+ return dev_data->defer_attach;
+}
+
static const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.domain_alloc = amd_iommu_domain_alloc,
@@ -3231,6 +3251,7 @@ static const struct iommu_ops amd_iommu_ops = {
.get_dm_regions = amd_iommu_get_dm_regions,
.put_dm_regions = amd_iommu_put_dm_regions,
.apply_dm_region = amd_iommu_apply_dm_region,
+ .is_attach_deferred = amd_iommu_is_attach_deferred,
.pgsize_bitmap = AMD_IOMMU_PGSIZES,
};
--
2.5.5
Baoquan He
2016-11-25 05:13:15 UTC
Permalink
Firstly split the dev table entry copy into address translation part and
irq remapping part. Because these two parts could be configured to
be available indepentently.

Secondly check if IntCtl and IntTabLen are 10b and 1000b if they are
set.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu.c | 5 -----
drivers/iommu/amd_iommu_init.c | 25 ++++++++++++++++++++++---
drivers/iommu/amd_iommu_types.h | 8 ++++++++
3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index fc8ecfb..cea90d5 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -3640,11 +3640,6 @@ EXPORT_SYMBOL(amd_iommu_device_info);

static struct irq_chip amd_ir_chip;

-#define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6)
-#define DTE_IRQ_REMAP_INTCTL (2ULL << 60)
-#define DTE_IRQ_TABLE_LEN (8ULL << 1)
-#define DTE_IRQ_REMAP_ENABLE 1ULL
-
static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
{
u64 dte;
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 4233f26..4427c63 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -837,12 +837,12 @@ static int get_dev_entry_bit(u16 devid, u8 bit)

static int copy_dev_tables(void)
{
+ u64 int_ctl, int_tab_len, entry, last_entry = 0;
struct dev_table_entry *old_devtb = NULL;
u32 lo, hi, devid, old_devtb_size;
phys_addr_t old_devtb_phys;
- u64 entry, last_entry = 0;
struct amd_iommu *iommu;
- u16 dom_id, dte_v;
+ u16 dom_id, dte_v, irq_v;
static int copied;

for_each_iommu(iommu) {
@@ -881,8 +881,27 @@ static int copy_dev_tables(void)
amd_iommu_dev_table[devid] = old_devtb[devid];
dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
- if (dte_v && dom_id)
+ if (dte_v && dom_id) {
+ amd_iommu_dev_table[devid].data[0]
+ = old_devtb[devid].data[0];
+ amd_iommu_dev_table[devid].data[1]
+ = old_devtb[devid].data[1];
__set_bit(dom_id, amd_iommu_pd_alloc_bitmap);
+ }
+
+ irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
+ int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
+ int_tab_len = old_devtb[devid].data[2] & DTE_IRQ_TABLE_LEN_MASK;
+ if (irq_v && (int_ctl || int_tab_len)) {
+ if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
+ (int_tab_len != DTE_IRQ_TABLE_LEN)) {
+ pr_err("Wrong old irq remapping flag: %#x\n", devid);
+ return -1;
+ }
+
+ amd_iommu_dev_table[devid].data[2]
+ = old_devtb[devid].data[2];
+ }
}
memunmap(old_devtb);
copied = 1;
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 79ec841..b5ae18e 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -250,6 +250,14 @@

#define GA_GUEST_NR 0x1

+/* Bit value definition for dte irq remapping fields*/
+#define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6)
+#define DTE_IRQ_REMAP_INTCTL_MASK (0x3ULL << 60)
+#define DTE_IRQ_TABLE_LEN_MASK (0xfULL << 1)
+#define DTE_IRQ_REMAP_INTCTL (2ULL << 60)
+#define DTE_IRQ_TABLE_LEN (8ULL << 1)
+#define DTE_IRQ_REMAP_ENABLE 1ULL
+
#define PAGE_MODE_NONE 0x00
#define PAGE_MODE_1_LEVEL 0x01
#define PAGE_MODE_2_LEVEL 0x02
--
2.5.5
Baoquan He
2016-11-25 05:13:16 UTC
Permalink
When in kdump kernel iommu is pre_enabled, if a device is set up with
guest translations (DTE.GV=1), then don't copy GCR3 table root pointer
but move the device over to an empty guest-cr3 table and handle the
faults in the PPR log (which answer them with INVALID). After all these
PPR faults are recoverable for the device and we should not allow the
device to change old-kernels data when we don't have to.

Signed-off-by: Baoquan He <***@redhat.com>
Suggested-by: Joerg Roedel <***@suse.de>
---
drivers/iommu/amd_iommu.c | 26 +++-----------------------
drivers/iommu/amd_iommu_init.c | 11 +++++++++++
drivers/iommu/amd_iommu_proto.h | 1 +
drivers/iommu/amd_iommu_types.h | 22 ++++++++++++++++++++++
drivers/iommu/amd_iommu_v2.c | 18 +++++++++++++++++-
5 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index cea90d5..22520f6 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -120,28 +120,6 @@ int amd_iommu_max_glx_val = -1;
static struct dma_map_ops amd_iommu_dma_ops;

/*
- * This struct contains device specific data for the IOMMU
- */
-struct iommu_dev_data {
- struct list_head list; /* For domain->dev_list */
- struct list_head dev_data_list; /* For global dev_data_list */
- struct protection_domain *domain; /* Domain the device is bound to */
- u16 devid; /* PCI Device ID */
- u16 alias; /* Alias Device ID */
- bool iommu_v2; /* Device can make use of IOMMUv2 */
- bool passthrough; /* Device is identity mapped */
- struct {
- bool enabled;
- int qdep;
- } ats; /* ATS state */
- bool pri_tlp; /* PASID TLB required for
- PPR completions */
- u32 errata; /* Bitmap for errata to apply */
- bool use_vapic; /* Enable device to use vapic mode */
- bool defer_attach;
-};
-
-/*
* general struct to manage commands send to an IOMMU
*/
struct iommu_cmd {
@@ -355,10 +333,11 @@ static struct iommu_dev_data *find_dev_data(u16 devid)
return dev_data;
}

-static struct iommu_dev_data *get_dev_data(struct device *dev)
+struct iommu_dev_data *get_dev_data(struct device *dev)
{
return dev->archdata.iommu;
}
+EXPORT_SYMBOL(get_dev_data);

/*
* Find or create an IOMMU group for a acpihid device.
@@ -2378,6 +2357,7 @@ static int dir2prot(enum dma_data_direction direction)
else
return 0;
}
+
/*
* This function contains common code for mapping of a physically
* contiguous memory region into DMA address space. It is used by all
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 4427c63..d362b63 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -204,6 +204,7 @@ u16 *amd_iommu_alias_table;
* for a specific device. It is also indexed by the PCI device id.
*/
struct amd_iommu **amd_iommu_rlookup_table;
+EXPORT_SYMBOL(amd_iommu_rlookup_table);

/*
* This table is used to find the irq remapping table for a given device id
@@ -257,6 +258,7 @@ bool translation_pre_enabled(struct amd_iommu *iommu)
{
return (iommu->flags & AMD_IOMMU_FLAG_TRANS_PRE_ENABLED);
}
+EXPORT_SYMBOL(translation_pre_enabled);

static void clear_translation_pre_enabled(struct amd_iommu *iommu)
{
@@ -844,6 +846,7 @@ static int copy_dev_tables(void)
struct amd_iommu *iommu;
u16 dom_id, dte_v, irq_v;
static int copied;
+ u64 tmp;

for_each_iommu(iommu) {
if (!translation_pre_enabled(iommu)) {
@@ -887,6 +890,14 @@ static int copy_dev_tables(void)
amd_iommu_dev_table[devid].data[1]
= old_devtb[devid].data[1];
__set_bit(dom_id, amd_iommu_pd_alloc_bitmap);
+ /* If gcr3 table existed, mask it out */
+ if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
+ tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+ tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+ amd_iommu_dev_table[devid].data[1] &= ~tmp;
+ tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A;
+ amd_iommu_dev_table[devid].data[0] &= ~tmp;
+ }
}

irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 9560183..d6a2c36 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -94,4 +94,5 @@ static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
}

extern bool translation_pre_enabled(struct amd_iommu *iommu);
+extern struct iommu_dev_data *get_dev_data(struct device *dev);
#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index b5ae18e..5524cdd 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -612,6 +612,28 @@ struct devid_map {
bool cmd_line;
};

+/*
+ * This struct contains device specific data for the IOMMU
+ */
+struct iommu_dev_data {
+ struct list_head list; /* For domain->dev_list */
+ struct list_head dev_data_list; /* For global dev_data_list */
+ struct protection_domain *domain; /* Domain the device is bound to */
+ u16 devid; /* PCI Device ID */
+ u16 alias; /* Alias Device ID */
+ bool iommu_v2; /* Device can make use of IOMMUv2 */
+ bool passthrough; /* Device is identity mapped */
+ struct {
+ bool enabled;
+ int qdep;
+ } ats; /* ATS state */
+ bool pri_tlp; /* PASID TLB required for
+ PPR completions */
+ u32 errata; /* Bitmap for errata to apply */
+ bool use_vapic; /* Enable device to use vapic mode */
+ bool defer_attach;
+};
+
/* Map HPET and IOAPIC ids to the devid used by the IOMMU */
extern struct list_head ioapic_map;
extern struct list_head hpet_map;
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 594849a..f66110b 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -561,13 +561,29 @@ static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
unsigned long flags;
struct fault *fault;
bool finish;
- u16 tag;
+ u16 tag, devid;
int ret;
+ struct iommu_dev_data *dev_data;
+ struct pci_dev *pdev = NULL;

iommu_fault = data;
tag = iommu_fault->tag & 0x1ff;
finish = (iommu_fault->tag >> 9) & 1;

+ devid = iommu_fault->device_id;
+ pdev = pci_get_bus_and_slot(PCI_BUS_NUM(devid), devid & 0xff);
+ if (!pdev)
+ return -ENODEV;
+ dev_data = get_dev_data(&pdev->dev);
+
+ /* In kdump kernel pci dev is not initialized yet -> send INVALID */
+ if (translation_pre_enabled(amd_iommu_rlookup_table[devid])
+ && dev_data->defer_attach) {
+ amd_iommu_complete_ppr(pdev, iommu_fault->pasid,
+ PPR_INVALID, tag);
+ goto out;
+ }
+
ret = NOTIFY_DONE;
dev_state = get_device_state(iommu_fault->device_id);
if (dev_state == NULL)
--
2.5.5
Baoquan He
2016-11-25 05:13:17 UTC
Permalink
When handle deferred domain attach, we need check if the domain is
v2. If not, should try to clear out the GV flag which could be
copied from the old device table entry.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 22520f6..3a8e4ae 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1839,6 +1839,11 @@ static void clear_dte_entry(u16 devid)
amd_iommu_apply_erratum_63(devid);
}

+static void clear_dte_flag_gv(u16 devid)
+{
+ amd_iommu_dev_table[devid].data[0] &= (~DTE_FLAG_GV);
+}
+
static void do_attach(struct iommu_dev_data *dev_data,
struct protection_domain *domain)
{
@@ -2299,6 +2304,7 @@ static void queue_add(struct dma_ops_domain *dma_dom,
*/
static struct protection_domain *get_domain(struct device *dev)
{
+ struct iommu_dev_data *dev_data = get_dev_data(dev);
struct protection_domain *domain;
struct iommu_domain *io_domain;

@@ -2306,11 +2312,21 @@ static struct protection_domain *get_domain(struct device *dev)
return ERR_PTR(-EINVAL);

domain = get_dev_data(dev)->domain;
- if (domain == NULL && get_dev_data(dev)->defer_attach) {
+ if (domain == NULL && dev_data->defer_attach) {
+ u16 alias = amd_iommu_alias_table[dev_data->devid];
get_dev_data(dev)->defer_attach = false;
io_domain = iommu_get_domain_for_dev(dev);
domain = to_pdomain(io_domain);
attach_device(dev, domain);
+ /*
+ * If the deferred attached domain is not v2, should clear out
+ * the old GV flag.
+ */
+ if (!(domain->flags & PD_IOMMUV2_MASK)) {
+ clear_dte_flag_gv(dev_data->devid);
+ if (alias != dev_data->devid)
+ clear_dte_flag_gv(dev_data->devid);
+ }
}
if (!dma_ops_domain(domain))
return ERR_PTR(-EBUSY);
--
2.5.5
Baoquan He
2016-11-25 05:13:18 UTC
Permalink
In iommu_request_dm_for_dev(), devices of group have all been attached
to newly created direct mapped domain. We should store the domain into
group->domain so that it works for iommu_get_domain_for_dev() and
get_domain().

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/iommu.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 0262eee..a2d1a8f 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1612,6 +1612,7 @@ int iommu_request_dm_for_dev(struct device *dev)
if (group->default_domain)
iommu_domain_free(group->default_domain);
group->default_domain = dm_domain;
+ group->domain = dm_domain;

pr_info("Using direct mapping for device %s\n", dev_name(dev));
--
2.5.5
Baoquan He
2016-11-25 05:13:19 UTC
Permalink
AMD pointed out it's unsafe to update the device-table while iommu
is enabled. It turns out that device-table pointer update is split
up into two 32bit writes in the IOMMU hardware. So updating it while
the IOMMU is enabled could have some nasty side effects.

The only way to work around this is to allocate the device-table below
4GB if translation is pre-enabled in kdump kernel. If allocation failed,
still use the old one.

Signed-off-by: Baoquan He <***@redhat.com>
---
drivers/iommu/amd_iommu_init.c | 12 ++++++++++++
1 file changed, 12 insertions(+)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index d362b63..f17f297 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -2149,11 +2149,23 @@ static void early_enable_iommu(struct amd_iommu *iommu)
*/
static void early_enable_iommus(void)
{
+ struct dev_table_entry *dev_tbl;
struct amd_iommu *iommu;
bool is_pre_enabled = false;

for_each_iommu(iommu) {
if (translation_pre_enabled(iommu)) {
+ gfp_t gfp_flag = GFP_KERNEL | __GFP_ZERO | GFP_DMA32;;
+
+ dev_tbl = (void *)__get_free_pages(gfp_flag,
+ get_order(dev_table_size));
+ if (dev_tbl != NULL) {
+ memcpy(dev_tbl, amd_iommu_dev_table, dev_table_size);
+ free_pages((unsigned long)amd_iommu_dev_table,
+ get_order(dev_table_size));
+ amd_iommu_dev_table = dev_tbl;
+ }
+
is_pre_enabled = true;
break;
}
--
2.5.5
Baoquan He
2016-12-24 03:47:05 UTC
Permalink
Hi Joerg,

Ping!

Could you help review this version? Not sure this could catch up to
v4.10 merging.

Thanks
Baoqaun
This is v7 post.
The principle of the fix is similar to intel iommu. Just defer the assignment
of device to domain to device driver init. In this version of post, a new
call-back is_attach_deferred is added to iommu-ops, it's used to check whether
we need defer the domain attach/detach in iommu-core code.
bnx2 NIC can't reset itself during driver init. Post patch to reset
it during driver init. IO_PAGE_FAULT can't be seen anymore.
Below is link of v5 post.
https://lists.linuxfoundation.org/pipermail/iommu/2016-September/018527.html
- Add sanity check when copy old dev tables.
- If a device is set up with guest translations (DTE.GV=1), then don't
copy that information but move the device over to an empty guest-cr3
table and handle the faults in the PPR log (which just answer them
with INVALID).
- Add is_attach_deferred call-back to iommu-ops. With this domain
can be deferred to device driver init cleanly.
- Allocate memory below 4G for dev table if translation pre-enabled.
AMD engineer pointed out that it's unsafe to update the device-table
while iommu is enabled. device-table pointer update is split up into
two 32bit writes in the IOMMU hardware. So updating it while the IOMMU
is enabled could have some nasty side effects.
iommu/amd: Detect pre enabled translation
iommu/amd: add several helper function
iommu/amd: Define bit fields for DTE particularly
iommu/amd: Add function copy_dev_tables
iommu/amd: copy old trans table from old kernel
iommu: Add is_attach_deferred call-back to iommu-ops
iommu/amd: Use is_attach_deferred call-back
iommu/amd: Add sanity check of irq remap information of old dev table
entry
iommu/amd: Don't copy GCR3 table root pointer
iommu/amd: Clear out the GV flag when handle deferred domain attach
iommu: Assign the direct mapped domain to group->domain
iommu/amd: Allocate memory below 4G for dev table if translation
pre-enabled
drivers/iommu/amd_iommu.c | 78 +++++++++-------
drivers/iommu/amd_iommu_init.c | 201 +++++++++++++++++++++++++++++++++++++---
drivers/iommu/amd_iommu_proto.h | 2 +
drivers/iommu/amd_iommu_types.h | 53 ++++++++++-
drivers/iommu/amd_iommu_v2.c | 18 +++-
drivers/iommu/iommu.c | 9 ++
include/linux/iommu.h | 1 +
7 files changed, 313 insertions(+), 49 deletions(-)
--
2.5.5
Loading...