Discussion:
[PATCH Makedumpfile 0/4] x86_64: Fix page_offset for randomized base enabled
Pratyush Anand
2016-10-24 16:48:41 UTC
Permalink
Patch 1/4 fixes page_offset calculation, so that it is correctly calculated
on KASLR enabled kernel as well.
Patch 2/4 simplifies VA to PA translation. New code has been benchmarked
against old code on a 4T system.
Patch 3/4 and 4/4 is removal of (now) unnecessary code.

I think we should find a way to kill find_vmemmap() as well, so that
VMEMMAP_START can be removed. I have very limited knowledge of x86, so I am
unable to do that as of now.

Pratyush Anand (4):
x86_64: Calculate page_offset from pt_load
x86_64: translate all VA to PA using page table values
x86_64: kill is_vmalloc_addr_x86_64()
x86_64: kill some unused initialization

arch/x86_64.c | 84 ++++++++++++++++++++--------------------------------------
makedumpfile.h | 9 +++----
2 files changed, 32 insertions(+), 61 deletions(-)
--
2.7.4
Pratyush Anand
2016-10-24 16:48:42 UTC
Permalink
Signed-off-by: Pratyush Anand <***@redhat.com>
---
arch/x86_64.c | 10 ++++++++--
makedumpfile.h | 4 ++--
2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86_64.c b/arch/x86_64.c
index ddf7be6bc57b..3a53b4fa03ed 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -187,6 +187,12 @@ vtop4_x86_64(unsigned long vaddr)
{
unsigned long page_dir, pml4, pgd_paddr, pgd_pte, pmd_paddr, pmd_pte;
unsigned long pte_paddr, pte;
+ unsigned long phys_base;
+
+ if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
+ phys_base = info->phys_base;
+ else
+ phys_base = 0;

if (SYMBOL(init_level4_pgt) == NOT_FOUND_SYMBOL) {
ERRMSG("Can't get the symbol of init_level4_pgt.\n");
@@ -196,9 +202,9 @@ vtop4_x86_64(unsigned long vaddr)
/*
* Get PGD.
*/
- page_dir = SYMBOL(init_level4_pgt);
+ page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base;
page_dir += pml4_index(vaddr) * sizeof(unsigned long);
- if (!readmem(VADDR, page_dir, &pml4, sizeof pml4)) {
+ if (!readmem(PADDR, page_dir, &pml4, sizeof pml4)) {
ERRMSG("Can't get pml4 (page_dir:%lx).\n", page_dir);
return NOT_PADDR;
}
diff --git a/makedumpfile.h b/makedumpfile.h
index f0154226bcb8..f64652e34901 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -876,12 +876,12 @@ int is_vmalloc_addr_x86_64(ulong vaddr);
int get_phys_base_x86_64(void);
int get_machdep_info_x86_64(void);
int get_versiondep_info_x86_64(void);
-unsigned long long vaddr_to_paddr_x86_64(unsigned long vaddr);
+unsigned long long vtop4_x86_64(unsigned long vaddr);
#define find_vmemmap() find_vmemmap_x86_64()
#define get_phys_base() get_phys_base_x86_64()
#define get_machdep_info() get_machdep_info_x86_64()
#define get_versiondep_info() get_versiondep_info_x86_64()
-#define vaddr_to_paddr(X) vaddr_to_paddr_x86_64(X)
+#define vaddr_to_paddr(X) vtop4_x86_64(X)
#define is_phys_addr(X) (!is_vmalloc_addr_x86_64(X))
#endif /* x86_64 */
--
2.7.4
Pratyush Anand
2016-10-24 16:51:07 UTC
Permalink
Ohhh.. Please ignore this patch. These temporary changes were lying in
my directory and were sent out by mistake.

Sorry for this noise.

~Pratyush
Post by Pratyush Anand
---
arch/x86_64.c | 10 ++++++++--
makedumpfile.h | 4 ++--
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index ddf7be6bc57b..3a53b4fa03ed 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -187,6 +187,12 @@ vtop4_x86_64(unsigned long vaddr)
{
unsigned long page_dir, pml4, pgd_paddr, pgd_pte, pmd_paddr, pmd_pte;
unsigned long pte_paddr, pte;
+ unsigned long phys_base;
+
+ if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
+ phys_base = info->phys_base;
+ else
+ phys_base = 0;
if (SYMBOL(init_level4_pgt) == NOT_FOUND_SYMBOL) {
ERRMSG("Can't get the symbol of init_level4_pgt.\n");
@@ -196,9 +202,9 @@ vtop4_x86_64(unsigned long vaddr)
/*
* Get PGD.
*/
- page_dir = SYMBOL(init_level4_pgt);
+ page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base;
page_dir += pml4_index(vaddr) * sizeof(unsigned long);
- if (!readmem(VADDR, page_dir, &pml4, sizeof pml4)) {
+ if (!readmem(PADDR, page_dir, &pml4, sizeof pml4)) {
ERRMSG("Can't get pml4 (page_dir:%lx).\n", page_dir);
return NOT_PADDR;
}
diff --git a/makedumpfile.h b/makedumpfile.h
index f0154226bcb8..f64652e34901 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -876,12 +876,12 @@ int is_vmalloc_addr_x86_64(ulong vaddr);
int get_phys_base_x86_64(void);
int get_machdep_info_x86_64(void);
int get_versiondep_info_x86_64(void);
-unsigned long long vaddr_to_paddr_x86_64(unsigned long vaddr);
+unsigned long long vtop4_x86_64(unsigned long vaddr);
#define find_vmemmap() find_vmemmap_x86_64()
#define get_phys_base() get_phys_base_x86_64()
#define get_machdep_info() get_machdep_info_x86_64()
#define get_versiondep_info() get_versiondep_info_x86_64()
-#define vaddr_to_paddr(X) vaddr_to_paddr_x86_64(X)
+#define vaddr_to_paddr(X) vtop4_x86_64(X)
#define is_phys_addr(X) (!is_vmalloc_addr_x86_64(X))
#endif /* x86_64 */
--
2.7.4
Pratyush Anand
2016-10-24 16:48:45 UTC
Permalink
From kernel documentation:
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space

So, it is only the module area which is lying above __START_KERNEL_map.
However, kexec-tools only creates PT_LOAD segments for kernel text region
and crash memory region. So, we can safely remove the check for
!is_vmalloc_addr_x86_64() from get_phys_base_x86_64().

Since, this was the last usage of is_vmalloc_addr_x86_64(), so kill it as
well.

Signed-off-by: Pratyush Anand <***@redhat.com>
---
arch/x86_64.c | 14 +-------------
makedumpfile.h | 3 +--
2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/arch/x86_64.c b/arch/x86_64.c
index fe2764a8bec2..597cdac36dfc 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -21,17 +21,6 @@
extern struct vmap_pfns *gvmem_pfns;
extern int nr_gvmem_pfns;

-int
-is_vmalloc_addr_x86_64(ulong vaddr)
-{
- /*
- * vmalloc, virtual memmap, and module space as VMALLOC space.
- */
- return ((vaddr >= VMALLOC_START && vaddr <= VMALLOC_END)
- || (vaddr >= VMEMMAP_START && vaddr <= VMEMMAP_END)
- || (vaddr >= MODULES_VADDR && vaddr <= MODULES_END));
-}
-
static unsigned long
get_xen_p2m_mfn(void)
{
@@ -75,8 +64,7 @@ get_phys_base_x86_64(void)
info->phys_base = 0; /* default/traditional */

for (i = 0; get_pt_load(i, &phys_start, NULL, &virt_start, NULL); i++) {
- if ((virt_start >= __START_KERNEL_map) &&
- !(is_vmalloc_addr_x86_64(virt_start))) {
+ if (virt_start >= __START_KERNEL_map) {

info->phys_base = phys_start -
(virt_start & ~(__START_KERNEL_map));
diff --git a/makedumpfile.h b/makedumpfile.h
index 13559651feb6..8a96da1f61bd 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -859,7 +859,6 @@ unsigned long long vaddr_to_paddr_x86(unsigned long vaddr);
#endif /* x86 */

#ifdef __x86_64__
-int is_vmalloc_addr_x86_64(ulong vaddr);
int get_phys_base_x86_64(void);
int get_machdep_info_x86_64(void);
int get_versiondep_info_x86_64(void);
@@ -869,7 +868,7 @@ unsigned long long vtop4_x86_64(unsigned long vaddr);
#define get_machdep_info() get_machdep_info_x86_64()
#define get_versiondep_info() get_versiondep_info_x86_64()
#define vaddr_to_paddr(X) vtop4_x86_64(X)
-#define is_phys_addr(X) (!is_vmalloc_addr_x86_64(X))
+#define is_phys_addr(X) stub_true_ul(X)
#endif /* x86_64 */

#ifdef __powerpc64__ /* powerpc64 */
--
2.7.4
Pratyush Anand
2016-10-24 16:48:43 UTC
Permalink
page_offset can always be calculated as 'virtual - physical' for a direct
mapping area on x86. Therefore, remove the version dependent calculation
and use this method.

Signed-off-by: Pratyush Anand <***@redhat.com>
---
arch/x86_64.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/x86_64.c b/arch/x86_64.c
index ddf7be6bc57b..a96fd8ae00a1 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -44,6 +44,24 @@ get_xen_p2m_mfn(void)
return NOT_FOUND_LONG_VALUE;
}

+static int
+get_page_offset_x86_64(void)
+{
+ int i;
+ unsigned long long phys_start;
+ unsigned long long virt_start;
+
+ for (i = 0; get_pt_load(i, &phys_start, NULL, &virt_start, NULL); i++) {
+ if (virt_start >= __START_KERNEL_map) {
+ info->page_offset = virt_start - phys_start;
+ return TRUE;
+ }
+ }
+
+ ERRMSG("Can't get any pt_load to calculate page offset.\n");
+ return FALSE;
+}
+
int
get_phys_base_x86_64(void)
{
@@ -159,10 +177,8 @@ get_versiondep_info_x86_64(void)
else
info->max_physmem_bits = _MAX_PHYSMEM_BITS_2_6_31;

- if (info->kernel_version < KERNEL_VERSION(2, 6, 27))
- info->page_offset = __PAGE_OFFSET_ORIG;
- else
- info->page_offset = __PAGE_OFFSET_2_6_27;
+ if (!get_page_offset_x86_64())
+ return FALSE;

if (info->kernel_version < KERNEL_VERSION(2, 6, 31)) {
info->vmalloc_start = VMALLOC_START_ORIG;
--
2.7.4
Pratyush Anand
2016-10-27 06:03:36 UTC
Permalink
Post by Pratyush Anand
page_offset can always be calculated as 'virtual - physical' for a direct
mapping area on x86. Therefore, remove the version dependent calculation
and use this method.
---
arch/x86_64.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index ddf7be6bc57b..a96fd8ae00a1 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -44,6 +44,24 @@ get_xen_p2m_mfn(void)
return NOT_FOUND_LONG_VALUE;
}
+static int
+get_page_offset_x86_64(void)
+{
+ int i;
+ unsigned long long phys_start;
+ unsigned long long virt_start;
+
+ for (i = 0; get_pt_load(i, &phys_start, NULL, &virt_start, NULL); i++) {
+ if (virt_start >= __START_KERNEL_map) {
OK..So, this is the problem. We should have

if (virt_start < __START_KERNEL_map) {

The kernel text region lies above __START_KERNEL_map; it is linearly
mapped, however it is not a direct mapping. The direct mapping region lies
below it instead. So, page_offset can only be calculated from a region which
is below __START_KERNEL_map.

Thanks Baoquan for finding it.

~Pratyush
Post by Pratyush Anand
+ info->page_offset = virt_start - phys_start;
+ return TRUE;
+ }
+ }
+
+ ERRMSG("Can't get any pt_load to calculate page offset.\n");
+ return FALSE;
+}
+
int
get_phys_base_x86_64(void)
{
@@ -159,10 +177,8 @@ get_versiondep_info_x86_64(void)
else
info->max_physmem_bits = _MAX_PHYSMEM_BITS_2_6_31;
- if (info->kernel_version < KERNEL_VERSION(2, 6, 27))
- info->page_offset = __PAGE_OFFSET_ORIG;
- else
- info->page_offset = __PAGE_OFFSET_2_6_27;
+ if (!get_page_offset_x86_64())
+ return FALSE;
if (info->kernel_version < KERNEL_VERSION(2, 6, 31)) {
info->vmalloc_start = VMALLOC_START_ORIG;
Atsushi Kumagai
2016-11-02 07:40:27 UTC
Permalink
Hello Pratyush,
Post by Pratyush Anand
Post by Pratyush Anand
page_offset can always be calculated as 'virtual - physical' for a direct
mapping area on x86. Therefore, remove the version dependent calculation
and use this method.
---
arch/x86_64.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index ddf7be6bc57b..a96fd8ae00a1 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -44,6 +44,24 @@ get_xen_p2m_mfn(void)
return NOT_FOUND_LONG_VALUE;
}
+static int
+get_page_offset_x86_64(void)
+{
+ int i;
+ unsigned long long phys_start;
+ unsigned long long virt_start;
+
+ for (i = 0; get_pt_load(i, &phys_start, NULL, &virt_start, NULL); i++) {
+ if (virt_start >= __START_KERNEL_map) {
OK..So, this is the problem. We should have
if (virt_start < __START_KERNEL_map) {
Kernel text region lies above __START_KERNEL_map, which is linearly
mapped however not a direct mapping. Direct mapping region lies below it
instead. So, page_offset can only be calculated with a region which is
below __START_KERNEL_map.
Thanks Baoquan for finding it.
Could you explain the issue in more detail, what is the difference
between "linear map" and "direct map" ?

BTW, I confirmed that the difference of v2 patch is only this and
the result of benchmark looks great, thank you very much.


Regards,
Atsushi Kumagai
Post by Pratyush Anand
Post by Pratyush Anand
+ info->page_offset = virt_start - phys_start;
+ return TRUE;
+ }
+ }
+
+ ERRMSG("Can't get any pt_load to calculate page offset.\n");
+ return FALSE;
+}
+
int
get_phys_base_x86_64(void)
{
@@ -159,10 +177,8 @@ get_versiondep_info_x86_64(void)
else
info->max_physmem_bits = _MAX_PHYSMEM_BITS_2_6_31;
- if (info->kernel_version < KERNEL_VERSION(2, 6, 27))
- info->page_offset = __PAGE_OFFSET_ORIG;
- else
- info->page_offset = __PAGE_OFFSET_2_6_27;
+ if (!get_page_offset_x86_64())
+ return FALSE;
if (info->kernel_version < KERNEL_VERSION(2, 6, 31)) {
info->vmalloc_start = VMALLOC_START_ORIG;
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
b***@redhat.com
2016-11-02 08:02:05 UTC
Permalink
Post by Atsushi Kumagai
Hello Pratyush,
Post by Pratyush Anand
Post by Pratyush Anand
page_offset can always be calculated as 'virtual - physical' for a direct
mapping area on x86. Therefore, remove the version dependent calculation
and use this method.
---
arch/x86_64.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index ddf7be6bc57b..a96fd8ae00a1 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -44,6 +44,24 @@ get_xen_p2m_mfn(void)
return NOT_FOUND_LONG_VALUE;
}
+static int
+get_page_offset_x86_64(void)
+{
+ int i;
+ unsigned long long phys_start;
+ unsigned long long virt_start;
+
+ for (i = 0; get_pt_load(i, &phys_start, NULL, &virt_start, NULL); i++) {
+ if (virt_start >= __START_KERNEL_map) {
OK..So, this is the problem. We should have
if (virt_start < __START_KERNEL_map) {
Kernel text region lies above __START_KERNEL_map, which is linearly
mapped however not a direct mapping. Direct mapping region lies below it
instead. So, page_offset can only be calculated with a region which is
below __START_KERNEL_map.
Thanks Baoquan for finding it.
Could you explain the issue in more detail, what is the difference
between "linear map" and "direct map" ?
Well, these could be easily misunderstood. Virtual address in kernel text
region or within direct mapping which is from PAGE_OFFSET are all linear
mapping. Because the related PA can be calculated by:
PA= VA - PAGE_OFFSET; //y=x - a;
PA= VA - __START_KERNEL_map + phys_base; //y=x-a+b

In user-space kexec-tools we created pt_load program segments for kernel
text region and crash memory regions. The first program header is kernel
text region. So here if we want to get page_offset, just need to ignore
the 1st one, kernel text region.
Post by Atsushi Kumagai
BTW, I confirmed that the difference of v2 patch is only this and
the result of benchmark looks great, thank you very much.
Regards,
Atsushi Kumagai
Post by Pratyush Anand
Post by Pratyush Anand
+ info->page_offset = virt_start - phys_start;
+ return TRUE;
+ }
+ }
+
+ ERRMSG("Can't get any pt_load to calculate page offset.\n");
+ return FALSE;
+}
+
int
get_phys_base_x86_64(void)
{
@@ -159,10 +177,8 @@ get_versiondep_info_x86_64(void)
else
info->max_physmem_bits = _MAX_PHYSMEM_BITS_2_6_31;
- if (info->kernel_version < KERNEL_VERSION(2, 6, 27))
- info->page_offset = __PAGE_OFFSET_ORIG;
- else
- info->page_offset = __PAGE_OFFSET_2_6_27;
+ if (!get_page_offset_x86_64())
+ return FALSE;
if (info->kernel_version < KERNEL_VERSION(2, 6, 31)) {
info->vmalloc_start = VMALLOC_START_ORIG;
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
Atsushi Kumagai
2016-11-04 10:35:49 UTC
Permalink
Hello,
Post by b***@redhat.com
Post by Atsushi Kumagai
Hello Pratyush,
Post by Pratyush Anand
Post by Pratyush Anand
page_offset can always be calculated as 'virtual - physical' for a direct
mapping area on x86. Therefore, remove the version dependent calculation
and use this method.
---
arch/x86_64.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index ddf7be6bc57b..a96fd8ae00a1 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -44,6 +44,24 @@ get_xen_p2m_mfn(void)
return NOT_FOUND_LONG_VALUE;
}
+static int
+get_page_offset_x86_64(void)
+{
+ int i;
+ unsigned long long phys_start;
+ unsigned long long virt_start;
+
+ for (i = 0; get_pt_load(i, &phys_start, NULL, &virt_start, NULL); i++) {
+ if (virt_start >= __START_KERNEL_map) {
OK..So, this is the problem. We should have
if (virt_start < __START_KERNEL_map) {
Kernel text region lies above __START_KERNEL_map, which is linearly
mapped however not a direct mapping. Direct mapping region lies below it
instead. So, page_offset can only be calculated with a region which is
below __START_KERNEL_map.
Thanks Baoquan for finding it.
Could you explain the issue in more detail, what is the difference
between "linear map" and "direct map" ?
Well, these could be easily misunderstood. Virtual address in kernel text
region or within direct mapping which is from PAGE_OFFSET are all linear
PA= VA - PAGE_OFFSET; //y=x - a;
PA= VA - __START_KERNEL_map + phys_base; //y=x-a+b
In user-space kexec-tools we created pt_load program segments for kernel
text region and crash memory regions. The first program header is kernel
text region. So here if we want to get page_offset, just need to ignore
the 1st one, kernel text region.
Thanks, I understand.
Now I have no concerns.

Regards,
Atsushi Kumagai
Post by b***@redhat.com
Post by Atsushi Kumagai
BTW, I confirmed that the difference of v2 patch is only this and
the result of benchmark looks great, thank you very much.
Regards,
Atsushi Kumagai
Post by Pratyush Anand
Post by Pratyush Anand
+ info->page_offset = virt_start - phys_start;
+ return TRUE;
+ }
+ }
+
+ ERRMSG("Can't get any pt_load to calculate page offset.\n");
+ return FALSE;
+}
+
int
get_phys_base_x86_64(void)
{
@@ -159,10 +177,8 @@ get_versiondep_info_x86_64(void)
else
info->max_physmem_bits = _MAX_PHYSMEM_BITS_2_6_31;
- if (info->kernel_version < KERNEL_VERSION(2, 6, 27))
- info->page_offset = __PAGE_OFFSET_ORIG;
- else
- info->page_offset = __PAGE_OFFSET_2_6_27;
+ if (!get_page_offset_x86_64())
+ return FALSE;
if (info->kernel_version < KERNEL_VERSION(2, 6, 31)) {
info->vmalloc_start = VMALLOC_START_ORIG;
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
Pratyush Anand
2016-10-24 16:48:46 UTC
Permalink
VMALLOC_START, VMALLOC_END, MODULES_VADDR and MODULES_END are no longer
needed for x86_64 now. So, kill their initialization.

Signed-off-by: Pratyush Anand <***@redhat.com>
---
arch/x86_64.c | 4 ----
makedumpfile.h | 2 --
2 files changed, 6 deletions(-)

diff --git a/arch/x86_64.c b/arch/x86_64.c
index 597cdac36dfc..13990cef839b 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -169,13 +169,9 @@ get_versiondep_info_x86_64(void)
return FALSE;

if (info->kernel_version < KERNEL_VERSION(2, 6, 31)) {
- info->vmalloc_start = VMALLOC_START_ORIG;
- info->vmalloc_end = VMALLOC_END_ORIG;
info->vmemmap_start = VMEMMAP_START_ORIG;
info->vmemmap_end = VMEMMAP_END_ORIG;
} else {
- info->vmalloc_start = VMALLOC_START_2_6_31;
- info->vmalloc_end = VMALLOC_END_2_6_31;
info->vmemmap_start = VMEMMAP_START_2_6_31;
info->vmemmap_end = VMEMMAP_END_2_6_31;
}
diff --git a/makedumpfile.h b/makedumpfile.h
index 8a96da1f61bd..338c651388f0 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -575,8 +575,6 @@ int get_va_bits_arm64(void);
#define __START_KERNEL_map (0xffffffff80000000)
#define KERNEL_IMAGE_SIZE_ORIG (0x0000000008000000) /* 2.6.25, or former */
#define KERNEL_IMAGE_SIZE_2_6_26 (0x0000000020000000) /* 2.6.26, or later */
-#define MODULES_VADDR (__START_KERNEL_map + NUMBER(KERNEL_IMAGE_SIZE))
-#define MODULES_END (0xfffffffffff00000)
#define KVBASE PAGE_OFFSET
#define _SECTION_SIZE_BITS (27)
#define _MAX_PHYSMEM_BITS_ORIG (40)
--
2.7.4
Pratyush Anand
2016-10-24 16:48:44 UTC
Permalink
Currently we translate some of the VA areas using linear mapping while some
other(which can not be linearly mapped) using page table.

However, we will have entry of a page in the page table irrespective of its
virtual region. So, we can always look into page table for any VA to PA
translation. This approach will solve lot of complexity in makedumpfile. It
will in turn remove dependency over variables like VMALLOC_START,
MODULES_VADDR etc whose definition keeps changing in newer kernel version.

Moreover, I do not see any side effect of this approach in terms of
execution timing. I tested with IBM x3950 X6 machine having 4136359 MB of
memory. These are the results of makedumpfile execution time:

Without this patch:
===================
With -d 31:
Trial 1: 237.59526248 S
Trial 2: 235.236914962 S
Trial 3: 237.678712045 S

With -d 1:
Trial 1: 2548.905296877 S
Trial 2: 2549.759881756 S

With this patch:
===================
With -d 31:
Trial 1: 232.713841516 S
Trial 2: 228.45697177 S
Trial 3: 232.942262441 S

With -d 1:
Trial 1: 2768.424565806 S
Trial 2: 2749.622115455 S
Trial 3: 2537.770359073 S

Signed-off-by: Pratyush Anand <***@redhat.com>
---
arch/x86_64.c | 42 ++++++++----------------------------------
makedumpfile.h | 4 ++--
2 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/arch/x86_64.c b/arch/x86_64.c
index a96fd8ae00a1..fe2764a8bec2 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -203,6 +203,12 @@ vtop4_x86_64(unsigned long vaddr)
{
unsigned long page_dir, pml4, pgd_paddr, pgd_pte, pmd_paddr, pmd_pte;
unsigned long pte_paddr, pte;
+ unsigned long phys_base;
+
+ if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
+ phys_base = info->phys_base;
+ else
+ phys_base = 0;

if (SYMBOL(init_level4_pgt) == NOT_FOUND_SYMBOL) {
ERRMSG("Can't get the symbol of init_level4_pgt.\n");
@@ -212,9 +218,9 @@ vtop4_x86_64(unsigned long vaddr)
/*
* Get PGD.
*/
- page_dir = SYMBOL(init_level4_pgt);
+ page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base;
page_dir += pml4_index(vaddr) * sizeof(unsigned long);
- if (!readmem(VADDR, page_dir, &pml4, sizeof pml4)) {
+ if (!readmem(PADDR, page_dir, &pml4, sizeof pml4)) {
ERRMSG("Can't get pml4 (page_dir:%lx).\n", page_dir);
return NOT_PADDR;
}
@@ -285,38 +291,6 @@ vtop4_x86_64(unsigned long vaddr)
return (pte & ENTRY_MASK) + PAGEOFFSET(vaddr);
}

-unsigned long long
-vaddr_to_paddr_x86_64(unsigned long vaddr)
-{
- unsigned long phys_base;
- unsigned long long paddr;
-
- /*
- * Check the relocatable kernel.
- */
- if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
- phys_base = info->phys_base;
- else
- phys_base = 0;
-
- if (is_vmalloc_addr_x86_64(vaddr)) {
- if ((paddr = vtop4_x86_64(vaddr)) == NOT_PADDR) {
- ERRMSG("Can't convert a virtual address(%lx) to " \
- "physical address.\n", vaddr);
- return NOT_PADDR;
- }
- } else if (vaddr >= __START_KERNEL_map) {
- paddr = vaddr - __START_KERNEL_map + phys_base;
-
- } else {
- if (is_xen_memory())
- paddr = vaddr - PAGE_OFFSET_XEN_DOM0;
- else
- paddr = vaddr - PAGE_OFFSET;
- }
- return paddr;
-}
-
/*
* for Xen extraction
*/
diff --git a/makedumpfile.h b/makedumpfile.h
index a5955ff750e5..13559651feb6 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -863,12 +863,12 @@ int is_vmalloc_addr_x86_64(ulong vaddr);
int get_phys_base_x86_64(void);
int get_machdep_info_x86_64(void);
int get_versiondep_info_x86_64(void);
-unsigned long long vaddr_to_paddr_x86_64(unsigned long vaddr);
+unsigned long long vtop4_x86_64(unsigned long vaddr);
#define find_vmemmap() find_vmemmap_x86_64()
#define get_phys_base() get_phys_base_x86_64()
#define get_machdep_info() get_machdep_info_x86_64()
#define get_versiondep_info() get_versiondep_info_x86_64()
-#define vaddr_to_paddr(X) vaddr_to_paddr_x86_64(X)
+#define vaddr_to_paddr(X) vtop4_x86_64(X)
#define is_phys_addr(X) (!is_vmalloc_addr_x86_64(X))
#endif /* x86_64 */
--
2.7.4
Atsushi Kumagai
2016-10-25 09:20:28 UTC
Permalink
Hello Pratyush,
Post by Pratyush Anand
Currently we translate some of the VA areas using linear mapping while some
other(which can not be linearly mapped) using page table.
However, we will have entry of a page in the page table irrespective of its
virtual region. So, we can always look into page table for any VA to PA
translation. This approach will solve lot of complexity in makedumpfile. It
will in turn remove dependency over variables like VMALLOC_START,
MODULES_VADDR etc whose definition keeps changing in newer kernel version.
Moreover, I do not see any side effect of this approach in terms of
execution timing. I tested with IBM x3950 X6 machine having 4136359 MB of
===================
Trial 1: 237.59526248 S
Trial 2: 235.236914962 S
Trail 3: 237.678712045 S
Trial 1: 2548.905296877 S
Trial 2: 2549.759881756 S
===================
Trial 1: 232.713841516 S
Trial 2: 228.45697177 S
Trail 3: 232.942262441 S
Trial 1: 2768.424565806 S
Trial 2: 2749.622115455 S
Trail 3: 2537.770359073 S
Could you increase the number of trials ?
If the average time is close to the results of Trial 1 (2768s) and 2 (2749s),
the regression rate is 8% and it sounds neither large nor small.
If the average is a level of 2500s like Trial 3, it's ideal.
Post by Pratyush Anand
---
arch/x86_64.c | 42 ++++++++----------------------------------
makedumpfile.h | 4 ++--
2 files changed, 10 insertions(+), 36 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index a96fd8ae00a1..fe2764a8bec2 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -203,6 +203,12 @@ vtop4_x86_64(unsigned long vaddr)
{
unsigned long page_dir, pml4, pgd_paddr, pgd_pte, pmd_paddr, pmd_pte;
unsigned long pte_paddr, pte;
+ unsigned long phys_base;
+
+ if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
+ phys_base = info->phys_base;
+ else
+ phys_base = 0;
if (SYMBOL(init_level4_pgt) == NOT_FOUND_SYMBOL) {
ERRMSG("Can't get the symbol of init_level4_pgt.\n");
@@ -212,9 +218,9 @@ vtop4_x86_64(unsigned long vaddr)
/*
* Get PGD.
*/
- page_dir = SYMBOL(init_level4_pgt);
+ page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base;
I want to confirm that this VA to PA translation is always safe,
otherwise we should do the condition check which was done in
vaddr_to_paddr_x86_64(), isn't it ?


Thanks,
Atsushi Kumagai
Post by Pratyush Anand
page_dir += pml4_index(vaddr) * sizeof(unsigned long);
- if (!readmem(VADDR, page_dir, &pml4, sizeof pml4)) {
+ if (!readmem(PADDR, page_dir, &pml4, sizeof pml4)) {
ERRMSG("Can't get pml4 (page_dir:%lx).\n", page_dir);
return NOT_PADDR;
}
@@ -285,38 +291,6 @@ vtop4_x86_64(unsigned long vaddr)
return (pte & ENTRY_MASK) + PAGEOFFSET(vaddr);
}
-unsigned long long
-vaddr_to_paddr_x86_64(unsigned long vaddr)
-{
- unsigned long phys_base;
- unsigned long long paddr;
-
- /*
- * Check the relocatable kernel.
- */
- if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
- phys_base = info->phys_base;
- else
- phys_base = 0;
-
- if (is_vmalloc_addr_x86_64(vaddr)) {
- if ((paddr = vtop4_x86_64(vaddr)) == NOT_PADDR) {
- ERRMSG("Can't convert a virtual address(%lx) to " \
- "physical address.\n", vaddr);
- return NOT_PADDR;
- }
- } else if (vaddr >= __START_KERNEL_map) {
- paddr = vaddr - __START_KERNEL_map + phys_base;
-
- } else {
- if (is_xen_memory())
- paddr = vaddr - PAGE_OFFSET_XEN_DOM0;
- else
- paddr = vaddr - PAGE_OFFSET;
- }
- return paddr;
-}
-
/*
* for Xen extraction
*/
diff --git a/makedumpfile.h b/makedumpfile.h
index a5955ff750e5..13559651feb6 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -863,12 +863,12 @@ int is_vmalloc_addr_x86_64(ulong vaddr);
int get_phys_base_x86_64(void);
int get_machdep_info_x86_64(void);
int get_versiondep_info_x86_64(void);
-unsigned long long vaddr_to_paddr_x86_64(unsigned long vaddr);
+unsigned long long vtop4_x86_64(unsigned long vaddr);
#define find_vmemmap() find_vmemmap_x86_64()
#define get_phys_base() get_phys_base_x86_64()
#define get_machdep_info() get_machdep_info_x86_64()
#define get_versiondep_info() get_versiondep_info_x86_64()
-#define vaddr_to_paddr(X) vaddr_to_paddr_x86_64(X)
+#define vaddr_to_paddr(X) vtop4_x86_64(X)
#define is_phys_addr(X) (!is_vmalloc_addr_x86_64(X))
#endif /* x86_64 */
--
2.7.4
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
Pratyush Anand
2016-10-25 15:12:02 UTC
Permalink
Hi Atsushi,

Thanks for the review.
Post by Atsushi Kumagai
Hello Pratysh,
Post by Pratyush Anand
Currently we translate some of the VA areas using linear mapping while some
other(which can not be linearly mapped) using page table.
However, we will have entry of a page in the page table irrespective of its
virtual region. So, we can always look into page table for any VA to PA
translation. This approach will solve lot of complexity in makedumpfile. It
will in turn remove dependency over variables like VMALLOC_START,
MODULES_VADDR etc whose definition keeps changing in newer kernel version.
Moreover, I do not see any side effect of this approach in terms of
execution timing. I tested with IBM x3950 X6 machine having 4136359 MB of
===================
Trial 1: 237.59526248 S
Trial 2: 235.236914962 S
Trail 3: 237.678712045 S
Trial 1: 2548.905296877 S
Trial 2: 2549.759881756 S
===================
Trial 1: 232.713841516 S
Trial 2: 228.45697177 S
Trail 3: 232.942262441 S
Trial 1: 2768.424565806 S
Trial 2: 2749.622115455 S
Trail 3: 2537.770359073 S
Could you increase the number of trials ?
OK, I can do that. Might take some time, as I will have to arrange that
high memory machine again.
Post by Atsushi Kumagai
If the average time is close to the results of Trial 1 (2768s) and 2 (2749s),
the regression rate is 8% and it sounds neither large nor small.
If the average is a level of 2500s like Trial 3, it's ideal.
Post by Pratyush Anand
---
arch/x86_64.c | 42 ++++++++----------------------------------
makedumpfile.h | 4 ++--
2 files changed, 10 insertions(+), 36 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index a96fd8ae00a1..fe2764a8bec2 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -203,6 +203,12 @@ vtop4_x86_64(unsigned long vaddr)
{
unsigned long page_dir, pml4, pgd_paddr, pgd_pte, pmd_paddr, pmd_pte;
unsigned long pte_paddr, pte;
+ unsigned long phys_base;
+
+ if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
+ phys_base = info->phys_base;
+ else
+ phys_base = 0;
if (SYMBOL(init_level4_pgt) == NOT_FOUND_SYMBOL) {
ERRMSG("Can't get the symbol of init_level4_pgt.\n");
@@ -212,9 +218,9 @@ vtop4_x86_64(unsigned long vaddr)
/*
* Get PGD.
*/
- page_dir = SYMBOL(init_level4_pgt);
+ page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base;
I want to confirm that this VA to PA translation is always safe,
otherwise we should do the condition check which was done in
vaddr_to_paddr_x86_64(), isn't it ?
I think this should be safe, however x86 expert can comment better.
Baoquan any comment here?

~Pratyush
Post by Atsushi Kumagai
Thanks,
Atsushi Kumagai
Post by Pratyush Anand
page_dir += pml4_index(vaddr) * sizeof(unsigned long);
- if (!readmem(VADDR, page_dir, &pml4, sizeof pml4)) {
+ if (!readmem(PADDR, page_dir, &pml4, sizeof pml4)) {
ERRMSG("Can't get pml4 (page_dir:%lx).\n", page_dir);
return NOT_PADDR;
}
@@ -285,38 +291,6 @@ vtop4_x86_64(unsigned long vaddr)
return (pte & ENTRY_MASK) + PAGEOFFSET(vaddr);
}
-unsigned long long
-vaddr_to_paddr_x86_64(unsigned long vaddr)
-{
- unsigned long phys_base;
- unsigned long long paddr;
-
- /*
- * Check the relocatable kernel.
- */
- if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
- phys_base = info->phys_base;
- else
- phys_base = 0;
-
- if (is_vmalloc_addr_x86_64(vaddr)) {
- if ((paddr = vtop4_x86_64(vaddr)) == NOT_PADDR) {
- ERRMSG("Can't convert a virtual address(%lx) to " \
- "physical address.\n", vaddr);
- return NOT_PADDR;
- }
- } else if (vaddr >= __START_KERNEL_map) {
- paddr = vaddr - __START_KERNEL_map + phys_base;
-
- } else {
- if (is_xen_memory())
- paddr = vaddr - PAGE_OFFSET_XEN_DOM0;
- else
- paddr = vaddr - PAGE_OFFSET;
- }
- return paddr;
-}
-
/*
* for Xen extraction
*/
diff --git a/makedumpfile.h b/makedumpfile.h
index a5955ff750e5..13559651feb6 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -863,12 +863,12 @@ int is_vmalloc_addr_x86_64(ulong vaddr);
int get_phys_base_x86_64(void);
int get_machdep_info_x86_64(void);
int get_versiondep_info_x86_64(void);
-unsigned long long vaddr_to_paddr_x86_64(unsigned long vaddr);
+unsigned long long vtop4_x86_64(unsigned long vaddr);
#define find_vmemmap() find_vmemmap_x86_64()
#define get_phys_base() get_phys_base_x86_64()
#define get_machdep_info() get_machdep_info_x86_64()
#define get_versiondep_info() get_versiondep_info_x86_64()
-#define vaddr_to_paddr(X) vaddr_to_paddr_x86_64(X)
+#define vaddr_to_paddr(X) vtop4_x86_64(X)
#define is_phys_addr(X) (!is_vmalloc_addr_x86_64(X))
#endif /* x86_64 */
--
2.7.4
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
b***@redhat.com
2016-10-25 23:28:04 UTC
Permalink
Post by Atsushi Kumagai
Post by Pratyush Anand
Trial 1: 2768.424565806 S
Trial 2: 2749.622115455 S
Trial 3: 2537.770359073 S
Could you increase the number of trials ?
OK, I can do that. Might take some time, as I will have to arrange that high
memory machine again.
Post by Atsushi Kumagai
If the average time is close to the results of Trial 1 (2768s) and 2 (2749s),
the regression rate is 8% and it sounds neither large nor small.
If the average is a level of 2500s like Trial 3, it's ideal.
Post by Pratyush Anand
---
arch/x86_64.c | 42 ++++++++----------------------------------
makedumpfile.h | 4 ++--
2 files changed, 10 insertions(+), 36 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index a96fd8ae00a1..fe2764a8bec2 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -203,6 +203,12 @@ vtop4_x86_64(unsigned long vaddr)
{
unsigned long page_dir, pml4, pgd_paddr, pgd_pte, pmd_paddr, pmd_pte;
unsigned long pte_paddr, pte;
+ unsigned long phys_base;
+
+ if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
+ phys_base = info->phys_base;
+ else
+ phys_base = 0;
if (SYMBOL(init_level4_pgt) == NOT_FOUND_SYMBOL) {
ERRMSG("Can't get the symbol of init_level4_pgt.\n");
@@ -212,9 +218,9 @@ vtop4_x86_64(unsigned long vaddr)
/*
* Get PGD.
*/
- page_dir = SYMBOL(init_level4_pgt);
+ page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base;
I want to confirm that this VA to PA translation is always safe,
otherwise we should do the condition check which was done in
vaddr_to_paddr_x86_64(), shouldn't we?
I think this should be safe, however x86 expert can comment better. Baoquan
any comment here?
Yes, I think this is safe. Below is the physical to virtual address
translation function in x86 64. And init_level4_pgt is a global variable
located in kernel text region.

arch/x86/include/asm/page_64.h

static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;

/* use the carry flag to determine if x was < __START_KERNEL_map */
x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));

return x;
}
Atsushi Kumagai
2016-10-26 06:24:43 UTC
Permalink
Post by b***@redhat.com
Post by Atsushi Kumagai
Post by Pratyush Anand
Trial 1: 2768.424565806 S
Trial 2: 2749.622115455 S
Trial 3: 2537.770359073 S
Could you increase the number of trials ?
OK, I can do that. Might take some time, as I will have to arrange that high
memory machine again.
Post by Atsushi Kumagai
If the average time is close to the results of Trial 1 (2768s) and 2 (2749s),
the regression rate is 8% and it sounds neither large nor small.
If the average is a level of 2500s like Trial 3, it's ideal.
Post by Pratyush Anand
---
arch/x86_64.c | 42 ++++++++----------------------------------
makedumpfile.h | 4 ++--
2 files changed, 10 insertions(+), 36 deletions(-)
diff --git a/arch/x86_64.c b/arch/x86_64.c
index a96fd8ae00a1..fe2764a8bec2 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -203,6 +203,12 @@ vtop4_x86_64(unsigned long vaddr)
{
unsigned long page_dir, pml4, pgd_paddr, pgd_pte, pmd_paddr, pmd_pte;
unsigned long pte_paddr, pte;
+ unsigned long phys_base;
+
+ if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
+ phys_base = info->phys_base;
+ else
+ phys_base = 0;
if (SYMBOL(init_level4_pgt) == NOT_FOUND_SYMBOL) {
ERRMSG("Can't get the symbol of init_level4_pgt.\n");
@@ -212,9 +218,9 @@ vtop4_x86_64(unsigned long vaddr)
/*
* Get PGD.
*/
- page_dir = SYMBOL(init_level4_pgt);
+ page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base;
I want to confirm that this VA to PA translation is always safe,
otherwise we should do the condition check which was done in
vaddr_to_paddr_x86_64(), shouldn't we?
I think this should be safe, however x86 expert can comment better. Baoquan
any comment here?
Yes, I think this is safe. Below is the physical to virtual address
translation function in x86 64. And init_level4_pgt is a global variable
located in kernel text region.
arch/x86/include/asm/page_64.h
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
/* use the carry flag to determine if x was < __START_KERNEL_map */
x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
return x;
}
Good, thanks for your comment.
I'll wait for the result of the benchmark.

Regards,
Atsushi Kumagai
Louis Bouchard
2016-10-25 09:17:00 UTC
Permalink
Hello,
Post by Pratyush Anand
Patch 1/4 fixes page_offset calculation, so that it is correctly calculated
on KASLR enabled kernel as well.
Patch 2/4 simplifies VA to PA translation. New code has been benchmarked
against old code on a 4T system.
Patch 3/4 and 4/4 is removal of (now) unnecessary code.
I think we should find a way to kill find_vmemmap() as well, so that
VMEMMAP_START can be removed. I have very limited idea about x86, so unable
to do that as of now.
x86_64: Calculate page_offset from pt_load
x86_64: translate all VA to PA using page table values
x86_64: kill is_vmalloc_addr_x86_64()
x86_64: kill some unused initialization
arch/x86_64.c | 84 ++++++++++++++++++++--------------------------------------
makedumpfile.h | 9 +++----
2 files changed, 32 insertions(+), 61 deletions(-)
Cross-posting but FYI, this patch fixes the issue reported in the following
thread : makedumpfile issues many readpage_elf: Attempt to read non-existent page[1]

It corresponds to the commit identified by my kernel bisection.

I will wait until it is accepted in Kumagai-san's tree to include it in
Debian/Ubuntu.

Kind regards,

...Louis

[1] https://www.mail-archive.com/***@lists.infradead.org/msg16434.html
--
Louis Bouchard
Software engineer, Cloud & Sustaining eng.
Canonical Ltd
Ubuntu developer Debian Maintainer
GPG : 429D 7A3B DD05 B6F8 AF63 B9C4 8B3D 867C 823E 7A61
Pratyush Anand
2016-10-25 09:20:38 UTC
Permalink
Hi Louis,

On Tue, Oct 25, 2016 at 2:47 PM, Louis Bouchard
Post by Louis Bouchard
Hello,
Post by Pratyush Anand
Patch 1/4 fixes page_offset calculation, so that it is correctly calculated
on KASLR enabled kernel as well.
Patch 2/4 simplifies VA to PA translation. New code has been benchmarked
against old code on a 4T system.
Patch 3/4 and 4/4 is removal of (now) unnecessary code.
I think we should find a way to kill find_vmemmap() as well, so that
VMEMMAP_START can be removed. I have very limited idea about x86, so unable
to do that as of now.
x86_64: Calculate page_offset from pt_load
x86_64: translate all VA to PA using page table values
x86_64: kill is_vmalloc_addr_x86_64()
x86_64: kill some unused initialization
arch/x86_64.c | 84 ++++++++++++++++++++--------------------------------------
makedumpfile.h | 9 +++----
2 files changed, 32 insertions(+), 61 deletions(-)
Cross-posting but FYI, this patch fixes the issue reported in the following
thread : makedumpfile issues many readpage_elf: Attempt to read non-existent page[1]
It corresponds to the commit identified by my kernel bisection.
I will wait until it is accepted in Kumagai-san's tree to include it in
Debian/Ubuntu.
Thanks a lot for your testing.

~Pratyush
Dave Young
2016-10-27 02:37:43 UTC
Permalink
Hi Pratyush,
Post by Pratyush Anand
Patch 1/4 fixes page_offset calculation, so that it is correctly calculated
on KASLR enabled kernel as well.
Patch 2/4 simplifies VA to PA translation. New code has been benchmarked
against old code on a 4T system.
Patch 3/4 and 4/4 is removal of (now) unnecessary code.
I think we should find a way to kill find_vmemmap() as well, so that
VMEMMAP_START can be removed. I have very limited idea about x86, so unable
to do that as of now.
x86_64: Calculate page_offset from pt_load
x86_64: translate all VA to PA using page table values
x86_64: kill is_vmalloc_addr_x86_64()
x86_64: kill some unused initialization
arch/x86_64.c | 84 ++++++++++++++++++++--------------------------------------
makedumpfile.h | 9 +++----
2 files changed, 32 insertions(+), 61 deletions(-)
According to our test, with these patches the dumped vmcore is correct
which means simple crash test `bt` works. But the saved vmcore size is
larger than before.

I collected two --message-level 31 logs with/without your patches, the
kernel kaslr was disabled during my test so that we can focus on the
vmcore size issue, it looks like mem_map address was not collected
correctly.

[please ignore the patched log extra debug message I added]

--- unpatched-makedumpfile.txt 2016-10-27 10:31:34.152962407 +0800
+++ patched-makedumpfile.txt 2016-10-27 10:28:44.213952179 +0800
@@ -1,4 +1,4 @@
-[127.0.0.1-2016-10-27-10:27:03]# /tmp/makedumpfile -l -d 31 --message-level 31 vmcore vmcore.m.1
+[127.0.0.1-2016-10-27-10:27:03]# /home/dyoung/git/makedumpfile/makedumpfile -l -d 31 --message-level 31 vmcore vmcore.m
sadump: does not have partition header
sadump: read dump device as unknown format
sadump: unknown format
@@ -36,70 +36,76 @@

Memory type : SPARSEMEM_EX

+printing memmap .....
mem_map (0)
- mem_map : ffffea0000000000
+ mem_map : 0
pfn_start : 0
pfn_end : 8000
+printing memmap .....
mem_map (1)
- mem_map : ffffea0000200000
+ mem_map : 0
pfn_start : 8000
pfn_end : 10000
+printing memmap .....
mem_map (2)
- mem_map : ffffea0000400000
+ mem_map : 0
pfn_start : 10000
pfn_end : 18000
+printing memmap .....
mem_map (3)
- mem_map : ffffea0000600000
+ mem_map : 0
pfn_start : 18000
pfn_end : 20000
+printing memmap .....
mem_map (4)
- mem_map : ffffea0000800000
+ mem_map : 0
pfn_start : 20000
pfn_end : 28000
+printing memmap .....
mem_map (5)
- mem_map : ffffea0000a00000
+ mem_map : 0
pfn_start : 28000
pfn_end : 30000
+printing memmap .....
mem_map (6)
- mem_map : ffffea0000c00000
+ mem_map : 0
pfn_start : 30000
pfn_end : 38000
+printing memmap .....
mem_map (7)
- mem_map : ffffea0000e00000
+ mem_map : 0
pfn_start : 38000
pfn_end : 3ffda
mmap() is available on the kernel.
Checking for memory holes : [100.0 %] |STEP [Checking for memory holes ] : 0.000030 seconds
-Excluding unnecessary pages : [100.0 %] \STEP [Excluding unnecessary pages] : 0.007547 seconds
-Checking for memory holes : [100.0 %] -STEP [Checking for memory holes ] : 0.000016 seconds
-Checking for memory holes : [100.0 %] /STEP [Checking for memory holes ] : 0.000009 seconds
-Excluding unnecessary pages : [100.0 %] |STEP [Excluding unnecessary pages] : 0.006623 seconds
-Copying data : [100.0 %] \STEP [Copying data ] : 0.184442 seconds
-Copying data : [100.0 %] -STEP [Copying data ] : 0.000041 seconds
+Excluding unnecessary pages : [100.0 %] \STEP [Excluding unnecessary pages] : 0.000007 seconds
+Checking for memory holes : [100.0 %] -STEP [Checking for memory holes ] : 0.000014 seconds
+Checking for memory holes : [100.0 %] /STEP [Checking for memory holes ] : 0.000008 seconds
+Excluding unnecessary pages : [100.0 %] |STEP [Excluding unnecessary pages] : 0.000007 seconds
+Copying data : [100.0 %] /STEP [Copying data ] : 1.421661 seconds
+Copying data : [100.0 %] |STEP [Copying data ] : 0.000036 seconds

Writing erase info...
-offset_eraseinfo: dd5c4e, size_eraseinfo: 0
+offset_eraseinfo: 888c149, size_eraseinfo: 0

Original pages : 0x0000000000030c7d
- Excluded pages : 0x000000000002cf58
- Pages filled with zero : 0x00000000000002be
- Non-private cache pages : 0x0000000000006574
- Private cache pages : 0x0000000000000f27
- User process data pages : 0x0000000000003481
- Free pages : 0x000000000002237e
+ Excluded pages : 0x000000000001d534
+ Pages filled with zero : 0x000000000001d534
+ Non-private cache pages : 0x0000000000000000
+ Private cache pages : 0x0000000000000000
+ User process data pages : 0x0000000000000000
+ Free pages : 0x0000000000000000
Hwpoison pages : 0x0000000000000000
- Remaining pages : 0x0000000000003d25
- (The number of pages is reduced to 7%.)
+ Remaining pages : 0x0000000000013749
+ (The number of pages is reduced to 39%.)
Memory Hole : 0x000000000000f35d
--------------------------------------------------
Total pages : 0x000000000003ffda

-Cache hit: 29946, miss: 47, hit rate: 99.8%
+Cache hit: 196285, miss: 201, hit rate: 99.9%


-The dumpfile is saved to vmcore.m.1.
+The dumpfile is saved to vmcore.m.

makedumpfile Completed.
-[***@dhcp-128-65 127.0.0.1-2016-10-27-10:27:03]# ls -lth vmcore.m*
--rw------- 1 root root 14M Oct 27 10:30 vmcore.m.1
--rw------- 1 root root 137M Oct 27 10:28 vmcore.m
+[***@dhcp-128-65 127.0.0.1-2016-10-27-10:27:03]#
Dave Young
2016-10-27 02:54:37 UTC
Permalink
I put the copied vmcore/vmlinux here:
https://people.redhat.com/~ruyang/test/

Adding Dave Anderson for any comments about the vmcore correctness from
crash point of view..
Post by Dave Young
Hi Pratyush,
Post by Pratyush Anand
Patch 1/4 fixes page_offset calculation, so that it is correctly calculated
on KASLR enabled kernel as well.
Patch 2/4 simplifies VA to PA translation. New code has been benchmarked
against old code on a 4T system.
Patch 3/4 and 4/4 is removal of (now) unnecessary code.
I think we should find a way to kill find_vmemmap() as well, so that
VMEMMAP_START can be removed. I have very limited idea about x86, so unable
to do that as of now.
x86_64: Calculate page_offset from pt_load
x86_64: translate all VA to PA using page table values
x86_64: kill is_vmalloc_addr_x86_64()
x86_64: kill some unused initialization
arch/x86_64.c | 84 ++++++++++++++++++++--------------------------------------
makedumpfile.h | 9 +++----
2 files changed, 32 insertions(+), 61 deletions(-)
According to our test, with these patches the dumped vmcore is correct
which means simple crash test `bt` works. But the saved vmcore size is
larger than before.
I collected two --message-level 31 logs with/without your patches, the
kernel kaslr was disabled during my test so that we can focus on the
vmcore size issue, it looks like mem_map address was not collected
correctly.
[please ignore the patched log extra debug message I added]
--- unpatched-makedumpfile.txt 2016-10-27 10:31:34.152962407 +0800
+++ patched-makedumpfile.txt 2016-10-27 10:28:44.213952179 +0800
@@ -1,4 +1,4 @@
-[127.0.0.1-2016-10-27-10:27:03]# /tmp/makedumpfile -l -d 31 --message-level 31 vmcore vmcore.m.1
+[127.0.0.1-2016-10-27-10:27:03]# /home/dyoung/git/makedumpfile/makedumpfile -l -d 31 --message-level 31 vmcore vmcore.m
sadump: does not have partition header
sadump: read dump device as unknown format
sadump: unknown format
@@ -36,70 +36,76 @@
Memory type : SPARSEMEM_EX
+printing memmap .....
mem_map (0)
- mem_map : ffffea0000000000
+ mem_map : 0
pfn_start : 0
pfn_end : 8000
+printing memmap .....
mem_map (1)
- mem_map : ffffea0000200000
+ mem_map : 0
pfn_start : 8000
pfn_end : 10000
+printing memmap .....
mem_map (2)
- mem_map : ffffea0000400000
+ mem_map : 0
pfn_start : 10000
pfn_end : 18000
+printing memmap .....
mem_map (3)
- mem_map : ffffea0000600000
+ mem_map : 0
pfn_start : 18000
pfn_end : 20000
+printing memmap .....
mem_map (4)
- mem_map : ffffea0000800000
+ mem_map : 0
pfn_start : 20000
pfn_end : 28000
+printing memmap .....
mem_map (5)
- mem_map : ffffea0000a00000
+ mem_map : 0
pfn_start : 28000
pfn_end : 30000
+printing memmap .....
mem_map (6)
- mem_map : ffffea0000c00000
+ mem_map : 0
pfn_start : 30000
pfn_end : 38000
+printing memmap .....
mem_map (7)
- mem_map : ffffea0000e00000
+ mem_map : 0
pfn_start : 38000
pfn_end : 3ffda
mmap() is available on the kernel.
Checking for memory holes : [100.0 %] |STEP [Checking for memory holes ] : 0.000030 seconds
-Excluding unnecessary pages : [100.0 %] \STEP [Excluding unnecessary pages] : 0.007547 seconds
-Checking for memory holes : [100.0 %] -STEP [Checking for memory holes ] : 0.000016 seconds
-Checking for memory holes : [100.0 %] /STEP [Checking for memory holes ] : 0.000009 seconds
-Excluding unnecessary pages : [100.0 %] |STEP [Excluding unnecessary pages] : 0.006623 seconds
-Copying data : [100.0 %] \STEP [Copying data ] : 0.184442 seconds
-Copying data : [100.0 %] -STEP [Copying data ] : 0.000041 seconds
+Excluding unnecessary pages : [100.0 %] \STEP [Excluding unnecessary pages] : 0.000007 seconds
+Checking for memory holes : [100.0 %] -STEP [Checking for memory holes ] : 0.000014 seconds
+Checking for memory holes : [100.0 %] /STEP [Checking for memory holes ] : 0.000008 seconds
+Excluding unnecessary pages : [100.0 %] |STEP [Excluding unnecessary pages] : 0.000007 seconds
+Copying data : [100.0 %] /STEP [Copying data ] : 1.421661 seconds
+Copying data : [100.0 %] |STEP [Copying data ] : 0.000036 seconds
Writing erase info...
-offset_eraseinfo: dd5c4e, size_eraseinfo: 0
+offset_eraseinfo: 888c149, size_eraseinfo: 0
Original pages : 0x0000000000030c7d
- Excluded pages : 0x000000000002cf58
- Pages filled with zero : 0x00000000000002be
- Non-private cache pages : 0x0000000000006574
- Private cache pages : 0x0000000000000f27
- User process data pages : 0x0000000000003481
- Free pages : 0x000000000002237e
+ Excluded pages : 0x000000000001d534
+ Pages filled with zero : 0x000000000001d534
+ Non-private cache pages : 0x0000000000000000
+ Private cache pages : 0x0000000000000000
+ User process data pages : 0x0000000000000000
+ Free pages : 0x0000000000000000
Hwpoison pages : 0x0000000000000000
- Remaining pages : 0x0000000000003d25
- (The number of pages is reduced to 7%.)
+ Remaining pages : 0x0000000000013749
+ (The number of pages is reduced to 39%.)
Memory Hole : 0x000000000000f35d
--------------------------------------------------
Total pages : 0x000000000003ffda
-Cache hit: 29946, miss: 47, hit rate: 99.8%
+Cache hit: 196285, miss: 201, hit rate: 99.9%
-The dumpfile is saved to vmcore.m.1.
+The dumpfile is saved to vmcore.m.
makedumpfile Completed.
--rw------- 1 root root 14M Oct 27 10:30 vmcore.m.1
--rw------- 1 root root 137M Oct 27 10:28 vmcore.m
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
Dave Young
2016-10-27 06:19:45 UTC
Permalink
Post by Dave Young
https://people.redhat.com/~ruyang/test/
Adding Dave Anderson for any comments about the vmcore correctness from
crash point of view..
Since the reason has been found, the page_offset was not correct, I
tested with a changed version of patch 1, it works. Please ignore the
test vmcore, I deleted them.

Here is a new tarball which could be used by Dave to verify the kaslr, I
enabled kaslr, also saved vmlinux/cped vmcore/makedumpfile saved vmcore
in a tarball below.
http://people.redhat.com/~ruyang/test/vmcore-test.tar.gz

Thanks
Dave
Dave Anderson
2016-10-27 15:25:20 UTC
Permalink
----- Original Message -----
Post by Dave Anderson
That being said, my recent 4.8 and 4.9 KASLR testing has been on live
systems and compressed kdumps, so the old tried-and-true manner of
calculating the phys_base from the ELF PT_LOAD segments apparently
no longer works with KASLR.
It would be so much more helpful if the VMCOREINFO data in the ELF
crash> help -D
...
SYMBOL(phys_base)=ffffffffa740b010
...
which is completely useless unless the phys_base value is known.
Anyway, can you send me the makedumpfile code that calculates the
phys_base value?
Dave
As it turns out, the problem with the crash utility is that it has to
calculate phys_base well before it even knows the kernel has been relocated
by KASLR. So when it sees the __START_KERNEL_map PT_LOAD segment, it mistakes
it for the kernel modules' virtual address region and skips it.

The kernel has this:

#if defined(CONFIG_RANDOMIZE_BASE)
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#endif

and then this:

#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)

So with KASLR, MODULES_VADDR gets pushed up from the traditional ffffffffa0000000
up to ffffffffc0000000.

So I'm curious as to what you use in makedumpfile to determine whether
CONFIG_RANDOMIZE_BASE has been configured?

Thanks,
Dave
Dave Anderson
2016-10-27 15:41:51 UTC
Permalink
----- Original Message -----
Post by Dave Anderson
----- Original Message -----
Post by Dave Anderson
That being said, my recent 4.8 and 4.9 KASLR testing has been on live
systems and compressed kdumps, so the old tried-and-true manner of
calculating the phys_base from the ELF PT_LOAD segments apparently
no longer works with KASLR.
It would be so much more helpful if the VMCOREINFO data in the ELF
crash> help -D
...
SYMBOL(phys_base)=ffffffffa740b010
...
which is completely useless unless the phys_base value is known.
Anyway, can you send me the makedumpfile code that calculates the
phys_base value?
Dave
As it turns out, the problem with the crash utility is that it has to
calculate phys_base well before it even knows the kernel has been relocated
by KASLR. So when it sees the __START_KERNEL_map PT_LOAD segment, it mistakes
it for the kernel modules' virtual address region and skips it.
#if defined(CONFIG_RANDOMIZE_BASE)
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#endif
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
So with KASLR, MODULES_VADDR gets pushed up from the traditional ffffffffa0000000
up to ffffffffc0000000.
So I'm curious as to what you use in makedumpfile to determine whether
CONFIG_RANDOMIZE_BASE has been configured?
Thanks,
Dave
Hey, sorry, I didn't notice that this was added upstream:

commit 1303a27c9c32020a3b6ac89be270d2ab1f28be24
Author: Baoquan He <***@redhat.com>
Date: Wed Sep 9 15:39:03 2015 -0700

kexec: export KERNEL_IMAGE_SIZE to vmcoreinfo

With that in place, it will be an easy fix for the crash utility.

Thanks,
Dave
Dave Young
2016-10-28 02:04:19 UTC
Permalink
Post by Dave Anderson
----- Original Message -----
Post by Dave Anderson
----- Original Message -----
Post by Dave Anderson
That being said, my recent 4.8 and 4.9 KASLR testing has been on live
systems and compressed kdumps, so the old tried-and-true manner of
calculating the phys_base from the ELF PT_LOAD segments apparently
no longer works with KASLR.
It would be so much more helpful if the VMCOREINFO data in the ELF
crash> help -D
...
SYMBOL(phys_base)=ffffffffa740b010
...
which is completely useless unless the phys_base value is known.
Anyway, can you send me the makedumpfile code that calculates the
phys_base value?
Dave
As it turns out, the problem with the crash utility is that it has to
calculate phys_base well before it even knows the kernel has been relocated
by KASLR. So when it sees the __START_KERNEL_map PT_LOAD segment, it mistakes
it for the kernel modules' virtual address region and skips it.
#if defined(CONFIG_RANDOMIZE_BASE)
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#endif
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
So with KASLR, MODULES_VADDR gets pushed up from the traditional ffffffffa0000000
up to ffffffffc0000000.
So I'm curious as to what you use in makedumpfile to determine whether
CONFIG_RANDOMIZE_BASE has been configured?
Thanks,
Dave
commit 1303a27c9c32020a3b6ac89be270d2ab1f28be24
Date: Wed Sep 9 15:39:03 2015 -0700
kexec: export KERNEL_IMAGE_SIZE to vmcoreinfo
With that in place, it will be an easy fix for the crash utility.
Dave, I confirmed your fix in crash git tree works, it also works for
the elf format vmcore, /proc/vmcore and makedumpfile -E created vmcore.

Thanks for the quick fix..
Post by Dave Anderson
Thanks,
Dave
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
Pratyush Anand
2016-10-27 15:59:27 UTC
Permalink
Post by Dave Anderson
As it turns out, the problem with the crash utility is that it has to
calculate phys_base well before it even knows the kernel has been relocated
by KASLR. So when it sees the __START_KERNEL_map PT_LOAD segment, it mistakes
it for the kernel modules' virtual address region and skips it.
#if defined(CONFIG_RANDOMIZE_BASE)
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#endif
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
So with KASLR, MODULES_VADDR gets pushed up from the traditional ffffffffa0000000
up to ffffffffc0000000.
So I'm curious as to what you use in makedumpfile to determine whether
CONFIG_RANDOMIZE_BASE has been configured?
So far we are trying to avoid to know that in makedumpfile. makedumpfile
needed to know MODULES_VADDR, VMALLOC_START etc only to know that
whether a VA to PA translation can be done using direct mapping or to be
done by reading corresponding page table entry.

Now, we do VA to PA for all VAs using page table entry only.

https://github.com/pratyushanand/makedumpfile/blob/x86_devel/arch/x86_64.c#L186

~Pratyush
Baoquan He
2016-10-27 03:25:48 UTC
Permalink
Post by Dave Young
Hi Pratyush,
Post by Pratyush Anand
Patch 1/4 fixes page_offset calculation, so that it is correctly calculated
on KASLR enabled kernel as well.
Patch 2/4 simplifies VA to PA translation. New code has been benchmarked
against old code on a 4T system.
Patch 3/4 and 4/4 is removal of (now) unnecessary code.
I think we should find a way to kill find_vmemmap() as well, so that
VMEMMAP_START can be removed. I have very limited idea about x86, so unable
to do that as of now.
x86_64: Calculate page_offset from pt_load
x86_64: translate all VA to PA using page table values
x86_64: kill is_vmalloc_addr_x86_64()
x86_64: kill some unused initialization
arch/x86_64.c | 84 ++++++++++++++++++++--------------------------------------
makedumpfile.h | 9 +++----
2 files changed, 32 insertions(+), 61 deletions(-)
According to our test, with these patches the dumped vmcore is correct
which means simple crash test `bt` works. But the saved vmcore size is
larger than before.
I collected two --message-level 31 logs with/without your patches, the
kernel kaslr was disabled during my test so that we can focus on the
vmcore size issue, it looks like mem_map address was not collected
correctly.
From printing log, this is not correct. Without memmap, filtering can't
be done correctly.
Post by Dave Young
[please ignore the patched log extra debug message I added]
--- unpatched-makedumpfile.txt 2016-10-27 10:31:34.152962407 +0800
+++ patched-makedumpfile.txt 2016-10-27 10:28:44.213952179 +0800
@@ -1,4 +1,4 @@
-[127.0.0.1-2016-10-27-10:27:03]# /tmp/makedumpfile -l -d 31 --message-level 31 vmcore vmcore.m.1
+[127.0.0.1-2016-10-27-10:27:03]# /home/dyoung/git/makedumpfile/makedumpfile -l -d 31 --message-level 31 vmcore vmcore.m
sadump: does not have partition header
sadump: read dump device as unknown format
sadump: unknown format
@@ -36,70 +36,76 @@
Memory type : SPARSEMEM_EX
+printing memmap .....
mem_map (0)
- mem_map : ffffea0000000000
+ mem_map : 0
pfn_start : 0
pfn_end : 8000
+printing memmap .....
mem_map (1)
- mem_map : ffffea0000200000
+ mem_map : 0
pfn_start : 8000
pfn_end : 10000
+printing memmap .....
mem_map (2)
- mem_map : ffffea0000400000
+ mem_map : 0
pfn_start : 10000
pfn_end : 18000
+printing memmap .....
mem_map (3)
- mem_map : ffffea0000600000
+ mem_map : 0
pfn_start : 18000
pfn_end : 20000
+printing memmap .....
mem_map (4)
- mem_map : ffffea0000800000
+ mem_map : 0
pfn_start : 20000
pfn_end : 28000
+printing memmap .....
mem_map (5)
- mem_map : ffffea0000a00000
+ mem_map : 0
pfn_start : 28000
pfn_end : 30000
+printing memmap .....
mem_map (6)
- mem_map : ffffea0000c00000
+ mem_map : 0
pfn_start : 30000
pfn_end : 38000
+printing memmap .....
mem_map (7)
- mem_map : ffffea0000e00000
+ mem_map : 0
pfn_start : 38000
pfn_end : 3ffda
mmap() is available on the kernel.
Checking for memory holes : [100.0 %] |STEP [Checking for memory holes ] : 0.000030 seconds
-Excluding unnecessary pages : [100.0 %] \STEP [Excluding unnecessary pages] : 0.007547 seconds
-Checking for memory holes : [100.0 %] -STEP [Checking for memory holes ] : 0.000016 seconds
-Checking for memory holes : [100.0 %] /STEP [Checking for memory holes ] : 0.000009 seconds
-Excluding unnecessary pages : [100.0 %] |STEP [Excluding unnecessary pages] : 0.006623 seconds
-Copying data : [100.0 %] \STEP [Copying data ] : 0.184442 seconds
-Copying data : [100.0 %] -STEP [Copying data ] : 0.000041 seconds
+Excluding unnecessary pages : [100.0 %] \STEP [Excluding unnecessary pages] : 0.000007 seconds
+Checking for memory holes : [100.0 %] -STEP [Checking for memory holes ] : 0.000014 seconds
+Checking for memory holes : [100.0 %] /STEP [Checking for memory holes ] : 0.000008 seconds
+Excluding unnecessary pages : [100.0 %] |STEP [Excluding unnecessary pages] : 0.000007 seconds
+Copying data : [100.0 %] /STEP [Copying data ] : 1.421661 seconds
+Copying data : [100.0 %] |STEP [Copying data ] : 0.000036 seconds
Writing erase info...
-offset_eraseinfo: dd5c4e, size_eraseinfo: 0
+offset_eraseinfo: 888c149, size_eraseinfo: 0
Original pages : 0x0000000000030c7d
- Excluded pages : 0x000000000002cf58
- Pages filled with zero : 0x00000000000002be
- Non-private cache pages : 0x0000000000006574
- Private cache pages : 0x0000000000000f27
- User process data pages : 0x0000000000003481
- Free pages : 0x000000000002237e
+ Excluded pages : 0x000000000001d534
+ Pages filled with zero : 0x000000000001d534
+ Non-private cache pages : 0x0000000000000000
+ Private cache pages : 0x0000000000000000
+ User process data pages : 0x0000000000000000
+ Free pages : 0x0000000000000000
Hwpoison pages : 0x0000000000000000
- Remaining pages : 0x0000000000003d25
- (The number of pages is reduced to 7%.)
+ Remaining pages : 0x0000000000013749
+ (The number of pages is reduced to 39%.)
Memory Hole : 0x000000000000f35d
--------------------------------------------------
Total pages : 0x000000000003ffda
-Cache hit: 29946, miss: 47, hit rate: 99.8%
+Cache hit: 196285, miss: 201, hit rate: 99.9%
-The dumpfile is saved to vmcore.m.1.
+The dumpfile is saved to vmcore.m.
makedumpfile Completed.
--rw------- 1 root root 14M Oct 27 10:30 vmcore.m.1
--rw------- 1 root root 137M Oct 27 10:28 vmcore.m
_______________________________________________
kexec mailing list
http://lists.infradead.org/mailman/listinfo/kexec
Pratyush Anand
2016-10-27 05:11:23 UTC
Permalink
Post by Dave Young
According to our test, with these patches the dumped vmcore is correct
which means simple crash test `bt` works. But the saved vmcore size is
larger than before.
I collected two --message-level 31 logs with/without your patches, the
kernel kaslr was disabled during my test so that we can focus on the
vmcore size issue, it looks like mem_map address was not collected
correctly.
[please ignore the patched log extra debug message I added]
Hummm..thanks for pointing it out.

Seems patch 1/4 is wrong.

with this patch page_offset = ffffffff80000000
while with original code page_offset = ffff880000000000

Trying to see where it is wrong

~Pratyush
Dave Anderson
2016-10-27 13:25:18 UTC
Permalink
----- Original Message -----
Post by Dave Young
https://people.redhat.com/~ruyang/test/
Adding Dave Anderson for any comments about the vmcore correctness from
crash point of view..
As it turns out, the vmcore.makedumpfile can be read just fine:

$ crash vmlinux.kaslr vmcore.makedumpfile

crash 7.1.7rc9
Copyright (C) 2002-2016 Red Hat, Inc.
Copyright (C) 2004, 2005, 2006, 2010 IBM Corporation
Copyright (C) 1999-2006 Hewlett-Packard Co
Copyright (C) 2005, 2006, 2011, 2012 Fujitsu Limited
Copyright (C) 2006, 2007 VA Linux Systems Japan K.K.
Copyright (C) 2005, 2011 NEC Corporation
Copyright (C) 1999, 2002, 2007 Silicon Graphics, Inc.
Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc.
This program is free software, covered by the GNU General Public License,
and you are welcome to change it and/or distribute copies of it under
certain conditions. Enter "help copying" to see the conditions.
This program has absolutely no warranty. Enter "help warranty" for details.

GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-linux-gnu"...

WARNING: kernel relocated [602MB]: patching 55325 gdb minimal_symbol values

KERNEL: vmlinux.kaslr
DUMPFILE: vmcore.makedumpfile [PARTIAL DUMP]
CPUS: 1
DATE: Thu Oct 27 02:07:46 2016
UPTIME: 00:00:41
LOAD AVERAGE: 0.14, 0.04, 0.01
TASKS: 107
NODENAME: localhost.localdomain
RELEASE: 4.9.0-rc2+
VERSION: #185 SMP Wed Oct 26 11:23:54 CST 2016
MACHINE: x86_64 (2294 Mhz)
MEMORY: 1 GB
PANIC: "sysrq: SysRq : Trigger a crash"
PID: 1934
COMMAND: "bash"
TASK: ffff9ca3bc155e00 [THREAD_INFO: ffff9ca3bc155e00]
CPU: 0
STATE: TASK_RUNNING (SYSRQ)

crash> bt
PID: 1934 TASK: ffff9ca3bc155e00 CPU: 0 COMMAND: "bash"
#0 [ffffb2df4033fa08] machine_kexec at ffffffffa6a34e09
#1 [ffffb2df4033fa68] __crash_kexec at ffffffffa6ab945a
#2 [ffffb2df4033fb30] __crash_kexec at ffffffffa6ab9530
#3 [ffffb2df4033fb48] crash_kexec at ffffffffa6ab9576
#4 [ffffb2df4033fb68] oops_end at ffffffffa6a1529f
#5 [ffffb2df4033fb90] no_context at ffffffffa6a3f67b
#6 [ffffb2df4033fbf8] __bad_area_nosemaphore at ffffffffa6a3f8cc
#7 [ffffb2df4033fc48] bad_area at ffffffffa6a3fa71
#8 [ffffb2df4033fc70] __do_page_fault at ffffffffa6a4004d
#9 [ffffb2df4033fcd8] do_page_fault at ffffffffa6a40100
#10 [ffffb2df4033fd08] do_async_page_fault at ffffffffa6a3bbc5
#11 [ffffb2df4033fd20] async_page_fault at ffffffffa6f9f1e5
[exception RIP: sysrq_handle_crash+17]
RIP: ffffffffa6d3a101 RSP: ffffb2df4033fdd8 RFLAGS: 00010282
RAX: 000000000000000f RBX: 0000000000000063 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff9ca3be60cc28 RDI: 0000000000000063
RBP: ffffb2df4033fdd8 R8: 0000000000000001 R9: 0000000000000006
R10: 0000000000000001 R11: 0000000000000172 R12: 0000000000000007
R13: 0000000000000000 R14: ffffffffa745b4a0 R15: 0000000000000000
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#12 [ffffb2df4033fde0] __handle_sysrq at ffffffffa6d3a821
#13 [ffffb2df4033fe10] write_sysrq_trigger at ffffffffa6d3ac4a
#14 [ffffb2df4033fe28] proc_reg_write at ffffffffa6bb73dd
#15 [ffffb2df4033fe48] __vfs_write at ffffffffa6b55b02
#16 [ffffb2df4033fed0] vfs_write at ffffffffa6b56dbc
#17 [ffffb2df4033ff08] sys_write at ffffffffa6b58060
#18 [ffffb2df4033ff50] entry_SYSCALL_64_fastpath at ffffffffa6f9dbbb
RIP: 00007f0b401a9c20 RSP: 00007ffe38d75698 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0b401a9c20
RDX: 0000000000000002 RSI: 00005577bf7de790 RDI: 0000000000000001
RBP: 0000000000000001 R8: 00007f0b40474740 R9: 00007f0b40ab7b40
R10: 0000000000000073 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ffe38d75c10 R14: 0000000000000000 R15: 0000000000000000
ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b
crash>

But the ELF vmcore fails:

$ crash vmlinux.kaslr vmcore

crash 7.1.7rc9
Copyright (C) 2002-2016 Red Hat, Inc.
Copyright (C) 2004, 2005, 2006, 2010 IBM Corporation
Copyright (C) 1999-2006 Hewlett-Packard Co
Copyright (C) 2005, 2006, 2011, 2012 Fujitsu Limited
Copyright (C) 2006, 2007 VA Linux Systems Japan K.K.
Copyright (C) 2005, 2011 NEC Corporation
Copyright (C) 1999, 2002, 2007 Silicon Graphics, Inc.
Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc.
This program is free software, covered by the GNU General Public License,
and you are welcome to change it and/or distribute copies of it under
certain conditions. Enter "help copying" to see the conditions.
This program has absolutely no warranty. Enter "help warranty" for details.

GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-linux-gnu"...

WARNING: kernel relocated [602MB]: patching 55325 gdb minimal_symbol values

WARNING: could not find MAGIC_START!
WARNING: cannot read linux_banner string
crash: vmlinux.kaslr and vmcore do not match!

Usage:

crash [OPTION]... NAMELIST MEMORY-IMAGE[@ADDRESS] (dumpfile form)
crash [OPTION]... [NAMELIST] (live system form)

Enter "crash -h" for details.
$

Since the data being read was bogus, the first thing I checked was whether
the phys_base value was being calculated correctly. I manually applied
the phys_base that was stored in the compressed dump header, and the ELF
vmcore can be read:

$ crash --machdep phys_base=ffffffffdf400000 vmlinux.kaslr vmcore

crash 7.1.7rc9
Copyright (C) 2002-2016 Red Hat, Inc.
Copyright (C) 2004, 2005, 2006, 2010 IBM Corporation
Copyright (C) 1999-2006 Hewlett-Packard Co
Copyright (C) 2005, 2006, 2011, 2012 Fujitsu Limited
Copyright (C) 2006, 2007 VA Linux Systems Japan K.K.
Copyright (C) 2005, 2011 NEC Corporation
Copyright (C) 1999, 2002, 2007 Silicon Graphics, Inc.
Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc.
This program is free software, covered by the GNU General Public License,
and you are welcome to change it and/or distribute copies of it under
certain conditions. Enter "help copying" to see the conditions.
This program has absolutely no warranty. Enter "help warranty" for details.

NOTE: setting phys_base to: 0xffffffffdf400000

GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-linux-gnu"...

WARNING: kernel relocated [602MB]: patching 55325 gdb minimal_symbol values

KERNEL: vmlinux.kaslr
DUMPFILE: vmcore
CPUS: 1
DATE: Thu Oct 27 02:07:46 2016
UPTIME: 00:00:41
LOAD AVERAGE: 0.14, 0.04, 0.01
TASKS: 107
NODENAME: localhost.localdomain
RELEASE: 4.9.0-rc2+
VERSION: #185 SMP Wed Oct 26 11:23:54 CST 2016
MACHINE: x86_64 (2294 Mhz)
MEMORY: 1 GB
PANIC: "sysrq: SysRq : Trigger a crash"
PID: 1934
COMMAND: "bash"
TASK: ffff9ca3bc155e00 [THREAD_INFO: ffff9ca3bc155e00]
CPU: 0
STATE: TASK_RUNNING (SYSRQ)

crash>

That being said, my recent 4.8 and 4.9 KASLR testing has been on live
systems and compressed kdumps, so the old tried-and-true manner of
calculating the phys_base from the ELF PT_LOAD segments apparently
no longer works with KASLR.

It would be so much more helpful if the VMCOREINFO data in the ELF
header stored the actual phys_base value instead of its symbol value:

crash> help -D
...
SYMBOL(phys_base)=ffffffffa740b010
...

which is completely useless unless the phys_base value is known.

Anyway, can you send me the makedumpfile code that calculates the
phys_base value?

Dave
Loading...