Message ID | 20180628062857.29658-5-bhe@redhat.com (mailing list archive)
State      | New, archived
On Thu, Jun 28, 2018 at 02:28:56PM +0800, Baoquan He wrote:
> In sparse_init(), two temporary pointer arrays, usemap_map and map_map,
> are allocated with the size of NR_MEM_SECTIONS. They are used to store
> each memory section's usemap and mem map if marked as present. With
> the help of these two arrays, a continuous memory chunk is allocated
> for the usemaps and memmaps of the memory sections on one node. This
> avoids too much memory fragmentation. In the diagram below, '1'
> indicates a present memory section, '0' an absent one. The number 'n'
> can be much smaller than NR_MEM_SECTIONS on most systems.
>
> |1|1|1|1|0|0|0|0|1|1|0|0|...|1|0||1|0|...|1||0|1|...|0|
> -------------------------------------------------------
>  0 1 2 3 4 5          i   i+1         n-1   n
>
> If we fail to populate the page tables to map one section's memmap,
> its ->section_mem_map will be cleared at the end to indicate that the
> section is not present. After use, these two arrays are released at
> the end of sparse_init().
>
> In 4-level paging mode, each array costs 4M, which is negligible. In
> 5-level paging mode, however, they cost 256M each, 512M altogether. A
> kdump kernel usually reserves very little memory, e.g. 256M. So, even
> though they are only temporarily allocated, the cost is still not
> acceptable.
>
> In fact, there's no need to allocate them with the size of
> NR_MEM_SECTIONS. Since the clearing of ->section_mem_map has been
> deferred to the end, the number of present memory sections stays the
> same during sparse_init() until we finally clear out a memory
> section's ->section_mem_map because its usemap or memmap was not
> correctly handled. Thus, whenever the for_each_present_section_nr()
> loop is taken in the middle, the i-th present memory section is always
> the same one.
>
> Here, only allocate usemap_map and map_map with the size of
> 'nr_present_sections'. For the i-th present memory section, install
> its usemap and memmap into usemap_map[i] and map_map[i] during
> allocation. Then, in the last for_each_present_section_nr() loop,
> which clears a failed memory section's ->section_mem_map, fetch the
> usemap and memmap from the usemap_map[] and map_map[] arrays and set
> them into mem_section[] accordingly.
>
> Signed-off-by: Baoquan He <bhe@redhat.com>
> Reviewed-by: Pavel Tatashin <pasha.tatashin@oracle.com>
> ---
>  mm/sparse-vmemmap.c |  5 +++--
>  mm/sparse.c         | 43 ++++++++++++++++++++++++++++++++++---------
>  2 files changed, 37 insertions(+), 11 deletions(-)
>
> [...]
>
> +	/* The number of present sections stored in nr_present_sections
> +	 * is kept the same since mem sections are marked as present in
> +	 * memory_present(). In this for loop, we need to check which
> +	 * sections failed to allocate a memmap or usemap, then clear
> +	 * their ->section_mem_map accordingly. During this process, we
> +	 * need to increase 'nr_consumed_maps' whether the allocation of
> +	 * memmap or usemap failed or not, so that after we handle the
> +	 * i-th memory section, we can get the memmap and usemap of the
> +	 * (i+1)-th section correctly. */
> 	for_each_present_section_nr(0, pnum) {
> 		struct mem_section *ms;
> +
> +		if (nr_consumed_maps >= nr_present_sections) {
> +			pr_err("nr_consumed_maps goes beyond nr_present_sections\n");
> +			break;
> +		}

Hi Baoquan,

I am sure I am missing something here, but is this check really needed?

I mean, for_each_present_section_nr() only returns the section nr if the
section has been marked as SECTION_MARKED_PRESENT.
That happens in memory_present(), where now we also increment
nr_present_sections whenever we find a present section.

So, for_each_present_section_nr() should return the same number of
sections as nr_present_sections.
Since we only increment nr_consumed_maps once in the loop, I am not so
sure we can go beyond nr_present_sections.

Did I overlook something?

Other than that, this looks good to me.

Reviewed-by: Oscar Salvador <osalvador@suse.de>
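To see the indexing scheme the patch relies on in isolation, here is a
small user-space sketch (a hypothetical illustration, not kernel code):
because no section changes presence during sparse_init(), the i-th
present section always lands in slot i of the now smaller arrays.

/*
 * Hypothetical stand-alone illustration of the dense indexing:
 * present sections get consecutive slots, so usemap_map/map_map
 * only need nr_present_sections entries, not NR_MEM_SECTIONS.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_MEM_SECTIONS 16

static const bool present[NR_MEM_SECTIONS] = {
	[0] = true, [1] = true, [2] = true, [3] = true,
	[8] = true, [9] = true, [15] = true,
};

int main(void)
{
	int nr_consumed_maps = 0;
	int pnum;

	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
		if (!present[pnum])
			continue;
		/* section pnum's usemap/memmap live at this dense index */
		printf("section %2d -> map_map[%d]\n", pnum, nr_consumed_maps);
		nr_consumed_maps++;
	}
	return 0;
}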
> > +		if (nr_consumed_maps >= nr_present_sections) {
> > +			pr_err("nr_consumed_maps goes beyond nr_present_sections\n");
> > +			break;
> > +		}
>
> Hi Baoquan,
>
> I am sure I am missing something here, but is this check really needed?
>
> I mean, for_each_present_section_nr() only returns the section nr if the
> section has been marked as SECTION_MARKED_PRESENT.
> That happens in memory_present(), where now we also increment
> nr_present_sections whenever we find a present section.
>
> So, for_each_present_section_nr() should return the same number of
> sections as nr_present_sections.
> Since we only increment nr_consumed_maps once in the loop, I am not so
> sure we can go beyond nr_present_sections.
>
> Did I overlook something?

You did not, this is basically a safety check. A BUG_ON() would be
better here, as this is something that should really not happen and
would mean a bug in the current project.
On Thu, Jun 28, 2018 at 08:12:04AM -0400, Pavel Tatashin wrote:
> > > +		if (nr_consumed_maps >= nr_present_sections) {
> > > +			pr_err("nr_consumed_maps goes beyond nr_present_sections\n");
> > > +			break;
> > > +		}
> >
> > I am sure I am missing something here, but is this check really needed?
> >
> > [...]
> >
> > Did I overlook something?
>
> You did not, this is basically a safety check. A BUG_ON() would be
> better here, as this is something that should really not happen and
> would mean a bug in the current project.

I think we would be better off having a BUG_ON() there.
Otherwise the system can go sideways later on.
On 06/28/2018 05:12 AM, Pavel Tatashin wrote:
> You did not, this is basically a safety check. A BUG_ON() would be
> better here, as this is something that should really not happen and
> would mean a bug in the current project.

Is this at a point in boot where a BUG_ON() generally produces useful
output, or will it just produce an early-boot silent hang with no
console output?
> Is this at a point in boot where a BUG_ON() generally produces useful
> output, or will it just produce an early-boot silent hang with no
> console output?

Probably depends on the platform, but in KVM, I see a nice panic message
(inserted BUG_ON(1) into sparse_init()):

[    0.000000] kernel BUG at mm/sparse.c:490!
PANIC: early exception 0x06 IP 10:ffffffffb6bd43d9 error 0 cr2 0xffff898747575000
[    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.18.0-rc2_pt_sparse #6
[    0.000000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014
[    0.000000] RIP: 0010:sparse_init+0x0/0x2
[    0.000000] Code: fe 3b 05 ba d0 16 00 7e 06 89 05 b2 d0 16 00 49 83 08 01 48 81 c3 00 80 00 00 e9 73 ff ff ff 48 83 c4 10 5b 5d 41 5c 41 5d c3 <0f> 0b 48 8b 05 ae 46 8f ff 48 c1 e2 15 48 01 d0 c3 41 56 48 8b 05
[    0.000000] RSP: 0000:ffffffffb6603e98 EFLAGS: 00010086 ORIG_RAX: 0000000000000000
[    0.000000] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffffb6603e80
[    0.000000] RDX: ffffffffb6603e78 RSI: 0000000000000040 RDI: ffffffffb6603e70
[    0.000000] RBP: 0000000007f7ec00 R08: ffffffffb6603e74 R09: 0000000000007fe0
[    0.000000] R10: 0000000000000100 R11: 0000000007fd6000 R12: 0000000000000000
[    0.000000] R13: ffffffffb6603f18 R14: 0000000000000000 R15: 0000000000000000
[    0.000000] FS:  0000000000000000(0000) GS:ffffffffb6b82000(0000) knlGS:0000000000000000
[    0.000000] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    0.000000] CR2: ffff898747575000 CR3: 0000000006e0a000 CR4: 00000000000606b0
[    0.000000] Call Trace:
[    0.000000]  ? paging_init+0xf/0x2c
[    0.000000]  ? setup_arch+0xae8/0xc17
[    0.000000]  ? printk+0x53/0x6a
[    0.000000]  ? start_kernel+0x62/0x4b3
[    0.000000]  ? load_ucode_bsp+0x3d/0x129
[    0.000000]  ? secondary_startup_64+0xa5/0xb0
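Spelled out, the BUG_ON() variant being suggested would look roughly
like the snippet below. This is a sketch of the suggestion made in this
thread, not a patch that was actually posted:

	for_each_present_section_nr(0, pnum) {
		struct mem_section *ms;

		/*
		 * for_each_present_section_nr() visits exactly the
		 * sections counted in nr_present_sections; going past
		 * that would be a bug in sparse_init() itself, so fail
		 * loudly rather than limp along with misassigned
		 * usemaps and memmaps.
		 */
		BUG_ON(nr_consumed_maps >= nr_present_sections);
		/* ... rest of the loop body unchanged ... */
	}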
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 68bb65b2d34d..e1a54ba411ec 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -281,6 +281,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
 	unsigned long pnum;
 	unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
 	void *vmemmap_buf_start;
+	int nr_consumed_maps = 0;
 
 	size = ALIGN(size, PMD_SIZE);
 	vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
@@ -295,8 +296,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
 		if (!present_section_nr(pnum))
 			continue;
 
-		map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
-		if (map_map[pnum])
+		map_map[nr_consumed_maps] = sparse_mem_map_populate(pnum, nodeid, NULL);
+		if (map_map[nr_consumed_maps++])
 			continue;
 		pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
 		       __func__);
diff --git a/mm/sparse.c b/mm/sparse.c
index 4458a23e5293..e1767d9fe4f3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -386,6 +386,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data,
 	unsigned long pnum;
 	unsigned long **usemap_map = (unsigned long **)data;
 	int size = usemap_size();
+	int nr_consumed_maps = 0;
 
 	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
 							  size * usemap_count);
@@ -397,9 +398,10 @@ static void __init sparse_early_usemaps_alloc_node(void *data,
 	for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
 		if (!present_section_nr(pnum))
 			continue;
-		usemap_map[pnum] = usemap;
+		usemap_map[nr_consumed_maps] = usemap;
 		usemap += size;
-		check_usemap_section_nr(nodeid, usemap_map[pnum]);
+		check_usemap_section_nr(nodeid, usemap_map[nr_consumed_maps]);
+		nr_consumed_maps++;
 	}
 }
 
@@ -424,27 +426,31 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
 	void *map;
 	unsigned long pnum;
 	unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+	int nr_consumed_maps;
 
 	size = PAGE_ALIGN(size);
 	map = memblock_virt_alloc_try_nid_raw(size * map_count,
 					      PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
 					      BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
 	if (map) {
+		nr_consumed_maps = 0;
 		for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
 			if (!present_section_nr(pnum))
 				continue;
-			map_map[pnum] = map;
+			map_map[nr_consumed_maps] = map;
 			map += size;
+			nr_consumed_maps++;
 		}
 		return;
 	}
 
 	/* fallback */
+	nr_consumed_maps = 0;
 	for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
 		if (!present_section_nr(pnum))
 			continue;
-		map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
-		if (map_map[pnum])
+		map_map[nr_consumed_maps] = sparse_mem_map_populate(pnum, nodeid, NULL);
+		if (map_map[nr_consumed_maps++])
 			continue;
 		pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
 		       __func__);
@@ -523,6 +529,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
 			/* new start, update count etc*/
 			nodeid_begin = nodeid;
 			pnum_begin = pnum;
+			data += map_count * data_unit_size;
 			map_count = 1;
 		}
 		/* ok, last chunk */
@@ -541,6 +548,7 @@ void __init sparse_init(void)
 	unsigned long *usemap;
 	unsigned long **usemap_map;
 	int size;
+	int nr_consumed_maps = 0;
 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
 	int size2;
 	struct page **map_map;
@@ -563,7 +571,7 @@ void __init sparse_init(void)
 	 * powerpc need to call sparse_init_one_section right after each
 	 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
 	 */
-	size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
+	size = sizeof(unsigned long *) * nr_present_sections;
 	usemap_map = memblock_virt_alloc(size, 0);
 	if (!usemap_map)
 		panic("can not allocate usemap_map\n");
@@ -572,7 +580,7 @@ void __init sparse_init(void)
 							sizeof(usemap_map[0]));
 
 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-	size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
+	size2 = sizeof(struct page *) * nr_present_sections;
 	map_map = memblock_virt_alloc(size2, 0);
 	if (!map_map)
 		panic("can not allocate map_map\n");
@@ -581,27 +589,44 @@ void __init sparse_init(void)
 						sizeof(map_map[0]));
 #endif
 
+	/* The number of present sections stored in nr_present_sections
+	 * is kept the same since mem sections are marked as present in
+	 * memory_present(). In this for loop, we need to check which
+	 * sections failed to allocate a memmap or usemap, then clear
+	 * their ->section_mem_map accordingly. During this process, we
+	 * need to increase 'nr_consumed_maps' whether the allocation of
+	 * memmap or usemap failed or not, so that after we handle the
+	 * i-th memory section, we can get the memmap and usemap of the
+	 * (i+1)-th section correctly. */
 	for_each_present_section_nr(0, pnum) {
 		struct mem_section *ms;
+
+		if (nr_consumed_maps >= nr_present_sections) {
+			pr_err("nr_consumed_maps goes beyond nr_present_sections\n");
+			break;
+		}
 		ms = __nr_to_section(pnum);
-		usemap = usemap_map[pnum];
+		usemap = usemap_map[nr_consumed_maps];
 		if (!usemap) {
 			ms->section_mem_map = 0;
+			nr_consumed_maps++;
 			continue;
 		}
 
 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-		map = map_map[pnum];
+		map = map_map[nr_consumed_maps];
 #else
 		map = sparse_early_mem_map_alloc(pnum);
 #endif
 		if (!map) {
 			ms->section_mem_map = 0;
+			nr_consumed_maps++;
 			continue;
 		}
 
 		sparse_init_one_section(__nr_to_section(pnum), pnum, map,
 								usemap);
+		nr_consumed_maps++;
 	}
 
 	vmemmap_populate_print_last();
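The correctness of both the smaller arrays and the safety check hinges
on nr_present_sections, which, as Oscar notes above, is incremented in
memory_present() by an earlier patch in this series. That patch is not
shown on this page; the sketch below is a reconstruction of the idea
under that assumption, not a quote of it:

/*
 * Sketch (assumption, reconstructed from the discussion above):
 * memory_present() increments nr_present_sections once for every
 * section it marks with SECTION_MARKED_PRESENT, so a later
 * for_each_present_section_nr() walk visits exactly that many
 * sections.
 */
static int __initdata nr_present_sections;

void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

	start &= PAGE_SECTION_MASK;
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			section_mark_present(ms);
			nr_present_sections++;	/* one per present section */
		}
	}
}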