diff mbox

[v3] mm: make expand_downwards symmetrical to expand_upwards

Message ID alpine.DEB.2.00.1104201530430.13948@chino.kir.corp.google.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

David Rientjes April 20, 2011, 11:12 p.m. UTC
On Wed, 20 Apr 2011, James Bottomley wrote:

> > This is probably because the parisc's DISCONTIGMEM memory ranges don't 
> > have bits set in N_NORMAL_MEMORY.
> > 
> > diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
> > --- a/arch/parisc/mm/init.c
> > +++ b/arch/parisc/mm/init.c
> > @@ -266,8 +266,10 @@ static void __init setup_bootmem(void)
> >  	}
> >  	memset(pfnnid_map, 0xff, sizeof(pfnnid_map));
> >  
> > -	for (i = 0; i < npmem_ranges; i++)
> > +	for (i = 0; i < npmem_ranges; i++) {
> > +		node_set_state(i, N_NORMAL_MEMORY);
> >  		node_set_online(i);
> > +	}
> >  #endif
> 
> Yes, this seems to be the missing piece that gets it to boot.  We really
> need this in generic code, unless someone wants to run through all the
> other arch's doing it ...
> 

Looking at all other architectures that allow ARCH_DISCONTIGMEM_ENABLE, we 
already know x86 is fine, avr32 disables ARCH_DISCONTIGMEM_ENABLE entirely 
because its code only brings online node 0, and tile already sets the bit 
in N_NORMAL_MEMORY correctly when bringing a node online, probably because 
it was introduced after the various node state masks were added in 
7ea1530ab3fd back in October 2007.

So we're really only talking about alpha, ia64, m32r, m68k, and mips and 
it only seems to matter when using CONFIG_SLUB, which isn't surprising 
when grepping for it:

	$ grep -r N_NORMAL_MEMORY mm/*
	mm/memcontrol.c:	if (!node_state(node, N_NORMAL_MEMORY))
	mm/memcontrol.c:		if (!node_state(node, N_NORMAL_MEMORY))
	mm/page_alloc.c:	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
	mm/page_alloc.c:			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:		for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:		for_each_node_state(node, N_NORMAL_MEMORY) {
	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY)

Those memory controller occurrences only result in it passing a node id of 
-1 to kmalloc_node() which means no specific node target, and that's fine 
for DISCONTIGMEM since we don't care about any proximity between memory 
ranges.

This should fix the remaining architectures so they can use CONFIG_SLUB, 
but I hope it can be tested by the individual arch maintainers like you 
did for parisc.

--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Motohiro KOSAKI April 21, 2011, 1:16 p.m. UTC | #1
> On Wed, 20 Apr 2011, James Bottomley wrote:
> 
> > > This is probably because the parisc's DISCONTIGMEM memory ranges don't 
> > > have bits set in N_NORMAL_MEMORY.
> > > 
> > > diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
> > > --- a/arch/parisc/mm/init.c
> > > +++ b/arch/parisc/mm/init.c
> > > @@ -266,8 +266,10 @@ static void __init setup_bootmem(void)
> > >  	}
> > >  	memset(pfnnid_map, 0xff, sizeof(pfnnid_map));
> > >  
> > > -	for (i = 0; i < npmem_ranges; i++)
> > > +	for (i = 0; i < npmem_ranges; i++) {
> > > +		node_set_state(i, N_NORMAL_MEMORY);
> > >  		node_set_online(i);
> > > +	}
> > >  #endif
> > 
> > Yes, this seems to be the missing piece that gets it to boot.  We really
> > need this in generic code, unless someone wants to run through all the
> > other arch's doing it ...
> > 
> 
> Looking at all other architectures that allow ARCH_DISCONTIGMEM_ENABLE, we 
> already know x86 is fine, avr32 disables ARCH_DISCONTIGMEM_ENABLE entirely 
> because its code only brings online node 0, and tile already sets the bit 
> in N_NORMAL_MEMORY correctly when bringing a node online, probably because 
> it was introduced after the various node state masks were added in 
> 7ea1530ab3fd back in October 2007.
> 
> So we're really only talking about alpha, ia64, m32r, m68k, and mips and 
> it only seems to matter when using CONFIG_SLUB, which isn't surprising 
> when greping for it:
> 
> 	$ grep -r N_NORMAL_MEMORY mm/*
> 	mm/memcontrol.c:	if (!node_state(node, N_NORMAL_MEMORY))
> 	mm/memcontrol.c:		if (!node_state(node, N_NORMAL_MEMORY))
> 	mm/page_alloc.c:	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
> 	mm/page_alloc.c:			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
> 	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:		for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:		for_each_node_state(node, N_NORMAL_MEMORY) {
> 	mm/slub.c:	for_each_node_state(node, N_NORMAL_MEMORY)
> 
> Those memory controller occurrences only result in it passing a node id of 
> -1 to kmalloc_node() which means no specific node target, and that's fine 
> for DISCONTIGMEM since we don't care about any proximity between memory 
> ranges.
> 
> This should fix the remaining architectures so they can use CONFIG_SLUB, 
> but I hope it can be tested by the individual arch maintainers like you 
> did for parisc.

ia64 and mips have CONFIG_ARCH_POPULATES_NODE_MAP, which initializes
N_NORMAL_MEMORY automatically, if my understanding is correct.
(Please see free_area_init_nodes.)

I guess alpha and m32r have no active developers. Only m68k seems to need a
fix, and there we have a chance to get a review...



--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley April 21, 2011, 4:37 p.m. UTC | #2
On Thu, 2011-04-21 at 22:16 +0900, KOSAKI Motohiro wrote:
> > This should fix the remaining architectures so they can use CONFIG_SLUB, 
> > but I hope it can be tested by the individual arch maintainers like you 
> > did for parisc.
> 
> ia64 and mips have CONFIG_ARCH_POPULATES_NODE_MAP and it initialize
> N_NORMAL_MEMORY automatically if my understand is correct.
> (plz see free_area_init_nodes)
> 
> I guess alpha and m32r have no active developrs. only m68k seems to be need
> fix and we have a chance to get a review... 

Actually, it's not quite a fix yet, I'm afraid.  I've just been
investigating why my main 4 way box got slower with kernel builds:
Apparently userspace processes are now all stuck on CPU0, so we're
obviously tripping over some NUMA scheduling stuff that's missing.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) April 21, 2011, 6:33 p.m. UTC | #3
On Thu, 21 Apr 2011, James Bottomley wrote:

> On Thu, 2011-04-21 at 22:16 +0900, KOSAKI Motohiro wrote:
> > > This should fix the remaining architectures so they can use CONFIG_SLUB,
> > > but I hope it can be tested by the individual arch maintainers like you
> > > did for parisc.
> >
> > ia64 and mips have CONFIG_ARCH_POPULATES_NODE_MAP and it initialize
> > N_NORMAL_MEMORY automatically if my understand is correct.
> > (plz see free_area_init_nodes)
> >
> > I guess alpha and m32r have no active developrs. only m68k seems to be need
> > fix and we have a chance to get a review...
>
> Actually, it's not quite a fix yet, I'm afraid.  I've just been
> investigating why my main 4 way box got slower with kernel builds:
> Apparently userspace processes are now all stuck on CPU0, so we're
> obviously tripping over some NUMA scheduling stuff that's missing.

The simplest solution may be to move these arches to use SPARSE instead.
AFAICT this was relatively easy for the arm guys.

Here is a short guide on how to do that from the mips people:

http://www.linux-mips.org/archives/linux-mips/2008-08/msg00154.html

http://mytechkorner.blogspot.com/2010/12/sparsemem.html

Dave Hansen, Mel: Can you provide us with some help? (It's Easter and so
the Europeans may be off for a while)
--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Hansen April 21, 2011, 6:45 p.m. UTC | #4
On Thu, 2011-04-21 at 13:33 -0500, Christoph Lameter wrote:
> http://www.linux-mips.org/archives/linux-mips/2008-08/msg00154.html
> 
> http://mytechkorner.blogspot.com/2010/12/sparsemem.html
> 
> Dave Hansen, Mel: Can you provide us with some help? (Its Easter and so
> the europeans may be off for awhile) 

Yup, for sure.  It's also interesting how much code ppc64 removed when
they did this:

http://lists.ozlabs.org/pipermail/linuxppc64-dev/2005-November/006646.html

Please cc me on patches.  Or, if nobody else was planning on doing it, I
can take a stab at doing SPARSEMEM on one of the arches.  I won't be
able to _run_ it outside of qemu, but it might be quicker than someone
starting from scratch.

Was it really just m68k and parisc that need immediate attention?

-- Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Rientjes April 21, 2011, 7:33 p.m. UTC | #5
On Thu, 21 Apr 2011, KOSAKI Motohiro wrote:

> ia64 and mips have CONFIG_ARCH_POPULATES_NODE_MAP and it initialize
> N_NORMAL_MEMORY automatically if my understand is correct.
> (plz see free_area_init_nodes)
> 

ia64 doesn't enable CONFIG_HIGHMEM, so it never gets set via this generic 
code; mips also doesn't enable it for all configs even for 32-bit.

So we'll either want to take check_for_regular_memory() out from under 
CONFIG_HIGHMEM and do it for all configs or teach slub to use 
N_HIGH_MEMORY rather than N_NORMAL_MEMORY.
--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley April 21, 2011, 8:05 p.m. UTC | #6
On Thu, 2011-04-21 at 13:33 -0500, Christoph Lameter wrote:
> On Thu, 21 Apr 2011, James Bottomley wrote:
> 
> > On Thu, 2011-04-21 at 22:16 +0900, KOSAKI Motohiro wrote:
> > > > This should fix the remaining architectures so they can use CONFIG_SLUB,
> > > > but I hope it can be tested by the individual arch maintainers like you
> > > > did for parisc.
> > >
> > > ia64 and mips have CONFIG_ARCH_POPULATES_NODE_MAP and it initialize
> > > N_NORMAL_MEMORY automatically if my understand is correct.
> > > (plz see free_area_init_nodes)
> > >
> > > I guess alpha and m32r have no active developrs. only m68k seems to be need
> > > fix and we have a chance to get a review...
> >
> > Actually, it's not quite a fix yet, I'm afraid.  I've just been
> > investigating why my main 4 way box got slower with kernel builds:
> > Apparently userspace processes are now all stuck on CPU0, so we're
> > obviously tripping over some NUMA scheduling stuff that's missing.
> 
> The simplest solution may be to move these arches to use SPARSE instead.
> AFAICT this was relatively easy for the arm guys.
> 
> Here is short guide on how to do that from the mips people:
> 
> http://www.linux-mips.org/archives/linux-mips/2008-08/msg00154.html
> 
> http://mytechkorner.blogspot.com/2010/12/sparsemem.html
> 
> Dave Hansen, Mel: Can you provide us with some help? (Its Easter and so
> the europeans may be off for awhile)

It sort of depends on your definition of easy.  The problem going from
DISCONTIGMEM to SPARSEMEM is sorting out the section size (the minimum
indivisible size for a sectional_mem_map array) and also deciding on
whether you need SPARSEMEM_EXTREME (discontigmem allows arbitrarily
different sizes for each contiguous region) or
ARCH_HAS_HOLES_MEMORYMODEL (allows empty mem_map regions as well).  I
suspect most architectures will want SPARSEMEM_EXTREME (it means that
the section array isn't fully populated) because the gaps can be huge
(we've got a 64GB gap on parisc).

However, even though I think we can do this going forwards ... I don't
think we can backport it as a bug fix for the slub panic.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) April 21, 2011, 9:07 p.m. UTC | #7
On Thu, 21 Apr 2011, James Bottomley wrote:

> > Dave Hansen, Mel: Can you provide us with some help? (Its Easter and so
> > the europeans may be off for awhile)
>
> It sort of depends on your definition of easy.  The problem going from
> DISCONTIGMEM to SPARSEMEM is sorting out the section size (the minimum
> indivisible size for a sectional_mem_map array) and also deciding on
> whether you need SPARSEMEM_EXTREME (discontigmem allows arbitrarily
> different sizes for each contiguous region) or
> ARCH_HAS_HOLES_MEMORYMODEL (allows empty mem_map regions as well).  I
> suspect most architectures will want SPARSEMEM_EXTREME (it means that
> the section array isn't fully populated) because the gaps can be huge
> (we've got a 64GB gap on parisc).

Well my favorite is SPARSEMEM_VMEMMAP because it allows page level holes
and uses the TLB (via page tables) to avoid lookups in the SPARSE maps but
that is likely not going to be in an initial fix.

> However, even though I think we can do this going forwards ... I don't
> think we can backport it as a bug fix for the slub panic.

So far there seems to be no other solution that will fix the issues
cleanly since we have a clash of the notions of a node in !NUMA between
core and discontig. Which is a pretty basic thing to get wrong.

If we can avoid all the fancy stuff and Dave can just get a minimal SPARSE
config going then this may be the best solution for stable as well.

But then these configs have been broken for years and no one noticed. This
means the users of these arches likely have been running a subset of
kernel functionality. I suspect they have never freed memory from
DISCONTIG node 1 and higher without CONFIG_DEBUG_VM on. Otherwise I
cannot explain why the VM_BUG_ONs did not trigger in
mm/page_alloc.c:move_freepages() that should have been brought to the MM
developers attention.

This set of circumstances leads to the suspicion that there were only
tests run that showed that the kernel booted. Higher node memory was never
touched and the MM code was never truly exercised.

So I am not sure that there is any urgency in this matter. No one has
cared for years after all.
--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley April 21, 2011, 9:22 p.m. UTC | #8
On Thu, 2011-04-21 at 16:07 -0500, Christoph Lameter wrote:
> On Thu, 21 Apr 2011, James Bottomley wrote:
> 
> > > Dave Hansen, Mel: Can you provide us with some help? (Its Easter and so
> > > the europeans may be off for awhile)
> >
> > It sort of depends on your definition of easy.  The problem going from
> > DISCONTIGMEM to SPARSEMEM is sorting out the section size (the minimum
> > indivisible size for a sectional_mem_map array) and also deciding on
> > whether you need SPARSEMEM_EXTREME (discontigmem allows arbitrarily
> > different sizes for each contiguous region) or
> > ARCH_HAS_HOLES_MEMORYMODEL (allows empty mem_map regions as well).  I
> > suspect most architectures will want SPARSEMEM_EXTREME (it means that
> > the section array isn't fully populated) because the gaps can be huge
> > (we've got a 64GB gap on parisc).
> 
> Well my favorite is SPARSEMEM_VMEMMAP because it allows page level holes
> and uses the TLB (via page tables) to avoid lookups in the SPARSE maps but
> that is likely not going to be in an initial fix.

Really, no ... that requires additional pte insertion logic and some
other stuff that's nasty to craft and requires significant testing.

> > However, even though I think we can do this going forwards ... I don't
> > think we can backport it as a bug fix for the slub panic.
> 
> So far there seems to be no other solution that will fix the issues
> cleanly since we have a clash of the notions of a node in !NUMA between
> core and discontig. Which is a pretty basic thing to get wrong.

Yes there is ... there's the slub patch or the marking as broken.
Either are much simpler.

> If we can avoid all the fancy stuff and Dave can just get a minimal SPARSE
> config going then this may be the best solution for stable as well.
> 
> But then these configs have been broken for years and no one noticed. This
> means the users of these arches likely have been running a subset of
> kernel functionality. I suspect they have never freed memory from
> DISCONTIG node 1 and higher without CONFIG_DEBUG_VM on. Otherwise I
> cannot explain why the VM_BUG_ONs did not trigger in
> mm/page_alloc.c:move_freepages() that should have been brought to the MM
> developers attention.

Yes they have.  As willy said, they've just never been run with DEBUG_VM
or HUGEPAGES or, until recently, SLUB.  The test boxes (at least for
parisc) get hammered quite a lot to flush out coherency issues.  That's
why I'm confident this panic only triggers for slub.  I found the panic
within about two days of turning SLUB on.

> This set of circumstances leads to the suspicion that there were only
> tests run that showed that the kernel booted. Higher node memory was never
> touched and the MM code was never truly exercised.

Look, try to stay on point with logic: they have been extensively
tested, just not in the slub configuration, which is the only one that
crashes.  As I explained (several times) we're just now picking up slub
because debian now enables it by default.

> So I am not sure that there is any urgency in this matter. No one has
> cared for years after all.

If we didn't care, we wouldn't be making all this fuss.  It's only a
couple of days since the bug was reported, which should indicate the
high importance attached to it (well, by everyone except you,
apparently).

James


--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Motohiro KOSAKI April 22, 2011, 12:34 a.m. UTC | #9
> On Thu, 21 Apr 2011, KOSAKI Motohiro wrote:
> 
> > ia64 and mips have CONFIG_ARCH_POPULATES_NODE_MAP and it initialize
> > N_NORMAL_MEMORY automatically if my understand is correct.
> > (plz see free_area_init_nodes)
> > 
> 
> ia64 doesn't enable CONFIG_HIGHMEM, so it never gets set via this generic 
> code; mips also doesn't enable it for all configs even for 32-bit.
> 
> So we'll either want to take check_for_regular_memory() out from under 
> CONFIG_HIGHMEM and do it for all configs or teach slub to use 
> N_HIGH_MEMORY rather than N_NORMAL_MEMORY.

Hey, I already explained this.

If CONFIG_HIGHMEM=n, N_HIGH_MEMORY and N_NORMAL_MEMORY share the
same value. Then,
	node_set_state(nid, N_HIGH_MEMORY) in free_area_init_nodes()

means setting both N_HIGH_MEMORY and N_NORMAL_MEMORY.


--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley April 22, 2011, 6:19 p.m. UTC | #10
On Thu, 2011-04-21 at 11:45 -0700, Dave Hansen wrote:
> On Thu, 2011-04-21 at 13:33 -0500, Christoph Lameter wrote:
> > http://www.linux-mips.org/archives/linux-mips/2008-08/msg00154.html
> > 
> > http://mytechkorner.blogspot.com/2010/12/sparsemem.html
> > 
> > Dave Hansen, Mel: Can you provide us with some help? (Its Easter and so
> > the europeans may be off for awhile) 
> 
> Yup, for sure.  It's also interesting how much code ppc64 removed when
> they did this:
> 
> http://lists.ozlabs.org/pipermail/linuxppc64-dev/2005-November/006646.html

I looked at converting parisc to sparsemem and there's one problem that
none of these cover.  How do you set up bootmem?  If I look at the
examples, they all seem to have enough memory in the first range to
allocate from, so there's no problem.  On parisc, with discontigmem, we
set up all of our ranges as bootmem (we can do this because we
effectively have one node per range).  Obviously, since sparsemem has a
single bitmap for all of the bootmem, we can no longer allocate all of
our memory to it (well, without exploding because some of our gaps are
gigabytes big).  How does everyone cope with this (do you search for
your largest range and use that as bootmem or something)?

James


If 

--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Hansen April 22, 2011, 8:24 p.m. UTC | #11
On Fri, 2011-04-22 at 13:19 -0500, James Bottomley wrote:
> I looked at converting parisc to sparsemem and there's one problem that
> none of these cover.  How do you set up bootmem?  If I look at the
> examples, they all seem to have enough memory in the first range to
> allocate from, so there's no problem.  On parisc, with discontigmem, we
> set up all of our ranges as bootmem (we can do this because we
> effectively have one node per range).  Obviously, since sparsemem has a
> single bitmap for all of the bootmem, we can no longer allocate all of
> our memory to it (well, without exploding because some of our gaps are
> gigabytes big).  How does everyone cope with this (do you search for
> your largest range and use that as bootmem or something)? 

Sparsemem is purely post-bootmem.  It doesn't deal with sparse
bootmem. :(

That said, I'm not sure you're in trouble.  One bit of bitmap covers 4k
(with 4k pages of course) of memory, one byte covers 32k, and a 32MB
bitmap can cover 1TB of address space.  It explodes, but I think it's
manageable.  It hasn't been a problem enough up to this point to go fix
it.

-- Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley April 22, 2011, 8:35 p.m. UTC | #12
On Fri, 2011-04-22 at 13:24 -0700, Dave Hansen wrote:
> On Fri, 2011-04-22 at 13:19 -0500, James Bottomley wrote:
> > I looked at converting parisc to sparsemem and there's one problem that
> > none of these cover.  How do you set up bootmem?  If I look at the
> > examples, they all seem to have enough memory in the first range to
> > allocate from, so there's no problem.  On parisc, with discontigmem, we
> > set up all of our ranges as bootmem (we can do this because we
> > effectively have one node per range).  Obviously, since sparsemem has a
> > single bitmap for all of the bootmem, we can no longer allocate all of
> > our memory to it (well, without exploding because some of our gaps are
> > gigabytes big).  How does everyone cope with this (do you search for
> > your largest range and use that as bootmem or something)? 
> 
> Sparsemem is purely post-bootmem.  It doesn't deal with sparse
> bootmem. :(

Well, this is enabled in discontigmem, sigh.

> That said, I'm not sure you're in trouble.  One bit of bitmap covers 4k
> (with 4k pages of course) of memory, one byte covers 32k, and A 32MB
> bitmap can cover 1TB of address space.  It explodes, but I think it's
> manageable.  It hasn't been a problem enough up to this point to go fix
> it.

I think the platform limited physical address range is 42 bits, so I
suppose that's 128MB ... hopefully we should have that as a contiguous
range from the end of the loaded kernel.  We're lucky they didn't enable
the full ZX1 address range; that would have been 48 bits (or a whole
gigabyte just for the bitmap).

James


--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley April 22, 2011, 9:33 p.m. UTC | #13
On Thu, 2011-04-21 at 11:45 -0700, Dave Hansen wrote:
> On Thu, 2011-04-21 at 13:33 -0500, Christoph Lameter wrote:
> > http://www.linux-mips.org/archives/linux-mips/2008-08/msg00154.html
> > 

By the way, this reference is actively wrong for parisc (having just
debugged the problem).  The basic issue is that until we start paging,
we have the kernel and some memory beyond it barely covered with the pg0
page table set up in head.S.  On our systems, that extends out to 16MB.
SPARSEMEM is much more bootmem resource greedy than DISCONTIGMEM, so if
we actually call sparse_init() before we have the page tables set up, we
fall off the end of our 16MB mapping and go boom.  For us, therefore, we
can't call sparse_init() until we have our proper page tables in place.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Hansen April 27, 2011, 4:36 p.m. UTC | #14
On Sat, 2011-04-23 at 13:34 -0500, James Bottomley wrote: 
> This is the preliminary conversion.  It's very nasty on parisc because
> the memory allocation isn't symmetric anymore: under DISCONTIGMEM, we
> push all memory into bootmem and then let free_all_bootmem() do the
> magic for us;

Urg, that's unfortunate.  I bet we could fairly easily teach the bootmem
allocator to allow a couple of bootmem_data's to hang off of an
individual pgdat.  Put each pmem_ranges in one of those instead of a
pgdat.  That would at least help with the bitmap size explosion and
extra loops.

> now we have to do separate initialisations for ranges
> because SPARSEMEM can't do multi-range boot memory. It's also got the
> horrible hack that I only use the first found range for bootmem.  I'm
> not sure if this is correct (it won't be if the first found range can be
> under about 50MB because we'll run out of bootmem during boot) ... we
> might have to sort the ranges and use the largest, but that will involve
> us in even more hackery around the bootmem reservations code.
> 
> The boot sequence got a few seconds slower because now all of the loops
> over our pfn ranges actually have to skip through the holes (which takes
> time for 64GB).

Which iterations were these, btw?  All of the ones I saw the patch touch
seemed to be running over just a single pmem_range.

> All in all, I've not been very impressed with SPARSEMEM over
> DISCONTIGMEM.  It seems to have a lot of rough edges (necessitating
> exception code) which DISCONTIGMEM just copes with.

We definitely need to look at extending it to cover bootmem-time a bit.
Is that even worth it these days with the no-bootmem bits around?

-- Dave


--
To unsubscribe from this list: send the line "unsubscribe linux-parisc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -245,6 +245,7 @@  setup_memory_node(int nid, void *kernel_end)
 			bootmap_size, BOOTMEM_DEFAULT);
 	printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
 
+	node_set_state(nid, N_NORMAL_MEMORY);
 	node_set_online(nid);
 }
 
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -573,6 +573,8 @@  void __init find_memory(void)
 				  map>>PAGE_SHIFT,
 				  bdp->node_min_pfn,
 				  bdp->node_low_pfn);
+		if (node_present_pages(node))
+			node_set_state(node, N_NORMAL_MEMORY);
 	}
 
 	efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -247,7 +247,9 @@  void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_DISCONTIGMEM
 	nodes_clear(node_online_map);
+	node_set_state(0, N_NORMAL_MEMORY);	/* always has memory */
 	node_set_online(0);
+	node_set_state(1, N_NORMAL_MEMORY);	/* always has memory */
 	node_set_online(1);
 #endif	/* CONFIG_DISCONTIGMEM */
 
diff --git a/arch/m68k/mm/init_mm.c b/arch/m68k/mm/init_mm.c
--- a/arch/m68k/mm/init_mm.c
+++ b/arch/m68k/mm/init_mm.c
@@ -59,6 +59,8 @@  void __init m68k_setup_node(int node)
 	}
 #endif
 	pg_data_map[node].bdata = bootmem_node_data + node;
+	if (node_present_pages(node))
+		node_set_state(node, N_NORMAL_MEMORY);
 	node_set_online(node);
 }
 
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -471,6 +471,8 @@  void __init paging_init(void)
 
 		if (end_pfn > max_low_pfn)
 			max_low_pfn = end_pfn;
+		if (end_pfn > start_pfn)
+			node_set_state(node, N_NORMAL_MEMORY);
 	}
 	zones_size[ZONE_NORMAL] = max_low_pfn;
 	free_area_init_nodes(zones_size);