diff mbox series

[2/2] fbcon: Defer console takeover for splash screens to first switch

Message ID 20240202085408.23251-2-daniel.van.vugt@canonical.com (mailing list archive)
State Superseded
Headers show
Series [1/2] dummycon: Add dummycon_(un)register_switch_notifier | expand

Commit Message

Daniel van Vugt Feb. 2, 2024, 8:53 a.m. UTC
Until now, deferred console takeover only meant defer until there is
output. But that risks stepping on the toes of userspace splash screens,
as console messages may appear before the splash screen. So check for the
"splash" parameter (as used by Plymouth) and if present then extend the
deferral until the first switch.

Closes: https://bugs.launchpad.net/bugs/1970069
Cc: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Daniel van Vugt <daniel.van.vugt@canonical.com>
---
 drivers/video/fbdev/core/fbcon.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

Comments

Mario Limonciello Feb. 2, 2024, 7:46 p.m. UTC | #1
On 2/2/2024 02:53, Daniel van Vugt wrote:
> Until now, deferred console takeover only meant defer until there is
> output. But that risks stepping on the toes of userspace splash screens,
> as console messages may appear before the splash screen. So check for the
> "splash" parameter (as used by Plymouth) and if present then extend the
> deferral until the first switch.
> 
> Closes: https://bugs.launchpad.net/bugs/1970069
> Cc: Mario Limonciello <mario.limonciello@amd.com>
> Signed-off-by: Daniel van Vugt <daniel.van.vugt@canonical.com>
> ---
>   drivers/video/fbdev/core/fbcon.c | 32 +++++++++++++++++++++++++++++---
>   1 file changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
> index 63af6ab034..5b9f7635f7 100644
> --- a/drivers/video/fbdev/core/fbcon.c
> +++ b/drivers/video/fbdev/core/fbcon.c
> @@ -76,6 +76,7 @@
>   #include <linux/crc32.h> /* For counting font checksums */
>   #include <linux/uaccess.h>
>   #include <asm/irq.h>
> +#include <asm/cmdline.h>
>   
>   #include "fbcon.h"
>   #include "fb_internal.h"
> @@ -146,6 +147,7 @@ static inline void fbcon_map_override(void)
>   
>   #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>   static bool deferred_takeover = true;
> +static int initial_console = -1;
>   #else
>   #define deferred_takeover false
>   #endif
> @@ -3341,7 +3343,7 @@ static void fbcon_register_existing_fbs(struct work_struct *work)
>   	console_unlock();
>   }
>   
> -static struct notifier_block fbcon_output_nb;
> +static struct notifier_block fbcon_output_nb, fbcon_switch_nb;
>   static DECLARE_WORK(fbcon_deferred_takeover_work, fbcon_register_existing_fbs);
>   
>   static int fbcon_output_notifier(struct notifier_block *nb,
> @@ -3358,6 +3360,21 @@ static int fbcon_output_notifier(struct notifier_block *nb,
>   
>   	return NOTIFY_OK;
>   }
> +
> +static int fbcon_switch_notifier(struct notifier_block *nb,
> +				 unsigned long action, void *data)
> +{
> +	struct vc_data *vc = data;
> +
> +	WARN_CONSOLE_UNLOCKED();
> +
> +	if (vc->vc_num != initial_console) {
> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
> +		dummycon_register_output_notifier(&fbcon_output_nb);
> +	}
> +
> +	return NOTIFY_OK;
> +}
>   #endif
>   
>   static void fbcon_start(void)
> @@ -3370,7 +3387,14 @@ static void fbcon_start(void)
>   
>   	if (deferred_takeover) {
>   		fbcon_output_nb.notifier_call = fbcon_output_notifier;
> -		dummycon_register_output_notifier(&fbcon_output_nb);
> +		fbcon_switch_nb.notifier_call = fbcon_switch_notifier;
> +		initial_console = fg_console;
> +
> +		if (cmdline_find_option_bool(boot_command_line, "splash"))
> +			dummycon_register_switch_notifier(&fbcon_switch_nb);

So there is a problem here: this would only apply if the distro
happened to use "splash", whereas some distros use something different.

I looked at the matching plymouth code [1] and they have a bunch of 
variations of what they accept and what it does.

[1] 
https://gitlab.freedesktop.org/plymouth/plymouth/-/blob/main/src/main.c?ref_type=heads#L888

If from the kernel side we're going to have code that caters to the 
userspace behavior of plymouth we probably need to match all those cases 
they do too.

Another alternative could be to make it a kernel configuration option 
for which string to look for to activate this behavior.

> +		else
> +			dummycon_register_output_notifier(&fbcon_output_nb);
> +
>   		return;
>   	}
>   #endif
> @@ -3417,8 +3441,10 @@ void __exit fb_console_exit(void)
>   {
>   #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>   	console_lock();
> -	if (deferred_takeover)
> +	if (deferred_takeover) {
>   		dummycon_unregister_output_notifier(&fbcon_output_nb);
> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
> +	}
>   	console_unlock();
>   
>   	cancel_work_sync(&fbcon_deferred_takeover_work);
Hans de Goede Feb. 26, 2024, 6:23 p.m. UTC | #2
Hi All,

On 2/2/24 09:53, Daniel van Vugt wrote:
> Until now, deferred console takeover only meant defer until there is
> output. But that risks stepping on the toes of userspace splash screens,
> as console messages may appear before the splash screen. So check for the
> "splash" parameter (as used by Plymouth) and if present then extend the
> deferral until the first switch.

Daniel, thank you for your patch but I do not believe that this
is the right solution. Deferring fbcon takeover further than
after the first text is output means that any errors about e.g.
a corrupt initrd or the kernel erroring out / crashing will not
be visible.

When the kernel e.g. oopses or panics because of not finding
its rootfs (I tested the latter option when writing the original
deferred fbcon takeover code) then fbcon must takeover and
print the messages from the dying kernel so that the user has
some notion of what is going wrong.

And since your patch seems to delay switching till the first
vc-switch this means that e.g. even after say gdm refusing
to start because of issues there still will be no text
output. This makes debugging various issues much harder.

Moreover Fedora has been doing flickerfree boot for many
years without needing this.

The kernel itself will be quiet as long as you set
CONFIG_CONSOLE_LOGLEVEL_QUIET=3 Ubuntu atm has set this
to 4 which means any kernel pr_err() or dev_err()
messages will get through and since there are quite
a few false positives of those Ubuntu really needs
to set CONFIG_CONSOLE_LOGLEVEL_QUIET=3 to fix part of:
https://bugs.launchpad.net/bugs/1970069

After that it is "just" a matter of not making userspace
output anything unless it has errors to report.

systemd already is quiet by default (only logging
errors) when quiet is on the kernel commandline.

So any remaining issues are Ubuntu specific boot
process bits and Ubuntu really should be able to
make those be silent unless they have important
info (errors or other unexpected things) to report.

Given that this will make debugging boot issues
much harder and that there are other IMHO better
alternatives I'm nacking this patch: NACK.

FWIW I believe that I'm actually saving Ubuntu
from shooting themselves in the foot here,
hiding all sort of boot errors (like the initrd
not finding /) until the user does a magic
alt+f2 followed by alt+f1 incantation really is
not doing yourself any favors wrt debugging any
sort of boot failures.

Regards,

Hans





> Closes: https://bugs.launchpad.net/bugs/1970069
> Cc: Mario Limonciello <mario.limonciello@amd.com>
> Signed-off-by: Daniel van Vugt <daniel.van.vugt@canonical.com>
> ---
>  drivers/video/fbdev/core/fbcon.c | 32 +++++++++++++++++++++++++++++---
>  1 file changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
> index 63af6ab034..5b9f7635f7 100644
> --- a/drivers/video/fbdev/core/fbcon.c
> +++ b/drivers/video/fbdev/core/fbcon.c
> @@ -76,6 +76,7 @@
>  #include <linux/crc32.h> /* For counting font checksums */
>  #include <linux/uaccess.h>
>  #include <asm/irq.h>
> +#include <asm/cmdline.h>
>  
>  #include "fbcon.h"
>  #include "fb_internal.h"
> @@ -146,6 +147,7 @@ static inline void fbcon_map_override(void)
>  
>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>  static bool deferred_takeover = true;
> +static int initial_console = -1;
>  #else
>  #define deferred_takeover false
>  #endif
> @@ -3341,7 +3343,7 @@ static void fbcon_register_existing_fbs(struct work_struct *work)
>  	console_unlock();
>  }
>  
> -static struct notifier_block fbcon_output_nb;
> +static struct notifier_block fbcon_output_nb, fbcon_switch_nb;
>  static DECLARE_WORK(fbcon_deferred_takeover_work, fbcon_register_existing_fbs);
>  
>  static int fbcon_output_notifier(struct notifier_block *nb,
> @@ -3358,6 +3360,21 @@ static int fbcon_output_notifier(struct notifier_block *nb,
>  
>  	return NOTIFY_OK;
>  }
> +
> +static int fbcon_switch_notifier(struct notifier_block *nb,
> +				 unsigned long action, void *data)
> +{
> +	struct vc_data *vc = data;
> +
> +	WARN_CONSOLE_UNLOCKED();
> +
> +	if (vc->vc_num != initial_console) {
> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
> +		dummycon_register_output_notifier(&fbcon_output_nb);
> +	}
> +
> +	return NOTIFY_OK;
> +}
>  #endif
>  
>  static void fbcon_start(void)
> @@ -3370,7 +3387,14 @@ static void fbcon_start(void)
>  
>  	if (deferred_takeover) {
>  		fbcon_output_nb.notifier_call = fbcon_output_notifier;
> -		dummycon_register_output_notifier(&fbcon_output_nb);
> +		fbcon_switch_nb.notifier_call = fbcon_switch_notifier;
> +		initial_console = fg_console;
> +
> +		if (cmdline_find_option_bool(boot_command_line, "splash"))
> +			dummycon_register_switch_notifier(&fbcon_switch_nb);
> +		else
> +			dummycon_register_output_notifier(&fbcon_output_nb);
> +
>  		return;
>  	}
>  #endif
> @@ -3417,8 +3441,10 @@ void __exit fb_console_exit(void)
>  {
>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>  	console_lock();
> -	if (deferred_takeover)
> +	if (deferred_takeover) {
>  		dummycon_unregister_output_notifier(&fbcon_output_nb);
> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
> +	}
>  	console_unlock();
>  
>  	cancel_work_sync(&fbcon_deferred_takeover_work);
Daniel van Vugt Feb. 27, 2024, 1:06 a.m. UTC | #3
On 27/2/24 02:23, Hans de Goede wrote:
> Hi All,
> 
> On 2/2/24 09:53, Daniel van Vugt wrote:
>> Until now, deferred console takeover only meant defer until there is
>> output. But that risks stepping on the toes of userspace splash screens,
>> as console messages may appear before the splash screen. So check for the
>> "splash" parameter (as used by Plymouth) and if present then extend the
>> deferral until the first switch.
> 
> Daniel, thank you for your patch but I do not believe that this
> is the right solution. Deferring fbcon takeover further then
> after the first text is output means that any errors about e.g.
> a corrupt initrd or the kernel erroring out / crashing will not
> be visible.

That's not really correct. If a boot failure has occurred after the splash then
pressing escape shows the log. If a boot failure has occurred before the splash
then it can be debugged visually by rebooting without the "splash" parameter.

> 
> When the kernel e.g. oopses or panics because of not finding
> its rootfs (I tested the latter option when writing the original
> deferred fbcon takeover code) then fbcon must takeover and
> print the messages from the dying kernel so that the user has
> some notion of what is going wrong.

Indeed, just reboot without the "splash" parameter to do that.

> 
> And since your patch seems to delay switching till the first
> vc-switch this means that e.g. even after say gdm refusing
> to start because of issues there still will be no text
> output. This makes debugging various issues much harder.

I've debugged many gdm failures and it is never useful to use the console for
those. Reboot and get the system journal instead.

> 
> Moreover Fedora has been doing flickerfree boot for many
> years without needing this.

I believe Fedora has a mostly working solution, but not totally reliable, as
mentioned in the commit message:

"even systems whose splash exists in initrd may not be not immune because they
 still rely on racing against all possible kernel messages that might
 trigger the fbcon takeover"

> 
> The kernel itself will be quiet as long as you set
> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 Ubuntu atm has set this
> to 4 which means any kernel pr_err() or dev_err()
> messages will get through and since there are quite
> a few false positives of those Ubuntu really needs
> to set CONFIG_CONSOLE_LOGLEVEL_QUIET=3 to fix part of:
> https://bugs.launchpad.net/bugs/1970069

Incorrect. In my testing some laptops needed log level as low as 2 to go quiet.
And the Ubuntu kernel team is never going to fix all those for non-sponsored
devices.

> 
> After that it is "just" a matter of not making userspace
> output anything unless it has errors to report.
> 
> systemd already is quiet by default (only logging
> errors) when quiet is on the kernel commandline.

Unfortunately not true for Ubuntu. We carry a noisy systemd patch which I'm
told we can't remove in the short term:

https://bugs.launchpad.net/ubuntu/+source/plymouth/+bug/1970069/comments/39

> 
> So any remaining issues are Ubuntu specific boot
> process bits and Ubuntu really should be able to
> make those by silent unless they have important
> info (errors or other unexpected things) to report.
> 
> Given that this will make debugging boot issues
> much harder and that there are other IMHO better
> alternatives I'm nacking this patch: NACK.
> 
> FWIW I believe that I'm actually saving Ubuntu
> from shooting themselves in the foot here,
> hiding all sort of boot errors (like the initrd
> not finding /) until the user does a magic
> alt+f2 followed by alt+f1 incantation really is
> not doing yourself any favors wrt debugging any
> sort of boot failures.
> 
> Regards,
> 
> Hans

Thanks for your input, but I respectfully disagree and did consider these
points already.

- Daniel

> 
> 
> 
> 
> 
>> Closes: https://bugs.launchpad.net/bugs/1970069
>> Cc: Mario Limonciello <mario.limonciello@amd.com>
>> Signed-off-by: Daniel van Vugt <daniel.van.vugt@canonical.com>
>> ---
>>  drivers/video/fbdev/core/fbcon.c | 32 +++++++++++++++++++++++++++++---
>>  1 file changed, 29 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
>> index 63af6ab034..5b9f7635f7 100644
>> --- a/drivers/video/fbdev/core/fbcon.c
>> +++ b/drivers/video/fbdev/core/fbcon.c
>> @@ -76,6 +76,7 @@
>>  #include <linux/crc32.h> /* For counting font checksums */
>>  #include <linux/uaccess.h>
>>  #include <asm/irq.h>
>> +#include <asm/cmdline.h>
>>  
>>  #include "fbcon.h"
>>  #include "fb_internal.h"
>> @@ -146,6 +147,7 @@ static inline void fbcon_map_override(void)
>>  
>>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>  static bool deferred_takeover = true;
>> +static int initial_console = -1;
>>  #else
>>  #define deferred_takeover false
>>  #endif
>> @@ -3341,7 +3343,7 @@ static void fbcon_register_existing_fbs(struct work_struct *work)
>>  	console_unlock();
>>  }
>>  
>> -static struct notifier_block fbcon_output_nb;
>> +static struct notifier_block fbcon_output_nb, fbcon_switch_nb;
>>  static DECLARE_WORK(fbcon_deferred_takeover_work, fbcon_register_existing_fbs);
>>  
>>  static int fbcon_output_notifier(struct notifier_block *nb,
>> @@ -3358,6 +3360,21 @@ static int fbcon_output_notifier(struct notifier_block *nb,
>>  
>>  	return NOTIFY_OK;
>>  }
>> +
>> +static int fbcon_switch_notifier(struct notifier_block *nb,
>> +				 unsigned long action, void *data)
>> +{
>> +	struct vc_data *vc = data;
>> +
>> +	WARN_CONSOLE_UNLOCKED();
>> +
>> +	if (vc->vc_num != initial_console) {
>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>> +		dummycon_register_output_notifier(&fbcon_output_nb);
>> +	}
>> +
>> +	return NOTIFY_OK;
>> +}
>>  #endif
>>  
>>  static void fbcon_start(void)
>> @@ -3370,7 +3387,14 @@ static void fbcon_start(void)
>>  
>>  	if (deferred_takeover) {
>>  		fbcon_output_nb.notifier_call = fbcon_output_notifier;
>> -		dummycon_register_output_notifier(&fbcon_output_nb);
>> +		fbcon_switch_nb.notifier_call = fbcon_switch_notifier;
>> +		initial_console = fg_console;
>> +
>> +		if (cmdline_find_option_bool(boot_command_line, "splash"))
>> +			dummycon_register_switch_notifier(&fbcon_switch_nb);
>> +		else
>> +			dummycon_register_output_notifier(&fbcon_output_nb);
>> +
>>  		return;
>>  	}
>>  #endif
>> @@ -3417,8 +3441,10 @@ void __exit fb_console_exit(void)
>>  {
>>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>  	console_lock();
>> -	if (deferred_takeover)
>> +	if (deferred_takeover) {
>>  		dummycon_unregister_output_notifier(&fbcon_output_nb);
>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>> +	}
>>  	console_unlock();
>>  
>>  	cancel_work_sync(&fbcon_deferred_takeover_work);
>
Hans de Goede Feb. 27, 2024, 1:47 p.m. UTC | #4
Hi,

On 2/27/24 02:06, Daniel van Vugt wrote:
> On 27/2/24 02:23, Hans de Goede wrote:
>> Hi All,
>>
>> On 2/2/24 09:53, Daniel van Vugt wrote:
>>> Until now, deferred console takeover only meant defer until there is
>>> output. But that risks stepping on the toes of userspace splash screens,
>>> as console messages may appear before the splash screen. So check for the
>>> "splash" parameter (as used by Plymouth) and if present then extend the
>>> deferral until the first switch.
>>
>> Daniel, thank you for your patch but I do not believe that this
>> is the right solution. Deferring fbcon takeover further then
>> after the first text is output means that any errors about e.g.
>> a corrupt initrd or the kernel erroring out / crashing will not
>> be visible.
> 
> That's not really correct. If a boot failure has occurred after the splash then
> pressing escape shows the log.

Hmm, I guess this is with the latest plymouth which has a builtin terminal
emulator for kernels without VT support? Pressing ESC does not do a VC
switch and AFAICT that is what you are triggering on to allow fbcon takeover
after this patch.

> If a boot failure has occurred before the splash
> then it can be debugged visually by rebooting without the "splash" parameter.

Which requires the user to know this and requires the user to know how to
edit kernel cmdline parameters in e.g. grub. This is not a good user
experience. We want inexperienced users to just be able to point
a phone camera at the screen and take a picture of the errors.


>> When the kernel e.g. oopses or panics because of not finding
>> its rootfs (I tested the latter option when writing the original
>> deferred fbcon takeover code) then fbcon must takeover and
>> print the messages from the dying kernel so that the user has
>> some notion of what is going wrong.
> 
> Indeed, just reboot without the "splash" parameter to do that.

Again not something beginning Linux users will be able to do,
what happened to "Ubuntu: Linux for Human Beings" ?

>> And since your patch seems to delay switching till the first
>> vc-switch this means that e.g. even after say gdm refusing
>> to start because of issues there still will be no text
>> output. This makes debugging various issues much harder.
> 
> I've debugged many gdm failures and it is never useful to use the console for
> those. Reboot and get the system journal instead.

But users will not see any errors now, meaning they don't
even know where to begin with troubleshooting ...

>> Moreover Fedora has been doing flickerfree boot for many
>> years without needing this.
> 
> I believe Fedora has a mostly working solution, but not totally reliable, as
> mentioned in the commit message:
> 
> "even systems whose splash exists in initrd may not be not immune because they
>  still rely on racing against all possible kernel messages that might
>  trigger the fbcon takeover"

Only very serious kernel errors like oopses or panics will
trigger the takeover and that is *exactly* what we want.

There is a race where plymouth may hide such very serious
messages, if plymouth does manage to start before the errors,
but that is actually an existing issue which we don't want
to make bigger by *always* hiding such errors.

>> The kernel itself will be quiet as long as you set
>> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 Ubuntu atm has set this
>> to 4 which means any kernel pr_err() or dev_err()
>> messages will get through and since there are quite
>> a few false positives of those Ubuntu really needs
>> to set CONFIG_CONSOLE_LOGLEVEL_QUIET=3 to fix part of:
>> https://bugs.launchpad.net/bugs/1970069
> 
> Incorrect. In my testing some laptops needed log level as low as 2 to go quiet.
> And the Ubuntu kernel team is never going to fix all those for non-sponsored
> devices.

Notice that atm Ubuntu's kernel is using the too high
CONFIG_CONSOLE_LOGLEVEL_QUIET=4; with
CONFIG_CONSOLE_LOGLEVEL_QUIET=3, getting any errors logged
to the console should be very very rare.

The only thing I can think of is if the kernel oopses
/ WARN()s early on but the cause is innocent enough
that the boot happily continues.

In that case actually showing the oops/WARN() is a good
thing.

For all the years Fedora has had flickerfree boot I have
seen zero bug reports about this. If you have examples
of this actually being a problem please file bugs for
them (launchpad or bugzilla.kernel.org is fine) and
then let's take a look at those bugs and fix them.

These should be so rare that I'm not worried about this
becoming a never ending list of bugs (unlike pr_err() /
dev_err() messages of which there are simply too many).

>> After that it is "just" a matter of not making userspace
>> output anything unless it has errors to report.
>>
>> systemd already is quiet by default (only logging
>> errors) when quiet is on the kernel commandline.
> 
> Unfortunately not true for Ubuntu. We carry a noisy systemd patch which I'm
> told we can't remove in the short term:
> 
> https://bugs.launchpad.net/ubuntu/+source/plymouth/+bug/1970069/comments/39

Well then make the patch less noisy? Suppressing non-error
messages unless in debug mode should be easy
even with a downstream patch.

> Thanks for your input, but I respectfully disagree and did consider these
> points already.

Sorry, but your real problem here seems to be your
noisy downstream systemd patch. I'm not going to ack
a kernel patch which I consider a bad idea because
Ubuntu has a non-standard systemd patch which is
too trigger-happy with spamming the console.

So this is still a NACK from me.

Regards,

Hans





>>> Closes: https://bugs.launchpad.net/bugs/1970069
>>> Cc: Mario Limonciello <mario.limonciello@amd.com>
>>> Signed-off-by: Daniel van Vugt <daniel.van.vugt@canonical.com>
>>> ---
>>>  drivers/video/fbdev/core/fbcon.c | 32 +++++++++++++++++++++++++++++---
>>>  1 file changed, 29 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
>>> index 63af6ab034..5b9f7635f7 100644
>>> --- a/drivers/video/fbdev/core/fbcon.c
>>> +++ b/drivers/video/fbdev/core/fbcon.c
>>> @@ -76,6 +76,7 @@
>>>  #include <linux/crc32.h> /* For counting font checksums */
>>>  #include <linux/uaccess.h>
>>>  #include <asm/irq.h>
>>> +#include <asm/cmdline.h>
>>>  
>>>  #include "fbcon.h"
>>>  #include "fb_internal.h"
>>> @@ -146,6 +147,7 @@ static inline void fbcon_map_override(void)
>>>  
>>>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>>  static bool deferred_takeover = true;
>>> +static int initial_console = -1;
>>>  #else
>>>  #define deferred_takeover false
>>>  #endif
>>> @@ -3341,7 +3343,7 @@ static void fbcon_register_existing_fbs(struct work_struct *work)
>>>  	console_unlock();
>>>  }
>>>  
>>> -static struct notifier_block fbcon_output_nb;
>>> +static struct notifier_block fbcon_output_nb, fbcon_switch_nb;
>>>  static DECLARE_WORK(fbcon_deferred_takeover_work, fbcon_register_existing_fbs);
>>>  
>>>  static int fbcon_output_notifier(struct notifier_block *nb,
>>> @@ -3358,6 +3360,21 @@ static int fbcon_output_notifier(struct notifier_block *nb,
>>>  
>>>  	return NOTIFY_OK;
>>>  }
>>> +
>>> +static int fbcon_switch_notifier(struct notifier_block *nb,
>>> +				 unsigned long action, void *data)
>>> +{
>>> +	struct vc_data *vc = data;
>>> +
>>> +	WARN_CONSOLE_UNLOCKED();
>>> +
>>> +	if (vc->vc_num != initial_console) {
>>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>>> +		dummycon_register_output_notifier(&fbcon_output_nb);
>>> +	}
>>> +
>>> +	return NOTIFY_OK;
>>> +}
>>>  #endif
>>>  
>>>  static void fbcon_start(void)
>>> @@ -3370,7 +3387,14 @@ static void fbcon_start(void)
>>>  
>>>  	if (deferred_takeover) {
>>>  		fbcon_output_nb.notifier_call = fbcon_output_notifier;
>>> -		dummycon_register_output_notifier(&fbcon_output_nb);
>>> +		fbcon_switch_nb.notifier_call = fbcon_switch_notifier;
>>> +		initial_console = fg_console;
>>> +
>>> +		if (cmdline_find_option_bool(boot_command_line, "splash"))
>>> +			dummycon_register_switch_notifier(&fbcon_switch_nb);
>>> +		else
>>> +			dummycon_register_output_notifier(&fbcon_output_nb);
>>> +
>>>  		return;
>>>  	}
>>>  #endif
>>> @@ -3417,8 +3441,10 @@ void __exit fb_console_exit(void)
>>>  {
>>>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>>  	console_lock();
>>> -	if (deferred_takeover)
>>> +	if (deferred_takeover) {
>>>  		dummycon_unregister_output_notifier(&fbcon_output_nb);
>>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>>> +	}
>>>  	console_unlock();
>>>  
>>>  	cancel_work_sync(&fbcon_deferred_takeover_work);
>>
>
Daniel van Vugt Feb. 28, 2024, 2 a.m. UTC | #5
On 27/2/24 21:47, Hans de Goede wrote:
> Hi,
> 
> On 2/27/24 02:06, Daniel van Vugt wrote:
>> On 27/2/24 02:23, Hans de Goede wrote:
>>> Hi All,
>>>
>>> On 2/2/24 09:53, Daniel van Vugt wrote:
>>>> Until now, deferred console takeover only meant defer until there is
>>>> output. But that risks stepping on the toes of userspace splash screens,
>>>> as console messages may appear before the splash screen. So check for the
>>>> "splash" parameter (as used by Plymouth) and if present then extend the
>>>> deferral until the first switch.
>>>
>>> Daniel, thank you for your patch but I do not believe that this
>>> is the right solution. Deferring fbcon takeover further then
>>> after the first text is output means that any errors about e.g.
>>> a corrupt initrd or the kernel erroring out / crashing will not
>>> be visible.
>>
>> That's not really correct. If a boot failure has occurred after the splash then
>> pressing escape shows the log.
> 
> Hmm, I guess this is with the latest plymouth which has a builtin terminal
> emulator for kernels without VT support ? Pressing ESC does not to a VC
> switch and AFAICT that is what you are triggering on to allow fbcon takeover
> after this patches.
> 
>> If a boot failure has occurred before the splash
>> then it can be debugged visually by rebooting without the "splash" parameter.
> 
> Which requires the user to know this and requires the user to know how to
> edit kernel cmdline parameters in e.g. grub. This is not a good user
> experience. We want inexperienced users to just be able to point
> a phone camera at the screen and take a picture of the errors.

As the person who contributes most to Ubuntu bug triage I have a pretty good
idea of what users experience. And when they do experience boot failures it's
either with a blank screen already (because userspace, not the kernel's fault),
or they report an error message to us that's not relevant to the real failure.

In both cases our users understand (or learn quickly) the ease with which they
can reboot either to recovery mode, or a previous kernel. We then direct them
to collect the full log of the failed boot. Because even if they were booting
with a full text console, most of those bugs don't reveal themselves on the
console. If they did then they'd be visible in the system journal along with
everything else.

What is not a "good user experience" is the boot messages people are shown on
every boot.

> 
> 
>>> When the kernel e.g. oopses or panics because of not finding
>>> its rootfs (I tested the latter option when writing the original
>>> deferred fbcon takeover code) then fbcon must takeover and
>>> print the messages from the dying kernel so that the user has
>>> some notion of what is going wrong.
>>
>> Indeed, just reboot without the "splash" parameter to do that.
> 
> Again not something beginning Linux users will be able to do,
> what happened to "Ubuntu: Linux for Human Beings" ?

It is more user-friendly than it sounds. Just reboot, trigger the grub menu and
select recovery mode or an older kernel (which is always available).

I think some boot failures also take you to the grub menu automatically next time?

> 
>>> And since your patch seems to delay switching till the first
>>> vc-switch this means that e.g. even after say gdm refusing
>>> to start because of issues there still will be no text
>>> output. This makes debugging various issues much harder.
>>
>> I've debugged many gdm failures and it is never useful to use the console for
>> those. Reboot and get the system journal instead.
> 
> But users will not see any errors now, meaning they don't
> even know where to begin with troubleshooting ...

Indeed. I deal with those users every day and they log their bugs against the
wrong components, understandably. We then work with them to triage and reassign
the issue to the right place.

> 
>>> Moreover Fedora has been doing flickerfree boot for many
>>> years without needing this.
>>
>> I believe Fedora has a mostly working solution, but not totally reliable, as
>> mentioned in the commit message:
>>
>> "even systems whose splash exists in initrd may not be not immune because they
>>  still rely on racing against all possible kernel messages that might
>>  trigger the fbcon takeover"
> 
> Only very serious kernel errors like oopses or panics will
> trigger the takeover and that is *exactly* what we want.
> 
> There is a race where plymouth may hide such vary serious
> messages, if plymouth does manage to start before the errors,
> but that is actually an existing issue which we don't want
> to make bigger by *always* hiding such errors.
> 
>>> The kernel itself will be quiet as long as you set
>>> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 Ubuntu atm has set this
>>> to 4 which means any kernel pr_err() or dev_err()
>>> messages will get through and since there are quite
>>> a few false positives of those Ubuntu really needs
>>> to set CONFIG_CONSOLE_LOGLEVEL_QUIET=3 to fix part of:
>>> https://bugs.launchpad.net/bugs/1970069
>>
>> Incorrect. In my testing some laptops needed log level as low as 2 to go quiet.
>> And the Ubuntu kernel team is never going to fix all those for non-sponsored
>> devices.
> 
> Notice that atm Ubuntu's kernel is using the too high
> CONFIG_CONSOLE_LOGLEVEL_QUIET=4 with
> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 getting any errors logged
> to the console should be very very rare.
> 
> The only thing I can think of is if the kernel oopses
> / WARN()s early on but the cause is innocent enough
> that the boot happily continues.
> 
> In that case actually showing the oops/WARN() is a good
> thing.
> 
> For all the years Fedora has had flickerfree boot I have
> seen zero bug reports about this. If you have examples
> of this actually being a problem please file bugs for
> them (launchpad or bugzilla.kernel.org is fine) and
> then lets take a look at those bugs and fix them.
> 
> These should be so rare that I'm not worried about this
> becoming a never ending list of bugs (unlike pr_err() /
> dev_err() messages of which there are simply too many).

I personally own many laptops with so many different boot messages that it's
overwhelming for me already to report bugs for each of them. Hence this patch.

Also I don't own all the laptops in the world, so fixing all the errors just
for my collection wouldn't solve all cases. Whereas this patch does.

> 
>>> After that it is "just" a matter of not making userspace
>>> output anything unless it has errors to report.
>>>
>>> systemd already is quiet by default (only logging
>>> errors) when quiet is on the kernel commandline.
>>
>> Unfortunately not true for Ubuntu. We carry a noisy systemd patch which I'm
>> told we can't remove in the short term:
>>
>> https://bugs.launchpad.net/ubuntu/+source/plymouth/+bug/1970069/comments/39
> 
> Well then make the patch less noisy? Suppressing non
> error message unless in debug mode should be easy
> even with a downstream patch.
> 
>> Thanks for your input, but I respectfully disagree and did consider these
>> points already.
> 
> Sorry, but your real problem here seems to be your
> noisy downstream systemd patch. I'm not going to ack
> a kernel patch which I consider a bad idea because
> Ubuntu has a non standard systemd patch which is
> to trigger happy with spamming the console.

Indeed the systemd patch is a big problem. We seem to have had it for 9 years
or so. I only just discovered it recently and would love to drop it, but was
told we can't. Its main problem is that it uses the console as a communication
pipe to plymouth. So simply making it less noisy isn't possible without
disabling its functionality. It was seemingly intended to run behind the
splash, but since it does fsck it tends to run before the splash (because DRM
startup takes a few more seconds).

> 
> So this is still a NACK from me.
> 
> Regards,
> 
> Hans
> 
> 
> 
> 
> 
>>>> Closes: https://bugs.launchpad.net/bugs/1970069
>>>> Cc: Mario Limonciello <mario.limonciello@amd.com>
>>>> Signed-off-by: Daniel van Vugt <daniel.van.vugt@canonical.com>
>>>> ---
>>>>  drivers/video/fbdev/core/fbcon.c | 32 +++++++++++++++++++++++++++++---
>>>>  1 file changed, 29 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
>>>> index 63af6ab034..5b9f7635f7 100644
>>>> --- a/drivers/video/fbdev/core/fbcon.c
>>>> +++ b/drivers/video/fbdev/core/fbcon.c
>>>> @@ -76,6 +76,7 @@
>>>>  #include <linux/crc32.h> /* For counting font checksums */
>>>>  #include <linux/uaccess.h>
>>>>  #include <asm/irq.h>
>>>> +#include <asm/cmdline.h>
>>>>  
>>>>  #include "fbcon.h"
>>>>  #include "fb_internal.h"
>>>> @@ -146,6 +147,7 @@ static inline void fbcon_map_override(void)
>>>>  
>>>>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>>>  static bool deferred_takeover = true;
>>>> +static int initial_console = -1;
>>>>  #else
>>>>  #define deferred_takeover false
>>>>  #endif
>>>> @@ -3341,7 +3343,7 @@ static void fbcon_register_existing_fbs(struct work_struct *work)
>>>>  	console_unlock();
>>>>  }
>>>>  
>>>> -static struct notifier_block fbcon_output_nb;
>>>> +static struct notifier_block fbcon_output_nb, fbcon_switch_nb;
>>>>  static DECLARE_WORK(fbcon_deferred_takeover_work, fbcon_register_existing_fbs);
>>>>  
>>>>  static int fbcon_output_notifier(struct notifier_block *nb,
>>>> @@ -3358,6 +3360,21 @@ static int fbcon_output_notifier(struct notifier_block *nb,
>>>>  
>>>>  	return NOTIFY_OK;
>>>>  }
>>>> +
>>>> +static int fbcon_switch_notifier(struct notifier_block *nb,
>>>> +				 unsigned long action, void *data)
>>>> +{
>>>> +	struct vc_data *vc = data;
>>>> +
>>>> +	WARN_CONSOLE_UNLOCKED();
>>>> +
>>>> +	if (vc->vc_num != initial_console) {
>>>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>>>> +		dummycon_register_output_notifier(&fbcon_output_nb);
>>>> +	}
>>>> +
>>>> +	return NOTIFY_OK;
>>>> +}
>>>>  #endif
>>>>  
>>>>  static void fbcon_start(void)
>>>> @@ -3370,7 +3387,14 @@ static void fbcon_start(void)
>>>>  
>>>>  	if (deferred_takeover) {
>>>>  		fbcon_output_nb.notifier_call = fbcon_output_notifier;
>>>> -		dummycon_register_output_notifier(&fbcon_output_nb);
>>>> +		fbcon_switch_nb.notifier_call = fbcon_switch_notifier;
>>>> +		initial_console = fg_console;
>>>> +
>>>> +		if (cmdline_find_option_bool(boot_command_line, "splash"))
>>>> +			dummycon_register_switch_notifier(&fbcon_switch_nb);
>>>> +		else
>>>> +			dummycon_register_output_notifier(&fbcon_output_nb);
>>>> +
>>>>  		return;
>>>>  	}
>>>>  #endif
>>>> @@ -3417,8 +3441,10 @@ void __exit fb_console_exit(void)
>>>>  {
>>>>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>>>  	console_lock();
>>>> -	if (deferred_takeover)
>>>> +	if (deferred_takeover) {
>>>>  		dummycon_unregister_output_notifier(&fbcon_output_nb);
>>>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>>>> +	}
>>>>  	console_unlock();
>>>>  
>>>>  	cancel_work_sync(&fbcon_deferred_takeover_work);
>>>
>>
>
Hans de Goede Feb. 28, 2024, 11:54 a.m. UTC | #6
Hi Daniel,

On 2/28/24 03:00, Daniel van Vugt wrote:
> On 27/2/24 21:47, Hans de Goede wrote:

<snip>

> I think some boot failures also take you to the grub menu automatically next time?

In Fedora all boot failures will unhide the grub menu on
the next boot. This unfortunately relies on downstream changes
so I don't know what Ubuntu does here.

<snip>

>>>> The kernel itself will be quiet as long as you set
>>>> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 Ubuntu atm has set this
>>>> to 4 which means any kernel pr_err() or dev_err()
>>>> messages will get through and since there are quite
>>>> a few false positives of those Ubuntu really needs
>>>> to set CONFIG_CONSOLE_LOGLEVEL_QUIET=3 to fix part of:
>>>> https://bugs.launchpad.net/bugs/1970069
>>>
>>> Incorrect. In my testing some laptops needed log level as low as 2 to go quiet.
>>> And the Ubuntu kernel team is never going to fix all those for non-sponsored
>>> devices.
>>
>> Notice that atm Ubuntu's kernel is using the too high
>> CONFIG_CONSOLE_LOGLEVEL_QUIET=4 with
>> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 getting any errors logged
>> to the console should be very very rare.
>>
>> The only thing I can think of is if the kernel oopses
>> / WARN()s early on but the cause is innocent enough
>> that the boot happily continues.
>>
>> In that case actually showing the oops/WARN() is a good
>> thing.
>>
>> For all the years Fedora has had flickerfree boot I have
>> seen zero bug reports about this. If you have examples
>> of this actually being a problem please file bugs for
>> them (launchpad or bugzilla.kernel.org is fine) and
>> then lets take a look at those bugs and fix them.
>>
>> These should be so rare that I'm not worried about this
>> becoming a never ending list of bugs (unlike pr_err() /
>> dev_err() messages of which there are simply too many).
> 
> I personally own many laptops with so many different boot messages that it's
> overwhelming for me already to report bugs for each of them. Hence this patch.
> 
> Also I don't own all the laptops in the world, so fixing all the errors just
> for my collection wouldn't solve all cases. Whereas this patch does.

Almost all of those boot messages are because Ubuntu has
set CONFIG_CONSOLE_LOGLEVEL_QUIET too high. Once that is fixed
there should be very little if not no messages left.

I too own many laptops and I'm not seeing this issue on
any of them.

You claim you are still seeing errors with
CONFIG_CONSOLE_LOGLEVEL_QUIET=3 yet you have not provided
a single example!

>> Sorry, but your real problem here seems to be your
>> noisy downstream systemd patch. I'm not going to ack
>> a kernel patch which I consider a bad idea because
>> Ubuntu has a non standard systemd patch which is
>> to trigger happy with spamming the console.
> 
> Indeed the systemd patch is a big problem. We seem to have had it for 9 years
> or so. I only just discovered it recently and would love to drop it, but was
> told we can't. Its main problem is that it uses the console as a communication
> pipe to plymouth. So simply making it less noisy isn't possible without
> disabling its functionality. It was seemingly intended to run behind the
> splash, but since it does fsck it tends to run before the splash (because DRM
> startup takes a few more seconds).

This does indeed sound like it is a non trivial problem to fix,
but that is still not a good reason to add this (IMHO) hack
to the kernel.

The issue deferred fbcon takeover was designed to fix is that
the fbcon would mess up the framebuffer contents even if
nothing was ever logged to the console.

The whole idea being to still have the fbcon come up
as soon as there are any messages.

Actively hiding messages was never part of the design, so
this is still a NACK from me.

Also note that this matches how things work in grub
and shim. When I first implemented flickerfree boot
I also had to patch shim and grub to not make EFI
text output protocol calls (including init()) until
they actually had some text to show.

So the whole design here for shim, grub and the kernel
has always been to not mess with the framebuffer until
there is some text (any text) to output and then show
that text immediately.

I do not think that deviating from this design is a good
idea.

Regards,

Hans



>>>>> Closes: https://bugs.launchpad.net/bugs/1970069
>>>>> Cc: Mario Limonciello <mario.limonciello@amd.com>
>>>>> Signed-off-by: Daniel van Vugt <daniel.van.vugt@canonical.com>
>>>>> ---
>>>>>  drivers/video/fbdev/core/fbcon.c | 32 +++++++++++++++++++++++++++++---
>>>>>  1 file changed, 29 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
>>>>> index 63af6ab034..5b9f7635f7 100644
>>>>> --- a/drivers/video/fbdev/core/fbcon.c
>>>>> +++ b/drivers/video/fbdev/core/fbcon.c
>>>>> @@ -76,6 +76,7 @@
>>>>>  #include <linux/crc32.h> /* For counting font checksums */
>>>>>  #include <linux/uaccess.h>
>>>>>  #include <asm/irq.h>
>>>>> +#include <asm/cmdline.h>
>>>>>  
>>>>>  #include "fbcon.h"
>>>>>  #include "fb_internal.h"
>>>>> @@ -146,6 +147,7 @@ static inline void fbcon_map_override(void)
>>>>>  
>>>>>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>>>>  static bool deferred_takeover = true;
>>>>> +static int initial_console = -1;
>>>>>  #else
>>>>>  #define deferred_takeover false
>>>>>  #endif
>>>>> @@ -3341,7 +3343,7 @@ static void fbcon_register_existing_fbs(struct work_struct *work)
>>>>>  	console_unlock();
>>>>>  }
>>>>>  
>>>>> -static struct notifier_block fbcon_output_nb;
>>>>> +static struct notifier_block fbcon_output_nb, fbcon_switch_nb;
>>>>>  static DECLARE_WORK(fbcon_deferred_takeover_work, fbcon_register_existing_fbs);
>>>>>  
>>>>>  static int fbcon_output_notifier(struct notifier_block *nb,
>>>>> @@ -3358,6 +3360,21 @@ static int fbcon_output_notifier(struct notifier_block *nb,
>>>>>  
>>>>>  	return NOTIFY_OK;
>>>>>  }
>>>>> +
>>>>> +static int fbcon_switch_notifier(struct notifier_block *nb,
>>>>> +				 unsigned long action, void *data)
>>>>> +{
>>>>> +	struct vc_data *vc = data;
>>>>> +
>>>>> +	WARN_CONSOLE_UNLOCKED();
>>>>> +
>>>>> +	if (vc->vc_num != initial_console) {
>>>>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>>>>> +		dummycon_register_output_notifier(&fbcon_output_nb);
>>>>> +	}
>>>>> +
>>>>> +	return NOTIFY_OK;
>>>>> +}
>>>>>  #endif
>>>>>  
>>>>>  static void fbcon_start(void)
>>>>> @@ -3370,7 +3387,14 @@ static void fbcon_start(void)
>>>>>  
>>>>>  	if (deferred_takeover) {
>>>>>  		fbcon_output_nb.notifier_call = fbcon_output_notifier;
>>>>> -		dummycon_register_output_notifier(&fbcon_output_nb);
>>>>> +		fbcon_switch_nb.notifier_call = fbcon_switch_notifier;
>>>>> +		initial_console = fg_console;
>>>>> +
>>>>> +		if (cmdline_find_option_bool(boot_command_line, "splash"))
>>>>> +			dummycon_register_switch_notifier(&fbcon_switch_nb);
>>>>> +		else
>>>>> +			dummycon_register_output_notifier(&fbcon_output_nb);
>>>>> +
>>>>>  		return;
>>>>>  	}
>>>>>  #endif
>>>>> @@ -3417,8 +3441,10 @@ void __exit fb_console_exit(void)
>>>>>  {
>>>>>  #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>>>>  	console_lock();
>>>>> -	if (deferred_takeover)
>>>>> +	if (deferred_takeover) {
>>>>>  		dummycon_unregister_output_notifier(&fbcon_output_nb);
>>>>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>>>>> +	}
>>>>>  	console_unlock();
>>>>>  
>>>>>  	cancel_work_sync(&fbcon_deferred_takeover_work);
>>>>
>>>
>>
>
Mario Limonciello Feb. 28, 2024, 6:09 p.m. UTC | #7
On 2/28/2024 05:54, Hans de Goede wrote:
> Hi Daniel,
> 
> On 2/28/24 03:00, Daniel van Vugt wrote:
>> On 27/2/24 21:47, Hans de Goede wrote:
> 
> <snip>
> 
>> I think some boot failures also take you to the grub menu automatically next time?
> 
> In Fedora all boot failures will unhide the grub menu on
> the next boot. This unfortunately relies on downstream changes
> so I don't know what Ubuntu does here.
> 
> <snip>
> 
>>>>> The kernel itself will be quiet as long as you set
>>>>> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 Ubuntu atm has set this
>>>>> to 4 which means any kernel pr_err() or dev_err()
>>>>> messages will get through and since there are quite
>>>>> a few false positives of those Ubuntu really needs
>>>>> to set CONFIG_CONSOLE_LOGLEVEL_QUIET=3 to fix part of:
>>>>> https://bugs.launchpad.net/bugs/1970069
>>>>
>>>> Incorrect. In my testing some laptops needed log level as low as 2 to go quiet.
>>>> And the Ubuntu kernel team is never going to fix all those for non-sponsored
>>>> devices.
>>>
>>> Notice that atm Ubuntu's kernel is using the too high
>>> CONFIG_CONSOLE_LOGLEVEL_QUIET=4 with
>>> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 getting any errors logged
>>> to the console should be very very rare.
>>>
>>> The only thing I can think of is if the kernel oopses
>>> / WARN()s early on but the cause is innocent enough
>>> that the boot happily continues.
>>>
>>> In that case actually showing the oops/WARN() is a good
>>> thing.
>>>
>>> For all the years Fedora has had flickerfree boot I have
>>> seen zero bug reports about this. If you have examples
>>> of this actually being a problem please file bugs for
>>> them (launchpad or bugzilla.kernel.org is fine) and
>>> then lets take a look at those bugs and fix them.
>>>
>>> These should be so rare that I'm not worried about this
>>> becoming a never ending list of bugs (unlike pr_err() /
>>> dev_err() messages of which there are simply too many).
>>
>> I personally own many laptops with so many different boot messages that it's
>> overwhelming for me already to report bugs for each of them. Hence this patch.
>>
>> Also I don't own all the laptops in the world, so fixing all the errors just
>> for my collection wouldn't solve all cases. Whereas this patch does.
> 
> Almost all of those boot messages are because Ubuntu has
> set CONFIG_CONSOLE_LOGLEVEL_QUIET too high. Once that is fixed
> there should be very little of not no messages left.
> 
> I too own many laptops and I'm not seeing this issue on
> any of them.
> 
> You claim you are still seeing errors with
> CONFIG_CONSOLE_LOGLEVEL_QUIET=3 yet you have not provided
> a single example!
> 
>>> Sorry, but your real problem here seems to be your
>>> noisy downstream systemd patch. I'm not going to ack
>>> a kernel patch which I consider a bad idea because
>>> Ubuntu has a non standard systemd patch which is
>>> to trigger happy with spamming the console.
>>
>> Indeed the systemd patch is a big problem. We seem to have had it for 9 years
>> or so. I only just discovered it recently and would love to drop it, but was
>> told we can't. Its main problem is that it uses the console as a communication
>> pipe to plymouth. So simply making it less noisy isn't possible without
>> disabling its functionality. It was seemingly intended to run behind the
>> splash, but since it does fsck it tends to run before the splash (because DRM
>> startup takes a few more seconds).

This comes back to what I was saying before - Ubuntu (and anyone else 
that wants a flicker free boot for that matter) should adopt simpledrm.

When simpledrm is compiled into the kernel DRM will be up long before 
the splash screen comes up.  When drivers do fastboot (Intel) or 
seamless (AMD) handoff you /should/ be able to get the splash screen 
without a modeset.

I think doing that, together with changing the default log level not to 
show console err messages, will go a long way.

If there is a concern that people need to see those, how about changing 
the kernel command line for the recovery kernel so that they only come 
up in the recovery kernel?

> 
> This does indeed sound like it is a non trivial problem to fix,
> but that is still not a good reason to add this (IMHO) hack
> to the kernel.
> 
> The issue deferred fbcon takeover was designed to fix is that
> the fbcon would mess up the framebuffer contents even if
> nothing was ever logged to the console.
> 
> The whole idea being that to still have the fbcon come up
> as soon as there are any messages.
> 
> Actively hiding messages was never part of the design, so
> this is still a NACK from me.
> 
> Also note that this matches how things work in grub
> and shim when I first implemented flickerfree boot
> I also had to patch shim and grub to not make EFI
> text output protocol calls (including init()) until
> they actually had some text to show.
> 
> So the whole design here for shim, grub and the kernel
> has always been to not mess with the framebuffer until
> there is some text (any text) to output and then show
> that text immediately.
> 
> I do not think that deviating from this design is a good
> idea.
> 
> Regards,
> 
> Hans
> 
> 
> 
>>>>>> Closes: https://bugs.launchpad.net/bugs/1970069
>>>>>> Cc: Mario Limonciello <mario.limonciello@amd.com>
>>>>>> Signed-off-by: Daniel van Vugt <daniel.van.vugt@canonical.com>
>>>>>> ---
>>>>>>   drivers/video/fbdev/core/fbcon.c | 32 +++++++++++++++++++++++++++++---
>>>>>>   1 file changed, 29 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
>>>>>> index 63af6ab034..5b9f7635f7 100644
>>>>>> --- a/drivers/video/fbdev/core/fbcon.c
>>>>>> +++ b/drivers/video/fbdev/core/fbcon.c
>>>>>> @@ -76,6 +76,7 @@
>>>>>>   #include <linux/crc32.h> /* For counting font checksums */
>>>>>>   #include <linux/uaccess.h>
>>>>>>   #include <asm/irq.h>
>>>>>> +#include <asm/cmdline.h>
>>>>>>   
>>>>>>   #include "fbcon.h"
>>>>>>   #include "fb_internal.h"
>>>>>> @@ -146,6 +147,7 @@ static inline void fbcon_map_override(void)
>>>>>>   
>>>>>>   #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>>>>>   static bool deferred_takeover = true;
>>>>>> +static int initial_console = -1;
>>>>>>   #else
>>>>>>   #define deferred_takeover false
>>>>>>   #endif
>>>>>> @@ -3341,7 +3343,7 @@ static void fbcon_register_existing_fbs(struct work_struct *work)
>>>>>>   	console_unlock();
>>>>>>   }
>>>>>>   
>>>>>> -static struct notifier_block fbcon_output_nb;
>>>>>> +static struct notifier_block fbcon_output_nb, fbcon_switch_nb;
>>>>>>   static DECLARE_WORK(fbcon_deferred_takeover_work, fbcon_register_existing_fbs);
>>>>>>   
>>>>>>   static int fbcon_output_notifier(struct notifier_block *nb,
>>>>>> @@ -3358,6 +3360,21 @@ static int fbcon_output_notifier(struct notifier_block *nb,
>>>>>>   
>>>>>>   	return NOTIFY_OK;
>>>>>>   }
>>>>>> +
>>>>>> +static int fbcon_switch_notifier(struct notifier_block *nb,
>>>>>> +				 unsigned long action, void *data)
>>>>>> +{
>>>>>> +	struct vc_data *vc = data;
>>>>>> +
>>>>>> +	WARN_CONSOLE_UNLOCKED();
>>>>>> +
>>>>>> +	if (vc->vc_num != initial_console) {
>>>>>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>>>>>> +		dummycon_register_output_notifier(&fbcon_output_nb);
>>>>>> +	}
>>>>>> +
>>>>>> +	return NOTIFY_OK;
>>>>>> +}
>>>>>>   #endif
>>>>>>   
>>>>>>   static void fbcon_start(void)
>>>>>> @@ -3370,7 +3387,14 @@ static void fbcon_start(void)
>>>>>>   
>>>>>>   	if (deferred_takeover) {
>>>>>>   		fbcon_output_nb.notifier_call = fbcon_output_notifier;
>>>>>> -		dummycon_register_output_notifier(&fbcon_output_nb);
>>>>>> +		fbcon_switch_nb.notifier_call = fbcon_switch_notifier;
>>>>>> +		initial_console = fg_console;
>>>>>> +
>>>>>> +		if (cmdline_find_option_bool(boot_command_line, "splash"))
>>>>>> +			dummycon_register_switch_notifier(&fbcon_switch_nb);
>>>>>> +		else
>>>>>> +			dummycon_register_output_notifier(&fbcon_output_nb);
>>>>>> +
>>>>>>   		return;
>>>>>>   	}
>>>>>>   #endif
>>>>>> @@ -3417,8 +3441,10 @@ void __exit fb_console_exit(void)
>>>>>>   {
>>>>>>   #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
>>>>>>   	console_lock();
>>>>>> -	if (deferred_takeover)
>>>>>> +	if (deferred_takeover) {
>>>>>>   		dummycon_unregister_output_notifier(&fbcon_output_nb);
>>>>>> +		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
>>>>>> +	}
>>>>>>   	console_unlock();
>>>>>>   
>>>>>>   	cancel_work_sync(&fbcon_deferred_takeover_work);
>>>>>
>>>>
>>>
>>
>
diff mbox series

Patch

diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 63af6ab034..5b9f7635f7 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -76,6 +76,7 @@ 
 #include <linux/crc32.h> /* For counting font checksums */
 #include <linux/uaccess.h>
 #include <asm/irq.h>
+#include <asm/cmdline.h>
 
 #include "fbcon.h"
 #include "fb_internal.h"
@@ -146,6 +147,7 @@  static inline void fbcon_map_override(void)
 
 #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
 static bool deferred_takeover = true;
+static int initial_console = -1;
 #else
 #define deferred_takeover false
 #endif
@@ -3341,7 +3343,7 @@  static void fbcon_register_existing_fbs(struct work_struct *work)
 	console_unlock();
 }
 
-static struct notifier_block fbcon_output_nb;
+static struct notifier_block fbcon_output_nb, fbcon_switch_nb;
 static DECLARE_WORK(fbcon_deferred_takeover_work, fbcon_register_existing_fbs);
 
 static int fbcon_output_notifier(struct notifier_block *nb,
@@ -3358,6 +3360,21 @@  static int fbcon_output_notifier(struct notifier_block *nb,
 
 	return NOTIFY_OK;
 }
+
+static int fbcon_switch_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct vc_data *vc = data;
+
+	WARN_CONSOLE_UNLOCKED();
+
+	if (vc->vc_num != initial_console) {
+		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
+		dummycon_register_output_notifier(&fbcon_output_nb);
+	}
+
+	return NOTIFY_OK;
+}
 #endif
 
 static void fbcon_start(void)
@@ -3370,7 +3387,14 @@  static void fbcon_start(void)
 
 	if (deferred_takeover) {
 		fbcon_output_nb.notifier_call = fbcon_output_notifier;
-		dummycon_register_output_notifier(&fbcon_output_nb);
+		fbcon_switch_nb.notifier_call = fbcon_switch_notifier;
+		initial_console = fg_console;
+
+		if (cmdline_find_option_bool(boot_command_line, "splash"))
+			dummycon_register_switch_notifier(&fbcon_switch_nb);
+		else
+			dummycon_register_output_notifier(&fbcon_output_nb);
+
 		return;
 	}
 #endif
@@ -3417,8 +3441,10 @@  void __exit fb_console_exit(void)
 {
 #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
 	console_lock();
-	if (deferred_takeover)
+	if (deferred_takeover) {
 		dummycon_unregister_output_notifier(&fbcon_output_nb);
+		dummycon_unregister_switch_notifier(&fbcon_switch_nb);
+	}
 	console_unlock();
 
 	cancel_work_sync(&fbcon_deferred_takeover_work);