diff mbox series

[-next] PM: hibernate: add retry mechanism for S4 image loading failures

Message ID 20240501061258.2874092-1-luoxueqin@kylinos.cn (mailing list archive)
State Rejected, archived
Headers show
Series [-next] PM: hibernate: add retry mechanism for S4 image loading failures | expand

Commit Message

xueqin Luo May 1, 2024, 6:12 a.m. UTC
During the S4 resume process, there's a rare chance of image loading
failure. We provide three retries to load it successfully. If the
recovery fails after these attempts, print out the handle CRC32
value. When the CRC32 values obtained on each retry are inconsistent,
there may be a memory anomaly. When the values are consistent, it
might indicate corrupted swapped memory data.

Signed-off-by: Xueqin Luo <luoxueqin@kylinos.cn>
---
 kernel/power/hibernate.c |  8 ++++++++
 kernel/power/swap.c      | 14 ++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

Comments

Pavel Machek May 1, 2024, 6:24 a.m. UTC | #1
> During the S4 resume process, there's a rare chance of image loading
> failure. We provide three retries to load it successfully. If the
> recovery fails after these attempts, print out the handle CRC32
> value. When the CRC32 values obtained on each retry are inconsistent,
> there may be a memory anomaly. When the values are consistent, it
> might indicate corrupted swapped memory data.
> 
> Signed-off-by: Xueqin Luo <luoxueqin@kylinos.cn>

NAK. If the machine is so broken that it can't load the memory image,
it is not safe to continue.

								Pavel
> ---
>  kernel/power/hibernate.c |  8 ++++++++
>  kernel/power/swap.c      | 14 ++++++++------
>  2 files changed, 16 insertions(+), 6 deletions(-)
> 
> diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
> index 43b1a82e800c..d9bcf38221ef 100644
> --- a/kernel/power/hibernate.c
> +++ b/kernel/power/hibernate.c
> @@ -703,6 +703,7 @@ static int load_image_and_restore(void)
>  {
>  	int error;
>  	unsigned int flags;
> +	int cnt = 0;
>  
>  	pm_pr_dbg("Loading hibernation image.\n");
>  
> @@ -713,7 +714,14 @@ static int load_image_and_restore(void)
>  		goto Unlock;
>  	}
>  
> +retry:
>  	error = swsusp_read(&flags);
> +	if (error && (cnt++ < 3)) {
> +		pr_err("Failed to load hibernation image, trying to load again...\n");
> +		swsusp_free();
> +		goto retry;
> +	}
> +
>  	swsusp_close();
>  	if (!error)
>  		error = hibernation_restore(flags & SF_PLATFORM_MODE);
> diff --git a/kernel/power/swap.c b/kernel/power/swap.c
> index 5bc04bfe2db1..4b866c645cd7 100644
> --- a/kernel/power/swap.c
> +++ b/kernel/power/swap.c
> @@ -1489,15 +1489,17 @@ static int load_compressed_image(struct swap_map_handle *handle,
>  		ret = snapshot_write_finalize(snapshot);
>  		if (!ret && !snapshot_image_loaded(snapshot))
>  			ret = -ENODATA;
> -		if (!ret) {
> -			if (swsusp_header->flags & SF_CRC32_MODE) {
> -				if(handle->crc32 != swsusp_header->crc32) {
> -					pr_err("Invalid image CRC32!\n");
> -					ret = -ENODATA;
> -				}
> +	}
> +	if (!ret) {
> +		if (swsusp_header->flags & SF_CRC32_MODE) {
> +			if (handle->crc32 != swsusp_header->crc32) {
> +				pr_err("Invalid image CRC32, swsusp header CRC32: %u, handle CRC32: %u\n",
> +					 swsusp_header->crc32, handle->crc32);
> +				ret = -ENODATA;
>  			}
>  		}
>  	}
> +
>  	swsusp_show_speed(start, stop, nr_to_read, "Read");
>  out_clean:
>  	hib_finish_batch(&hb);
> 2.25.1
XiongXin May 1, 2024, 7:43 a.m. UTC | #2
On 2024/5/1 14:24, Pavel Machek wrote:
>> During the S4 resume process, there's a rare chance of image loading
>> failure. We provide three retries to load it successfully. If the
>> recovery fails after these attempts, print out the handle CRC32
>> value. When the CRC32 values obtained on each retry are inconsistent,
>> there may be a memory anomaly. When the values are consistent, it
>> might indicate corrupted swapped memory data.
>>
>> Signed-off-by: Xueqin Luo <luoxueqin@kylinos.cn>
> 
> NAK. If the machine is so broken that it can't load the memory image,
> it is not safe to continue.
> 
> 								Pavel

Through a large number of tests, we found that when the CRC32 check
fails, a retry may allow the image to load successfully. Even if the
final loading attempt fails, we can quickly analyze the anomaly by
examining the CRC32 values. If the CRC32 values from the three attempts
are inconsistent, it may indicate a memory leakage issue; if they are
consistent, it may suggest that the data on the disk has changed.

>> ---
>>   kernel/power/hibernate.c |  8 ++++++++
>>   kernel/power/swap.c      | 14 ++++++++------
>>   2 files changed, 16 insertions(+), 6 deletions(-)
>>
>> diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
>> index 43b1a82e800c..d9bcf38221ef 100644
>> --- a/kernel/power/hibernate.c
>> +++ b/kernel/power/hibernate.c
>> @@ -703,6 +703,7 @@ static int load_image_and_restore(void)
>>   {
>>   	int error;
>>   	unsigned int flags;
>> +	int cnt = 0;
>>   
>>   	pm_pr_dbg("Loading hibernation image.\n");
>>   
>> @@ -713,7 +714,14 @@ static int load_image_and_restore(void)
>>   		goto Unlock;
>>   	}
>>   
>> +retry:
>>   	error = swsusp_read(&flags);
>> +	if (error && (cnt++ < 3)) {
>> +		pr_err("Failed to load hibernation image, trying to load again...\n");
>> +		swsusp_free();
>> +		goto retry;
>> +	}
>> +
>>   	swsusp_close();
>>   	if (!error)
>>   		error = hibernation_restore(flags & SF_PLATFORM_MODE);
>> diff --git a/kernel/power/swap.c b/kernel/power/swap.c
>> index 5bc04bfe2db1..4b866c645cd7 100644
>> --- a/kernel/power/swap.c
>> +++ b/kernel/power/swap.c
>> @@ -1489,15 +1489,17 @@ static int load_compressed_image(struct swap_map_handle *handle,
>>   		ret = snapshot_write_finalize(snapshot);
>>   		if (!ret && !snapshot_image_loaded(snapshot))
>>   			ret = -ENODATA;
>> -		if (!ret) {
>> -			if (swsusp_header->flags & SF_CRC32_MODE) {
>> -				if(handle->crc32 != swsusp_header->crc32) {
>> -					pr_err("Invalid image CRC32!\n");
>> -					ret = -ENODATA;
>> -				}
>> +	}
>> +	if (!ret) {
>> +		if (swsusp_header->flags & SF_CRC32_MODE) {
>> +			if (handle->crc32 != swsusp_header->crc32) {
>> +				pr_err("Invalid image CRC32, swsusp header CRC32: %u, handle CRC32: %u\n",
>> +					 swsusp_header->crc32, handle->crc32);
>> +				ret = -ENODATA;
>>   			}
>>   		}
>>   	}
>> +
>>   	swsusp_show_speed(start, stop, nr_to_read, "Read");
>>   out_clean:
>>   	hib_finish_batch(&hb);
>> 2.25.1
diff mbox series

Patch

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 43b1a82e800c..d9bcf38221ef 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -703,6 +703,7 @@  static int load_image_and_restore(void)
 {
 	int error;
 	unsigned int flags;
+	int cnt = 0;
 
 	pm_pr_dbg("Loading hibernation image.\n");
 
@@ -713,7 +714,14 @@  static int load_image_and_restore(void)
 		goto Unlock;
 	}
 
+retry:
 	error = swsusp_read(&flags);
+	if (error && (cnt++ < 3)) {
+		pr_err("Failed to load hibernation image, trying to load again...\n");
+		swsusp_free();
+		goto retry;
+	}
+
 	swsusp_close();
 	if (!error)
 		error = hibernation_restore(flags & SF_PLATFORM_MODE);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 5bc04bfe2db1..4b866c645cd7 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1489,15 +1489,17 @@  static int load_compressed_image(struct swap_map_handle *handle,
 		ret = snapshot_write_finalize(snapshot);
 		if (!ret && !snapshot_image_loaded(snapshot))
 			ret = -ENODATA;
-		if (!ret) {
-			if (swsusp_header->flags & SF_CRC32_MODE) {
-				if(handle->crc32 != swsusp_header->crc32) {
-					pr_err("Invalid image CRC32!\n");
-					ret = -ENODATA;
-				}
+	}
+	if (!ret) {
+		if (swsusp_header->flags & SF_CRC32_MODE) {
+			if (handle->crc32 != swsusp_header->crc32) {
+				pr_err("Invalid image CRC32, swsusp header CRC32: %u, handle CRC32: %u\n",
+					 swsusp_header->crc32, handle->crc32);
+				ret = -ENODATA;
 			}
 		}
 	}
+
 	swsusp_show_speed(start, stop, nr_to_read, "Read");
 out_clean:
 	hib_finish_batch(&hb);