diff mbox series

[v4,2/2] target/s390x: support SHA-512 extensions

Message ID 20220802190011.458871-3-Jason@zx2c4.com (mailing list archive)
State New, archived
Headers show
Series MSA EXT 5 for s390x | expand

Commit Message

Jason A. Donenfeld Aug. 2, 2022, 7 p.m. UTC
In order to fully support MSA_EXT_5, we have to also support the SHA-512
special instructions. So implement those.

The implementation began as something TweetNacl-like, and then was
adjusted to be useful here. It's not very beautiful, but it is quite
short and compact, which is what we're going for.

Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 target/s390x/gen-features.c      |   2 +
 target/s390x/tcg/crypto_helper.c | 116 +++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)

Comments

David Hildenbrand Aug. 3, 2022, 11:55 a.m. UTC | #1
On 02.08.22 21:00, Jason A. Donenfeld wrote:
> In order to fully support MSA_EXT_5, we have to also support the SHA-512
> special instructions. So implement those.
> 
> The implementation began as something TweetNacl-like, and then was
> adjusted to be useful here. It's not very beautiful, but it is quite
> short and compact, which is what we're going for.
> 

Do we have to worry about copyright/authorship of the original code or
did you write that from scratch?

[...]

I cannot really comment on the actual math, so I'll point out some code
style thingies.

> +static void kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
> +                        uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
> +{
> +    uint64_t z[8], b[8], a[8], w[16], t;
> +    int i, j;
> +
> +    for (i = 0; i < 8; ++i)
> +        z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);

Please always use curly brackets in QEMU for code blocks, they are
mandatory.

> +
> +    while (*len_reg >= 128) {
> +        for (i = 0; i < 16; ++i) {

i++, also for all cases below.

> +            if (message_reg)
> +                w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, *message_reg + 8 * i), ra);
> +            else
> +                w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
> +        }
> +
> +        for (i = 0; i < 80; ++i) {
> +            for (j = 0; j < 8; ++j)
> +                b[j] = a[j];
> +            t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
> +            b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
> +            b[3] += t;
> +            for (j = 0; j < 8; ++j)
> +                a[(j + 1) % 8] = b[j];
> +            if (i % 16 == 15) {
> +                for (j = 0; j < 16; ++j)
> +                    w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) +
> +                            sigma1(w[(j + 14) % 16]);
> +            }
> +        }
> +
> +        for (i = 0; i < 8; ++i) {
> +            a[i] += z[i];
> +            z[i] = a[i];
> +        }
> +
> +        if (message_reg)
> +            *message_reg += 128;
> +        else
> +            stack_buffer += 128;
> +        *len_reg -= 128;
> +    }
> +
> +    for (i = 0; i < 8; ++i)
> +        cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
> +}
> +
> +static void klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
> +                        uint64_t *message_reg, uint64_t *len_reg)
> +{
> +    uint8_t x[256];
> +    uint64_t i;
> +    int j;
> +
> +    kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
> +    for (i = 0; i < *len_reg; ++i)
> +        x[i] = cpu_ldub_data_ra(env, wrap_address(env, *message_reg + i), ra);
> +    *message_reg += *len_reg;
> +    *len_reg = 0;
> +    memset(x + i, 0, sizeof(x) - i);
> +    x[i] = 128;
> +    i = i < 112 ? 128 : 256;
> +    for (j = 0; j < 16; ++j)
> +        x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
> +    kimd_sha512(env, ra, parameter_block, NULL, &i, x);
> +}

Are we properly handling the length register (r2 + 1) in the
24-bit/31-bit addressing mode?

Similarly, are we properly handling updates to the message register (r2)
depending on the addressing mode?


It's worth noting that we might want to implement (also for PRNO-TRNG):

"The operation is ended when all
source bytes in the second operand have been pro-
cessed (called normal completion), or when a CPU-
determined number of blocks that is less than the
length of the second operand have been processed
(called partial completion). The CPU-determined
number of blocks depends on the model, and may be
a different number each time the instruction is exe-
cuted. The CPU-determined number of blocks is usu-
ally nonzero. In certain unusual situations, this
number may be zero, and condition code 3 may be
set with no progress."

Otherwise, a large length can make us loop quite a while in QEMU,
without the chance to deliver any other interrupts.
Jason A. Donenfeld Aug. 3, 2022, 12:14 p.m. UTC | #2
Hi David,

On Wed, Aug 03, 2022 at 01:55:21PM +0200, David Hildenbrand wrote:
> On 02.08.22 21:00, Jason A. Donenfeld wrote:
> > In order to fully support MSA_EXT_5, we have to also support the SHA-512
> > special instructions. So implement those.
> > 
> > The implementation began as something TweetNacl-like, and then was
> > adjusted to be useful here. It's not very beautiful, but it is quite
> > short and compact, which is what we're going for.
> > 
> 
> Do we have to worry about copyright/authorship of the original code or
> did you write that from scratch?

I actually don't really remember how much of that is leftover from
tweetnacl and how much I've rewritten - I've had some variant of this
code or another kicking around in various projects and repos for a long
time. But the tweetnacl stuff is public domain to begin with, so all
good.

> Are we properly handling the length register (r2 + 1) in the
> 24-bit/31-bit addressing mode?
> Similarly, are we properly handling updates to the message register (r2)
> depending on the addressing mode?

Ugh, probably not... I didn't do any of the deposit_64 stuff. I guess
I'll look into that.

> It's worth noting that we might want to implement (also for PRNO-TRNG):
> 
> "The operation is ended when all
> source bytes in the second operand have been pro-
> cessed (called normal completion), or when a CPU-
> determined number of blocks that is less than the
> length of the second operand have been processed
> (called partial completion). The CPU-determined
> number of blocks depends on the model, and may be
> a different number each time the instruction is exe-
> cuted. The CPU-determined number of blocks is usu-
> ally nonzero. In certain unusual situations, this
> number may be zero, and condition code 3 may be
> set with no progress."
> 
> Otherwise, a large length can make us loop quite a while in QEMU,
> without the chance to deliver any other interrupts.

Hmm, okay. Looking at the Linux code, I see:

        s.even = (unsigned long)src;
        s.odd  = (unsigned long)src_len;
        asm volatile(
                "       lgr     0,%[fc]\n"
                "       lgr     1,%[pba]\n"
                "0:     .insn   rre,%[opc] << 16,0,%[src]\n"
                "       brc     1,0b\n" /* handle partial completion */
                : [src] "+&d" (s.pair)
                : [fc] "d" (func), [pba] "d" ((unsigned long)(param)),
                  [opc] "i" (CPACF_KIMD)
                : "cc", "memory", "0", "1");

So I guess that means it'll just loop until it's done? Or do I need to
return "1" from HELPER(msa)?

Jason
Jason A. Donenfeld Aug. 3, 2022, 12:47 p.m. UTC | #3
On Wed, Aug 03, 2022 at 02:14:58PM +0200, Jason A. Donenfeld wrote:
>         s.even = (unsigned long)src;
>         s.odd  = (unsigned long)src_len;
>         asm volatile(
>                 "       lgr     0,%[fc]\n"
>                 "       lgr     1,%[pba]\n"
>                 "0:     .insn   rre,%[opc] << 16,0,%[src]\n"
>                 "       brc     1,0b\n" /* handle partial completion */
>                 : [src] "+&d" (s.pair)
>                 : [fc] "d" (func), [pba] "d" ((unsigned long)(param)),
>                   [opc] "i" (CPACF_KIMD)
>                 : "cc", "memory", "0", "1");
> 
> So I guess that means it'll just loop until it's done? Or do I need to
> return "1" from HELPER(msa)?

Looks like returning 3 did the trick. v5 incoming...

Jason
Harald Freudenberger Aug. 4, 2022, 6:51 a.m. UTC | #4
On 2022-08-03 14:14, Jason A. Donenfeld wrote:
> Hi David,
> 
> On Wed, Aug 03, 2022 at 01:55:21PM +0200, David Hildenbrand wrote:
>> On 02.08.22 21:00, Jason A. Donenfeld wrote:
>> > In order to fully support MSA_EXT_5, we have to also support the SHA-512
>> > special instructions. So implement those.
>> >
>> > The implementation began as something TweetNacl-like, and then was
>> > adjusted to be useful here. It's not very beautiful, but it is quite
>> > short and compact, which is what we're going for.
>> >
>> 
>> Do we have to worry about copyright/authorship of the original code or
>> did you write that from scratch?
> 
> I actually don't really remember how much of that is leftover from
> tweetnacl and how much I've rewritten - I've had some variant of this
> code or another kicking around in various projects and repos for a long
> time. But the tweetnacl stuff is public domain to begin with, so all
> good.
> 
>> Are we properly handling the length register (r2 + 1) in the
>> 24-bit/31-bit addressing mode?
>> Similarly, are we properly handling updates to the message register 
>> (r2)
>> depending on the addressing mode?
> 
> Ugh, probably not... I didn't do any of the deposit_64 stuff. I guess
> I'll look into that.
> 
>> It's worth noting that we might want to implement (also for 
>> PRNO-TRNG):
>> 
>> "The operation is ended when all
>> source bytes in the second operand have been pro-
>> cessed (called normal completion), or when a CPU-
>> determined number of blocks that is less than the
>> length of the second operand have been processed
>> (called partial completion). The CPU-determined
>> number of blocks depends on the model, and may be
>> a different number each time the instruction is exe-
>> cuted. The CPU-determined number of blocks is usu-
>> ally nonzero. In certain unusual situations, this
>> number may be zero, and condition code 3 may be
>> set with no progress."
>> 
>> Otherwise, a large length can make us loop quite a while in QEMU,
>> without the chance to deliver any other interrupts.
> 
> Hmm, okay. Looking at the Linux code, I see:
> 
>         s.even = (unsigned long)src;
>         s.odd  = (unsigned long)src_len;
>         asm volatile(
>                 "       lgr     0,%[fc]\n"
>                 "       lgr     1,%[pba]\n"
>                 "0:     .insn   rre,%[opc] << 16,0,%[src]\n"
>                 "       brc     1,0b\n" /* handle partial completion */
>                 : [src] "+&d" (s.pair)
>                 : [fc] "d" (func), [pba] "d" ((unsigned long)(param)),
>                   [opc] "i" (CPACF_KIMD)
>                 : "cc", "memory", "0", "1");
> 
> So I guess that means it'll just loop until it's done? Or do I need to
> return "1" from HELPER(msa)?
> 
> Jason

Hm, you don't really want to implement some kind of particial complete.
Qemu is an emulation and you would have to implement some kind of
fragmenting this based on machine generation. For my feeling this is
way too overengineered. Btw. as there came the request to handle
the 24-bit/31-bit addressing correctly. Is Qemu 32 bit supported ?
Christian Borntraeger Aug. 4, 2022, 6:56 a.m. UTC | #5
Am 04.08.22 um 08:51 schrieb Harald Freudenberger:
> On 2022-08-03 14:14, Jason A. Donenfeld wrote:
>> Hi David,
>>
>> On Wed, Aug 03, 2022 at 01:55:21PM +0200, David Hildenbrand wrote:
>>> On 02.08.22 21:00, Jason A. Donenfeld wrote:
>>> > In order to fully support MSA_EXT_5, we have to also support the SHA-512
>>> > special instructions. So implement those.
>>> >
>>> > The implementation began as something TweetNacl-like, and then was
>>> > adjusted to be useful here. It's not very beautiful, but it is quite
>>> > short and compact, which is what we're going for.
>>> >
>>>
>>> Do we have to worry about copyright/authorship of the original code or
>>> did you write that from scratch?
>>
>> I actually don't really remember how much of that is leftover from
>> tweetnacl and how much I've rewritten - I've had some variant of this
>> code or another kicking around in various projects and repos for a long
>> time. But the tweetnacl stuff is public domain to begin with, so all
>> good.
>>
>>> Are we properly handling the length register (r2 + 1) in the
>>> 24-bit/31-bit addressing mode?
>>> Similarly, are we properly handling updates to the message register (r2)
>>> depending on the addressing mode?
>>
>> Ugh, probably not... I didn't do any of the deposit_64 stuff. I guess
>> I'll look into that.
>>
>>> It's worth noting that we might want to implement (also for PRNO-TRNG):
>>>
>>> "The operation is ended when all
>>> source bytes in the second operand have been pro-
>>> cessed (called normal completion), or when a CPU-
>>> determined number of blocks that is less than the
>>> length of the second operand have been processed
>>> (called partial completion). The CPU-determined
>>> number of blocks depends on the model, and may be
>>> a different number each time the instruction is exe-
>>> cuted. The CPU-determined number of blocks is usu-
>>> ally nonzero. In certain unusual situations, this
>>> number may be zero, and condition code 3 may be
>>> set with no progress."
>>>
>>> Otherwise, a large length can make us loop quite a while in QEMU,
>>> without the chance to deliver any other interrupts.
>>
>> Hmm, okay. Looking at the Linux code, I see:
>>
>>         s.even = (unsigned long)src;
>>         s.odd  = (unsigned long)src_len;
>>         asm volatile(
>>                 "       lgr     0,%[fc]\n"
>>                 "       lgr     1,%[pba]\n"
>>                 "0:     .insn   rre,%[opc] << 16,0,%[src]\n"
>>                 "       brc     1,0b\n" /* handle partial completion */
>>                 : [src] "+&d" (s.pair)
>>                 : [fc] "d" (func), [pba] "d" ((unsigned long)(param)),
>>                   [opc] "i" (CPACF_KIMD)
>>                 : "cc", "memory", "0", "1");
>>
>> So I guess that means it'll just loop until it's done? Or do I need to
>> return "1" from HELPER(msa)?
>>
>> Jason
> 
> Hm, you don't really want to implement some kind of particial complete.
> Qemu is an emulation and you would have to implement some kind of
> fragmenting this based on machine generation. For my feeling this is
> way too overengineered. Btw. as there came the request to handle
> the 24-bit/31-bit addressing correctly. Is Qemu 32 bit supported ?

We do not support the esa390 mode, but the 24/31 bit _addressing_ modes are
totally valid to be used in zarch mode (with sam31 for example). The kernel
does that for example for some diagnoses under z/VM.
Nobody in problem state should probably do that, but its possible.
David Hildenbrand Aug. 4, 2022, 8:10 a.m. UTC | #6
On 04.08.22 08:51, Harald Freudenberger wrote:
> On 2022-08-03 14:14, Jason A. Donenfeld wrote:
>> Hi David,
>>
>> On Wed, Aug 03, 2022 at 01:55:21PM +0200, David Hildenbrand wrote:
>>> On 02.08.22 21:00, Jason A. Donenfeld wrote:
>>>> In order to fully support MSA_EXT_5, we have to also support the SHA-512
>>>> special instructions. So implement those.
>>>>
>>>> The implementation began as something TweetNacl-like, and then was
>>>> adjusted to be useful here. It's not very beautiful, but it is quite
>>>> short and compact, which is what we're going for.
>>>>
>>>
>>> Do we have to worry about copyright/authorship of the original code or
>>> did you write that from scratch?
>>
>> I actually don't really remember how much of that is leftover from
>> tweetnacl and how much I've rewritten - I've had some variant of this
>> code or another kicking around in various projects and repos for a long
>> time. But the tweetnacl stuff is public domain to begin with, so all
>> good.
>>
>>> Are we properly handling the length register (r2 + 1) in the
>>> 24-bit/31-bit addressing mode?
>>> Similarly, are we properly handling updates to the message register 
>>> (r2)
>>> depending on the addressing mode?
>>
>> Ugh, probably not... I didn't do any of the deposit_64 stuff. I guess
>> I'll look into that.
>>
>>> It's worth noting that we might want to implement (also for 
>>> PRNO-TRNG):
>>>
>>> "The operation is ended when all
>>> source bytes in the second operand have been pro-
>>> cessed (called normal completion), or when a CPU-
>>> determined number of blocks that is less than the
>>> length of the second operand have been processed
>>> (called partial completion). The CPU-determined
>>> number of blocks depends on the model, and may be
>>> a different number each time the instruction is exe-
>>> cuted. The CPU-determined number of blocks is usu-
>>> ally nonzero. In certain unusual situations, this
>>> number may be zero, and condition code 3 may be
>>> set with no progress."
>>>
>>> Otherwise, a large length can make us loop quite a while in QEMU,
>>> without the chance to deliver any other interrupts.
>>
>> Hmm, okay. Looking at the Linux code, I see:
>>
>>         s.even = (unsigned long)src;
>>         s.odd  = (unsigned long)src_len;
>>         asm volatile(
>>                 "       lgr     0,%[fc]\n"
>>                 "       lgr     1,%[pba]\n"
>>                 "0:     .insn   rre,%[opc] << 16,0,%[src]\n"
>>                 "       brc     1,0b\n" /* handle partial completion */
>>                 : [src] "+&d" (s.pair)
>>                 : [fc] "d" (func), [pba] "d" ((unsigned long)(param)),
>>                   [opc] "i" (CPACF_KIMD)
>>                 : "cc", "memory", "0", "1");
>>
>> So I guess that means it'll just loop until it's done? Or do I need to
>> return "1" from HELPER(msa)?
>>
>> Jason
> 
> Hm, you don't really want to implement some kind of particial complete.
> Qemu is an emulation and you would have to implement some kind of
> fragmenting this based on machine generation.

Do we?

"The
CPU-determined number of bytes depends on the
model, and may be a different number each time the
instruction is executed. The CPU-determined number
of bytes is usually nonzero. In certain unusual situa-
tions, this number may be zero, and condition code 3
may be set with no progress. However, the CPU pro-
tects against endless recurrence of this no-progress
case.
"

I read that as "do what you want, even on a given model it might be random."
Jason A. Donenfeld Aug. 4, 2022, 12:07 p.m. UTC | #7
Hi,

On Thu, Aug 04, 2022 at 10:10:52AM +0200, David Hildenbrand wrote:
> > Hm, you don't really want to implement some kind of particial complete.
> > Qemu is an emulation and you would have to implement some kind of
> > fragmenting this based on machine generation.
> 
> Do we?
> 
> "The
> CPU-determined number of bytes depends on the
> model, and may be a different number each time the
> instruction is executed. The CPU-determined number
> of bytes is usually nonzero. In certain unusual situa-
> tions, this number may be zero, and condition code 3
> may be set with no progress. However, the CPU pro-
> tects against endless recurrence of this no-progress
> case.
> "
> 
> I read that as "do what you want, even on a given model it might be random."

Just FYI, I implemented this, and it works in v6. Please take a look at:
https://lore.kernel.org/qemu-devel/20220803171536.1314717-2-Jason@zx2c4.com/

So we can keep that. Or I can send a v7 that removes it.

It wasn't very hard to implement, and it's not very hard to remove, so
either way, just tell me what you want to do.

Jason
Jason A. Donenfeld Aug. 4, 2022, 12:09 p.m. UTC | #8
Hi,

On Thu, Aug 04, 2022 at 08:56:19AM +0200, Christian Borntraeger wrote:
> We do not support the esa390 mode, but the 24/31 bit _addressing_ modes are
> totally valid to be used in zarch mode (with sam31 for example). The kernel
> does that for example for some diagnoses under z/VM.
> Nobody in problem state should probably do that, but its possible.

v6 of this series handles 24/31:

https://lore.kernel.org/qemu-devel/20220803171536.1314717-1-Jason@zx2c4.com/ [unchanged for a while now]
https://lore.kernel.org/qemu-devel/20220803171536.1314717-2-Jason@zx2c4.com/ [the new sha512 thing]

Jason
diff mbox series

Patch

diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 3d333e2789..b6d804fa6d 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -751,6 +751,8 @@  static uint16_t qemu_MAX[] = {
     S390_FEAT_VECTOR_ENH2,
     S390_FEAT_MSA_EXT_5,
     S390_FEAT_PRNO_TRNG,
+    S390_FEAT_KIMD_SHA_512,
+    S390_FEAT_KLMD_SHA_512,
 };
 
 /****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 8ad4ef1ace..475627aa83 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -19,6 +19,112 @@ 
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
 
+static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); }
+static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); }
+static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); }
+static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); }
+static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); }
+static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); }
+static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); }
+
+static const uint64_t K[80] = {
+    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+    0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+    0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+    0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+    0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+    0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+    0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+    0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+    0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+    0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+    0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+    0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+    0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+    0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+static void kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+                        uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
+{
+    uint64_t z[8], b[8], a[8], w[16], t;
+    int i, j;
+
+    for (i = 0; i < 8; ++i)
+        z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);
+
+    while (*len_reg >= 128) {
+        for (i = 0; i < 16; ++i) {
+            if (message_reg)
+                w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, *message_reg + 8 * i), ra);
+            else
+                w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
+        }
+
+        for (i = 0; i < 80; ++i) {
+            for (j = 0; j < 8; ++j)
+                b[j] = a[j];
+            t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
+            b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
+            b[3] += t;
+            for (j = 0; j < 8; ++j)
+                a[(j + 1) % 8] = b[j];
+            if (i % 16 == 15) {
+                for (j = 0; j < 16; ++j)
+                    w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) +
+                            sigma1(w[(j + 14) % 16]);
+            }
+        }
+
+        for (i = 0; i < 8; ++i) {
+            a[i] += z[i];
+            z[i] = a[i];
+        }
+
+        if (message_reg)
+            *message_reg += 128;
+        else
+            stack_buffer += 128;
+        *len_reg -= 128;
+    }
+
+    for (i = 0; i < 8; ++i)
+        cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
+}
+
+static void klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+                        uint64_t *message_reg, uint64_t *len_reg)
+{
+    uint8_t x[256];
+    uint64_t i;
+    int j;
+
+    kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
+    for (i = 0; i < *len_reg; ++i)
+        x[i] = cpu_ldub_data_ra(env, wrap_address(env, *message_reg + i), ra);
+    *message_reg += *len_reg;
+    *len_reg = 0;
+    memset(x + i, 0, sizeof(x) - i);
+    x[i] = 128;
+    i = i < 112 ? 128 : 256;
+    for (j = 0; j < 16; ++j)
+        x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
+    kimd_sha512(env, ra, parameter_block, NULL, &i, x);
+}
+
 static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
                             uint64_t *buf_reg, uint64_t *len_reg)
 {
@@ -78,6 +184,16 @@  uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
             cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
         }
         break;
+    case 3: /* CPACF_*_SHA_512 */
+        switch (type) {
+        case S390_FEAT_TYPE_KIMD:
+            kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL);
+            break;
+        case S390_FEAT_TYPE_KLMD:
+            klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
+            break;
+        }
+        break;
     case 114: /* CPACF_PRNO_TRNG */
         fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
         fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);