Message ID | 1518122886.21828.20.camel@HansenPartnership.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
I'll prepare a patch set with tpm_buf change and this and send it ASAP. /Jarkko On Thu, Feb 08, 2018 at 12:48:06PM -0800, James Bottomley wrote: > From 8f147e6a9a8e08433d6c1836afbf030598d26e9e Mon Sep 17 00:00:00 2001 > From: James Bottomley <James.Bottomley@HansenPartnership.com> > Date: Thu, 8 Feb 2018 12:33:47 -0800 > Subject: [PATCH] tpm: fix intermittent failure with self tests > > Ever since > > commit 2482b1bba5122b1d5516c909832bdd282015b8e9 > Author: Alexander Steffen <Alexander.Steffen@infineon.com> > Date: Thu Aug 31 19:18:56 2017 +0200 > > tpm: Trigger only missing TPM 2.0 self tests > > My Nuvoton 6xx in a Dell XPS-13 has been intermittently failing to > work (necessitating a reboot). The problem seems to be that the TPM > gets into a state where the partial self-test doesn't return > TPM_RC_SUCCESS (meaning all tests have run to completion), but instead > returns TPM_RC_TESTING (meaning some tests are still running in the > background). There are various theories that resending the self-test > command actually causes the tests to restart and thus triggers more > TPM_RC_TESTING returns until the timeout is exceeded. > > There are several issues here: firstly being we shouldn't slow down > the boot sequence waiting for the self test to complete once the TPM > backgrounds them. It will actually make available all functions that > have passed and if it gets a failure return TPM_RC_FAILURE to every > subsequent command. So the fix is to kick off self tests once and if > they return TPM_RC_TESTING log that as a backgrounded self test and > continue on. In order to prevent userspace from seeing any > TPM_RC_TESTING returns (which it might if userspace sends a command > that needs a TPM subsystem which is still under test), we loop in > tpm_transmit_cmd until either a timeout or we don't get a > TPM_RC_TESTING return. > > Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com> > Fixes: 2482b1bba5122b1d5516c909832bdd282015b8e9 > --- > drivers/char/tpm/tpm-interface.c | 28 ++++++++++++++++++++++++---- > drivers/char/tpm/tpm.h | 1 + > drivers/char/tpm/tpm2-cmd.c | 36 ++++++++++++++++-------------------- > 3 files changed, 41 insertions(+), 24 deletions(-) > > diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c > index 3cec403a80b3..c3e6d0248af8 100644 > --- a/drivers/char/tpm/tpm-interface.c > +++ b/drivers/char/tpm/tpm-interface.c > @@ -521,12 +521,32 @@ ssize_t tpm_transmit_cmd(struct tpm_chip *chip, struct tpm_space *space, > const struct tpm_output_header *header = buf; > int err; > ssize_t len; > + unsigned int delay_msec = 20; > > - len = tpm_transmit(chip, space, (u8 *)buf, bufsiz, flags); > - if (len < 0) > - return len; > + /* > + * on first probe we kick off a TPM self test in the > + * background This means the TPM may return RC_TESTING to any > + * command that tries to use a subsystem under test, so do an > + * exponential backoff wait if that happens > + */ > + for (;;) { > + len = tpm_transmit(chip, space, (u8 *)buf, bufsiz, flags); > + if (len < 0) > + return len; > + > + err = be32_to_cpu(header->return_code); > + if (err != TPM2_RC_TESTING || > + (flags & TPM_TRANSMIT_NOWAIT)) > + break; > + > + delay_msec *= 2; > + if (delay_msec > TPM2_DURATION_LONG) { > + dev_err(&chip->dev,"TPM: still running self tests, giving up waiting\n"); > + break; > + } > + tpm_msleep(delay_msec); > + } > > - err = be32_to_cpu(header->return_code); > if (err != 0 && desc) > dev_err(&chip->dev, "A TPM error (%d) occurred %s\n", err, > desc); > diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h > index 528cffbd49d3..47c5a5206325 100644 > --- a/drivers/char/tpm/tpm.h > +++ b/drivers/char/tpm/tpm.h > @@ -495,6 +495,7 @@ extern struct idr dev_nums_idr; > enum tpm_transmit_flags { > TPM_TRANSMIT_UNLOCKED = BIT(0), > TPM_TRANSMIT_RAW = BIT(1), > + TPM_TRANSMIT_NOWAIT = BIT(2), > }; > > ssize_t tpm_transmit(struct tpm_chip *chip, struct tpm_space *space, > diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c > index f6be08483ae6..169232179182 100644 > --- a/drivers/char/tpm/tpm2-cmd.c > +++ b/drivers/char/tpm/tpm2-cmd.c > @@ -853,28 +853,24 @@ static const struct tpm_input_header tpm2_selftest_header = { > static int tpm2_do_selftest(struct tpm_chip *chip) > { > int rc; > - unsigned int delay_msec = 20; > - long duration; > struct tpm2_cmd cmd; > > - duration = jiffies_to_msecs( > - tpm2_calc_ordinal_duration(chip, TPM2_CC_SELF_TEST)); > - > - while (duration > 0) { > - cmd.header.in = tpm2_selftest_header; > - cmd.params.selftest_in.full_test = 0; > - > - rc = tpm_transmit_cmd(chip, NULL, &cmd, TPM2_SELF_TEST_IN_SIZE, > - 0, 0, "continue selftest"); > - > - if (rc != TPM2_RC_TESTING) > - break; > - > - tpm_msleep(delay_msec); > - duration -= delay_msec; > - > - /* wait longer the next round */ > - delay_msec *= 2; > + cmd.header.in = tpm2_selftest_header; > + cmd.params.selftest_in.full_test = 0; > + > + rc = tpm_transmit_cmd(chip, NULL, &cmd, TPM2_SELF_TEST_IN_SIZE, > + 0, TPM_TRANSMIT_NOWAIT, "continue selftest"); > + > + if (rc == TPM2_RC_TESTING) { > + /* > + * A return of RC_TESTING means the TPM is still > + * running self tests. If one fails it will go into > + * failure mode and return RC_FAILED to every command, > + * so treat a still in testing return as a success > + * rather than causing a driver detach. > + */ > + dev_info(&chip->dev,"TPM: Running self test in background\n"); > + rc = TPM2_RC_SUCCESS; > } > > return rc; > -- > 2.12.3
Hi See the comments below and please include this as a prequel patch for the next version: https://patchwork.kernel.org/patch/10208965/ On Thu, Feb 08, 2018 at 12:48:06PM -0800, James Bottomley wrote: > From 8f147e6a9a8e08433d6c1836afbf030598d26e9e Mon Sep 17 00:00:00 2001 > From: James Bottomley <James.Bottomley@HansenPartnership.com> > Date: Thu, 8 Feb 2018 12:33:47 -0800 > Subject: [PATCH] tpm: fix intermittent failure with self tests > > Ever since > > commit 2482b1bba5122b1d5516c909832bdd282015b8e9 > Author: Alexander Steffen <Alexander.Steffen@infineon.com> > Date: Thu Aug 31 19:18:56 2017 +0200 > > tpm: Trigger only missing TPM 2.0 self tests > > My Nuvoton 6xx in a Dell XPS-13 has been intermittently failing to > work (necessitating a reboot). The problem seems to be that the TPM > gets into a state where the partial self-test doesn't return > TPM_RC_SUCCESS (meaning all tests have run to completion), but instead > returns TPM_RC_TESTING (meaning some tests are still running in the > background). There are various theories that resending the self-test > command actually causes the tests to restart and thus triggers more > TPM_RC_TESTING returns until the timeout is exceeded. > > There are several issues here: firstly being we shouldn't slow down > the boot sequence waiting for the self test to complete once the TPM > backgrounds them. It will actually make available all functions that > have passed and if it gets a failure return TPM_RC_FAILURE to every > subsequent command. So the fix is to kick off self tests once and if > they return TPM_RC_TESTING log that as a backgrounded self test and > continue on. In order to prevent userspace from seeing any > TPM_RC_TESTING returns (which it might if userspace sends a command > that needs a TPM subsystem which is still under test), we loop in > tpm_transmit_cmd until either a timeout or we don't get a > TPM_RC_TESTING return. > > Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com> > Fixes: 2482b1bba5122b1d5516c909832bdd282015b8e9 > --- > drivers/char/tpm/tpm-interface.c | 28 ++++++++++++++++++++++++---- > drivers/char/tpm/tpm.h | 1 + > drivers/char/tpm/tpm2-cmd.c | 36 ++++++++++++++++-------------------- > 3 files changed, 41 insertions(+), 24 deletions(-) > > diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c > index 3cec403a80b3..c3e6d0248af8 100644 > --- a/drivers/char/tpm/tpm-interface.c > +++ b/drivers/char/tpm/tpm-interface.c > @@ -521,12 +521,32 @@ ssize_t tpm_transmit_cmd(struct tpm_chip *chip, struct tpm_space *space, > const struct tpm_output_header *header = buf; > int err; > ssize_t len; > + unsigned int delay_msec = 20; > > - len = tpm_transmit(chip, space, (u8 *)buf, bufsiz, flags); > - if (len < 0) > - return len; > + /* > + * on first probe we kick off a TPM self test in the > + * background This means the TPM may return RC_TESTING to any > + * command that tries to use a subsystem under test, so do an > + * exponential backoff wait if that happens > + */ > + for (;;) { > + len = tpm_transmit(chip, space, (u8 *)buf, bufsiz, flags); > + if (len < 0) > + return len; > + > + err = be32_to_cpu(header->return_code); > + if (err != TPM2_RC_TESTING || > + (flags & TPM_TRANSMIT_NOWAIT)) > + break; > + > + delay_msec *= 2; > + if (delay_msec > TPM2_DURATION_LONG) { > + dev_err(&chip->dev,"TPM: still running self tests, giving up waiting\n"); > + break; > + } > + tpm_msleep(delay_msec); > + } Please have a helper function for this. > > - err = be32_to_cpu(header->return_code); > if (err != 0 && desc) > dev_err(&chip->dev, "A TPM error (%d) occurred %s\n", err, > desc); > diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h > index 528cffbd49d3..47c5a5206325 100644 > --- a/drivers/char/tpm/tpm.h > +++ b/drivers/char/tpm/tpm.h > @@ -495,6 +495,7 @@ extern struct idr dev_nums_idr; > enum tpm_transmit_flags { > TPM_TRANSMIT_UNLOCKED = BIT(0), > TPM_TRANSMIT_RAW = BIT(1), > + TPM_TRANSMIT_NOWAIT = BIT(2), Note to myself: TPM_TRANSMIT_RAW is a retarded name that does not mean anything. That should be renamed simply as TPM_TRANSMIT_IGNORE_LOCALITY. It is really hard to interpret what TPM_TRANSMIT_NOWAIT means. Please just name it as TPM_TRANSMIT_IGNORE_TESTING or propose something better. I think that name makes it obvious what the flag means. > }; > > ssize_t tpm_transmit(struct tpm_chip *chip, struct tpm_space *space, > diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c > index f6be08483ae6..169232179182 100644 > --- a/drivers/char/tpm/tpm2-cmd.c > +++ b/drivers/char/tpm/tpm2-cmd.c > @@ -853,28 +853,24 @@ static const struct tpm_input_header tpm2_selftest_header = { > static int tpm2_do_selftest(struct tpm_chip *chip) > { > int rc; > - unsigned int delay_msec = 20; > - long duration; > struct tpm2_cmd cmd; > > - duration = jiffies_to_msecs( > - tpm2_calc_ordinal_duration(chip, TPM2_CC_SELF_TEST)); > - > - while (duration > 0) { > - cmd.header.in = tpm2_selftest_header; > - cmd.params.selftest_in.full_test = 0; > - > - rc = tpm_transmit_cmd(chip, NULL, &cmd, TPM2_SELF_TEST_IN_SIZE, > - 0, 0, "continue selftest"); > - > - if (rc != TPM2_RC_TESTING) > - break; > - > - tpm_msleep(delay_msec); > - duration -= delay_msec; > - > - /* wait longer the next round */ > - delay_msec *= 2; > + cmd.header.in = tpm2_selftest_header; > + cmd.params.selftest_in.full_test = 0; > + > + rc = tpm_transmit_cmd(chip, NULL, &cmd, TPM2_SELF_TEST_IN_SIZE, > + 0, TPM_TRANSMIT_NOWAIT, "continue selftest"); > + > + if (rc == TPM2_RC_TESTING) { > + /* > + * A return of RC_TESTING means the TPM is still > + * running self tests. If one fails it will go into > + * failure mode and return RC_FAILED to every command, > + * so treat a still in testing return as a success > + * rather than causing a driver detach. > + */ > + dev_info(&chip->dev,"TPM: Running self test in background\n"); > + rc = TPM2_RC_SUCCESS; > } > > return rc; I'm not sure how tpm_write() call path is handled. /Jarkko
On Fri, 2018-02-16 at 10:34 +0200, Jarkko Sakkinen wrote: > Hi > > See the comments below and please include this as a prequel patch for > the next version: > > https://patchwork.kernel.org/patch/10208965/ Actually, I've got another curious bit of behaviour from my Nuvoton: After an experiment with primary EK generation that sent it into failure mode (connected with something else, nothing to do with this), it's taken to returning 232 (TPM_RC_COMMAND_CODE) to a partial self test periodically on boot. According to the manual this is impossible, so I think it's something to do with tpm_validate_command. I think, perhaps, we shouldn't call the getcaps for the command codes until the self test is over, but I need to do more debugging to track down what's going on. I've also got other reports of TPM_RC_COMMAND_CODE failures during self test, so this isn't just my screwed up chip. James
On Fri, 2018-02-16 at 10:17 -0800, James Bottomley wrote: > On Fri, 2018-02-16 at 10:34 +0200, Jarkko Sakkinen wrote: > > > > Hi > > > > See the comments below and please include this as a prequel patch > > for the next version: > > > > https://patchwork.kernel.org/patch/10208965/ > > Actually, I've got another curious bit of behaviour from my Nuvoton: > After an experiment with primary EK generation that sent it into > failure mode (connected with something else, nothing to do with > this), it's taken to returning 232 (TPM_RC_COMMAND_CODE) to a partial > self test periodically on boot. According to the manual this is > impossible, so I think it's something to do with > tpm_validate_command. I think, perhaps, we shouldn't call the > getcaps for the command codes until the self test is over, but I need > to do more debugging to track down what's going on. > > I've also got other reports of TPM_RC_COMMAND_CODE failures during > self test, so this isn't just my screwed up chip. This isn't anything to do with us synthesizing TPM_RC_COMMAND_CODE, the status is definitely coming from the chip. If I run a full self test immediately after this, everything works as normal. My instinct from this is to take any return from the partial self tests except TPM_RC_SUCCESS, TPM_RC_TESTING or TPM_RC_FAILURE as an indication we should run a full self test, so I'll code that up. Can anyone from the manufacturers shed any light on the TPM_RC_COMMAND_CODE return from a partial self test? James
On 16.02.2018 19:59, James Bottomley wrote: > On Fri, 2018-02-16 at 10:17 -0800, James Bottomley wrote: >> On Fri, 2018-02-16 at 10:34 +0200, Jarkko Sakkinen wrote: >>> >>> Hi >>> >>> See the comments below and please include this as a prequel patch >>> for the next version: >>> >>> https://patchwork.kernel.org/patch/10208965/ >> >> Actually, I've got another curious bit of behaviour from my Nuvoton: >> After an experiment with primary EK generation that sent it into >> failure mode (connected with something else, nothing to do with >> this), it's taken to returning 232 (TPM_RC_COMMAND_CODE) to a partial >> self test periodically on boot. According to the manual this is >> impossible, so I think it's something to do with >> tpm_validate_command. I think, perhaps, we shouldn't call the >> getcaps for the command codes until the self test is over, but I need >> to do more debugging to track down what's going on. >> >> I've also got other reports of TPM_RC_COMMAND_CODE failures during >> self test, so this isn't just my screwed up chip. > > This isn't anything to do with us synthesizing TPM_RC_COMMAND_CODE, the > status is definitely coming from the chip. If I run a full self test > immediately after this, everything works as normal. Interesting. What you call full and partial self test, those are the same command (TPM2_SelfTest), just with fullTest=YES and fullTest=NO, right? Then it seems even stranger that whether you get RC_COMMAND_CODE does not depend on the command but on the parameter. > Can anyone from the manufacturers shed any light on the > TPM_RC_COMMAND_CODE return from a partial self test? It's the first time I see this usage of that return code. The specification says that this code means "command code not supported", which to me sounds rather like "command not implemented", not "command temporarily not available". Maybe this is just a bug in this TPM implementation? Alexander
On Fri, 2018-02-16 at 20:26 +0100, Alexander Steffen wrote: > On 16.02.2018 19:59, James Bottomley wrote: > > > > On Fri, 2018-02-16 at 10:17 -0800, James Bottomley wrote: > > > > > > On Fri, 2018-02-16 at 10:34 +0200, Jarkko Sakkinen wrote: > > > > > > > > > > > > Hi > > > > > > > > See the comments below and please include this as a prequel > > > > patch for the next version: > > > > > > > > https://patchwork.kernel.org/patch/10208965/ > > > > > > Actually, I've got another curious bit of behaviour from my > > > Nuvoton: After an experiment with primary EK generation that > > > sent it into failure mode (connected with something else, nothing > > > to do with this), it's taken to returning 232 > > > (TPM_RC_COMMAND_CODE) to a partial self test periodically on > > > boot. According to the manual this is impossible, so I think > > > it's something to do with tpm_validate_command. I think, > > > perhaps, we shouldn't call the getcaps for the command codes > > > until the self test is over, but I need to do more debugging to > > > track down what's going on. > > > > > > I've also got other reports of TPM_RC_COMMAND_CODE failures > > > during self test, so this isn't just my screwed up chip. > > > > This isn't anything to do with us synthesizing TPM_RC_COMMAND_CODE, > > the status is definitely coming from the chip. If I run a full > > self test immediately after this, everything works as normal. > > Interesting. What you call full and partial self test, those are the > same command (TPM2_SelfTest), just with fullTest=YES and fullTest=NO, > right? Then it seems even stranger that whether you get > RC_COMMAND_CODE does not depend on the command but on the parameter. Yes, I did a lot of debugging because this is unexpected, but nevertheless that's what I see. > > Can anyone from the manufacturers shed any light on the > > TPM_RC_COMMAND_CODE return from a partial self test? > > It's the first time I see this usage of that return code. The > specification says that this code means "command code not supported", > which to me sounds rather like "command not implemented", not > "command temporarily not available". Maybe this is just a bug in this > TPM implementation? That was my first theory. However, even if it's only Nuvoton specific they're pretty widely deployed (all the Dell laptops), so we have to cope with it in the kernel. But I've also got reports of the same problem affecting a ST Micro TPM, so it looks like an implementation that's spreading. For those who can play along at home, the way I induce the TPM failure is to execute tsscreateek -cp -alg ec -noflush What seems to be happening is that most TPM implementations have a hard coded short cut for primary endorsement key generation, but mine seems to be missing the EC certificate, so asking it to generate an EC primary for the endorsement seed hits a BUG_ON() in its internal implementation code which causes the TPM to go into failure mode. James
On Fri, 2018-02-16 at 10:34 +0200, Jarkko Sakkinen wrote: > Hi > > See the comments below and please include this as a prequel patch for > the next version: > > https://patchwork.kernel.org/patch/10208965/ OK, and responding to review comments now rather than reporting other anomalies seen. [...] > diff --git a/drivers/char/tpm/tpm-interface.c > > b/drivers/char/tpm/tpm-interface.c > > index 3cec403a80b3..c3e6d0248af8 100644 > > --- a/drivers/char/tpm/tpm-interface.c > > +++ b/drivers/char/tpm/tpm-interface.c > > @@ -521,12 +521,32 @@ ssize_t tpm_transmit_cmd(struct tpm_chip > > *chip, struct tpm_space *space, > > const struct tpm_output_header *header = buf; > > int err; > > ssize_t len; > > + unsigned int delay_msec = 20; > > > > - len = tpm_transmit(chip, space, (u8 *)buf, bufsiz, flags); > > - if (len < 0) > > - return len; > > + /* > > + * on first probe we kick off a TPM self test in the > > + * background This means the TPM may return RC_TESTING to > > any > > + * command that tries to use a subsystem under test, so do > > an > > + * exponential backoff wait if that happens > > + */ > > + for (;;) { > > + len = tpm_transmit(chip, space, (u8 *)buf, bufsiz, > > flags); > > + if (len < 0) > > + return len; > > + > > + err = be32_to_cpu(header->return_code); > > + if (err != TPM2_RC_TESTING || > > + (flags & TPM_TRANSMIT_NOWAIT)) > > + break; > > + > > + delay_msec *= 2; > > + if (delay_msec > TPM2_DURATION_LONG) { > > + dev_err(&chip->dev,"TPM: still running > > self tests, giving up waiting\n"); > > + break; > > + } > > + tpm_msleep(delay_msec); > > + } > > Please have a helper function for this. OK. > > > > > > - err = be32_to_cpu(header->return_code); > > if (err != 0 && desc) > > dev_err(&chip->dev, "A TPM error (%d) occurred > > %s\n", err, > > desc); > > diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h > > index 528cffbd49d3..47c5a5206325 100644 > > --- a/drivers/char/tpm/tpm.h > > +++ b/drivers/char/tpm/tpm.h > > @@ -495,6 +495,7 @@ extern struct idr dev_nums_idr; > > enum tpm_transmit_flags { > > TPM_TRANSMIT_UNLOCKED = BIT(0), > > TPM_TRANSMIT_RAW = BIT(1), > > + TPM_TRANSMIT_NOWAIT = BIT(2), > > Note to myself: TPM_TRANSMIT_RAW is a retarded name that does not > mean > anything. That should be renamed simply as > TPM_TRANSMIT_IGNORE_LOCALITY. > > It is really hard to interpret what TPM_TRANSMIT_NOWAIT means. Please > just name it as TPM_TRANSMIT_IGNORE_TESTING or propose something > better. I think that name makes it obvious what the flag means. TPM_TRANSMIT_NO_WAIT_TESTING? [...] > I'm not sure how tpm_write() call path is handled. It isn't currently since it uses tpm_transmit directly. My thought on this is that if the TPM hasn't got its testing crap together by the time we enter userspace (which will be 10 or more seconds after we first sent the test commands), then we really have a problem and the user should see it. James
On Fri, Feb 16, 2018 at 12:15:08PM -0800, James Bottomley wrote: > It isn't currently since it uses tpm_transmit directly. My thought on > this is that if the TPM hasn't got its testing crap together by the > time we enter userspace (which will be 10 or more seconds after we > first sent the test commands), then we really have a problem and the > user should see it. Why would it be 10s? My embedded systems got to userspace in something like 0.5s after sending the startup. Not sure what to do here.. Our model has been that userspace gets a raw view - but it has also been that the kernel makes the TPM fully ready. Jason
On Sun, 2018-02-18 at 10:08 -0700, Jason Gunthorpe wrote: > On Fri, Feb 16, 2018 at 12:15:08PM -0800, James Bottomley wrote: > > > > It isn't currently since it uses tpm_transmit directly. My thought > > on this is that if the TPM hasn't got its testing crap together by > > the time we enter userspace (which will be 10 or more seconds after > > we first sent the test commands), then we really have a problem and > > the user should see it. > > Why would it be 10s? My embedded systems got to userspace in > something like 0.5s after sending the startup. The misbehaving chips seem to be laptop, and that's about what it takes mine to get through the boot sequence ... and I thought waiting 2s for the TPM to self test was a long time for me; it must be an eternity to you ... > Not sure what to do here.. Our model has been that userspace gets a > raw view - but it has also been that the kernel makes the TPM fully > ready. Well, I could move the wait for testing to finish loop to tpm_transmit(). That would cope with both cases. However, I've never actually seen this loop activate, even with all the TPM misbehaviour I've managed to induce, so I have no objective measure for whether it's useful or not. James
On Sun, Feb 18, 2018 at 09:16:42AM -0800, James Bottomley wrote: > On Sun, 2018-02-18 at 10:08 -0700, Jason Gunthorpe wrote: > > On Fri, Feb 16, 2018 at 12:15:08PM -0800, James Bottomley wrote: > > > > > > It isn't currently since it uses tpm_transmit directly. My thought > > > on this is that if the TPM hasn't got its testing crap together by > > > the time we enter userspace (which will be 10 or more seconds after > > > we first sent the test commands), then we really have a problem and > > > the user should see it. > > > > Why would it be 10s? My embedded systems got to userspace in > > something like 0.5s after sending the startup. > > The misbehaving chips seem to be laptop, and that's about what it takes > mine to get through the boot sequence ... and I thought waiting 2s for > the TPM to self test was a long time for me; it must be an eternity to > you ... Yes :) The TPMs I used did not take 2 seconds to self test. Maybe all the new algorithms in TPM2 make it take much longer? > > Not sure what to do here.. Our model has been that userspace gets a > > raw view - but it has also been that the kernel makes the TPM fully > > ready. > > Well, I could move the wait for testing to finish loop to > tpm_transmit(). That would cope with both cases. However, I've never > actually seen this loop activate, even with all the TPM misbehaviour > I've managed to induce, so I have no objective measure for whether it's > useful or not. That is just a time issue, right? Or does the kernel send no commands early on that are depending on self test? Jason
On Sun, 2018-02-18 at 10:36 -0700, Jason Gunthorpe wrote: > On Sun, Feb 18, 2018 at 09:16:42AM -0800, James Bottomley wrote: > > > > On Sun, 2018-02-18 at 10:08 -0700, Jason Gunthorpe wrote: > > > > > > On Fri, Feb 16, 2018 at 12:15:08PM -0800, James Bottomley wrote: > > > > > > > > > > > > It isn't currently since it uses tpm_transmit directly. My > > > > thought on this is that if the TPM hasn't got its testing crap > > > > together by the time we enter userspace (which will be 10 or > > > > more seconds after we first sent the test commands), then we > > > > really have a problem and the user should see it. > > > > > > Why would it be 10s? My embedded systems got to userspace in > > > something like 0.5s after sending the startup. > > > > The misbehaving chips seem to be laptop, and that's about what it > > takes mine to get through the boot sequence ... and I thought > > waiting 2s for the TPM to self test was a long time for me; it must > > be an eternity to you ... > > Yes :) The TPMs I used did not take 2 seconds to self test. Maybe all > the new algorithms in TPM2 make it take much longer? Heh, this is all undefined territory. The spec says what the TPM is allowed to do (and some of the TPMs don't even respect that), but it doesn't say what we should do, so we're winging it. However, if my TPM returns TPM_RC_TESTING and we wait for all self- tests to complete, it's definitely taking over 2s. Hence my argument that we shouldn't wait. > > > Not sure what to do here.. Our model has been that userspace gets > > > a raw view - but it has also been that the kernel makes the TPM > > > fully ready. > > > > Well, I could move the wait for testing to finish loop to > > tpm_transmit(). That would cope with both cases. However, I've > > never actually seen this loop activate, even with all the TPM > > misbehaviour I've managed to induce, so I have no objective measure > > for whether it's useful or not. > > That is just a time issue, right? Or does the kernel send no commands > early on that are depending on self test? I've got IMA enabled on my system, so they get an immediate read and update of PCR values within milliseconds of exiting the self test, which succeeds. Now the TPM is allowed to process systems that have completed test even if some others are under testing and I'd guess that the PCR systems are the simplest to test and first to complete. The first thing my system does in userspace is start the VPN which has a TPM key, so I'm using the cryptographic function reasonably fast as well, but that's after the initial kernel bring up, so the fastest I've seen it is 5s after the TPM exits self test. James
On Fri, 2018-02-16 at 10:17 -0800, James Bottomley wrote: > so I think it's something to do with tpm_validate_command. I think, > perhaps, we shouldn't call the getcaps for the command codes until > the > self test is over, but I need to do more debugging to track down > what's > going on. The calls for tpm2_get_pcr_allocation() and tpm2_get_cc_attrs_tbl() could be also moved before the self test. /Jarkko
On Tue, 2018-02-20 at 15:30 +0200, Jarkko Sakkinen wrote: > On Fri, 2018-02-16 at 10:17 -0800, James Bottomley wrote: > > > > so I think it's something to do with tpm_validate_command. I > > think, perhaps, we shouldn't call the getcaps for the command codes > > until the self test is over, but I need to do more debugging to > > track down what's going on. > > The calls for tpm2_get_pcr_allocation() and tpm2_get_cc_attrs_tbl() > could be also moved before the self test. That's not a good idea for a couple of reasons 1. You really should do as little as possible with the TPM before the self test 2. The TPM might not be started before the self test, so it would error all commands with TPM_RC_INITIALIZE anyway (this was the problem with the initial version of the patch set). So self test should be the first command we send to the TPM. The only reason I was suspicious of tpm_validate_command() is because it can manufacture a TPM_RC_COMMAND_CODE return. However, that turned out not to be the case (and tpm_validate_command() has a bypass for sending everything to the TPM before the attribute table is initialized, so it's all working correctly). James
On Fri, 2018-02-16 at 11:45 -0800, James Bottomley wrote:
> tsscreateek -cp -alg ec -noflush
Can you describe in high-level what this command does? I will rather
add a test to my smoke test suite than depend on TSS implementations
for various reasons. This seems like a good test case to add as
part of it.
/Jarkko
On Fri, 2018-02-16 at 12:15 -0800, James Bottomley wrote: > > I'm not sure how tpm_write() call path is handled. > > It isn't currently since it uses tpm_transmit directly. My thought > on > this is that if the TPM hasn't got its testing crap together by the > time we enter userspace (which will be 10 or more seconds after we > first sent the test commands), then we really have a problem and the > user should see it. Fair enough. /Jarkko
On Tue, 2018-02-20 at 16:24 +0200, Jarkko Sakkinen wrote: > On Fri, 2018-02-16 at 11:45 -0800, James Bottomley wrote: > > > > tsscreateek -cp -alg ec -noflush > > Can you describe in high-level what this command does? I will rather > add a test to my smoke test suite than depend on TSS implementations > for various reasons. This seems like a good test case to add as > part of it. It's basically doing a create primary on the endorsement seed for an elliptic curve key. However, it first tries to get the seed template and unique data from the correct NV index, and if that doesn't work it uses the data defined in: https://trustedcomputinggroup.org/tcg-ek-credential-profile-tpm-family-2-0/ to build a template and uses that. I think what's happening is my Nuvoton recognises the template and tries its derivation shortcut which causes a BUG_ON in its implementation because no EC keys or certificate was provisioned in this TPM. James
EOn Tue, 2018-02-20 at 08:57 -0500, James Bottomley wrote: > On Tue, 2018-02-20 at 15:30 +0200, Jarkko Sakkinen wrote: > > The calls for tpm2_get_pcr_allocation() and tpm2_get_cc_attrs_tbl() > > could be also moved before the self test. > > That's not a good idea for a couple of reasons > > 1. You really should do as little as possible with the TPM before the > self test As Alexander correctly pointed out earlier, the section 12.3 Self-Test Modes of the architecture specification states that "If a command requires use of an untested algorithm or functional module, the TPM performs the test and then completes the command actions." It would mean only running the self test for GetCapability as the first test if I understand what I'm reading correctly. > 2. The TPM might not be started before the self test, so it would error > all commands with TPM_RC_INITIALIZE anyway (this was the problem > with the initial version of the patch set). Do not see an issue to run Startup beforehand. > So self test should be the first command we send to the TPM. The only > reason I was suspicious of tpm_validate_command() is because it can > manufacture a TPM_RC_COMMAND_CODE return. However, that turned out not > to be the case (and tpm_validate_command() has a bypass for sending > everything to the TPM before the attribute table is initialized, so > it's all working correctly). > > James /Jarkko
On Tue, 2018-02-20 at 19:22 +0200, Jarkko Sakkinen wrote: > EOn Tue, 2018-02-20 at 08:57 -0500, James Bottomley wrote: > > > > On Tue, 2018-02-20 at 15:30 +0200, Jarkko Sakkinen wrote: > > > > > > The calls for tpm2_get_pcr_allocation() and > > > tpm2_get_cc_attrs_tbl() > > > could be also moved before the self test. > > > > That's not a good idea for a couple of reasons > > > > 1. You really should do as little as possible with the TPM > > before the > > self test > > As Alexander correctly pointed out earlier, the section 12.3 > Self-Test Modes of the architecture specification states that > > "If a command requires use of an untested algorithm or functional > module, the TPM performs the test and then completes the command > actions." > > It would mean only running the self test for GetCapability as the > first test if I understand what I'm reading correctly. > > > > > 2. The TPM might not be started before the self test, so it > > would error > > all commands with TPM_RC_INITIALIZE anyway (this was the > > problem > > with the initial version of the patch set). > > Do not see an issue to run Startup beforehand. I still don't think it serves any useful purpose and it gives us more to unwind if the self test fails, so occams razor would say do it after the selftest passes. Jaems
On 2/20/2018 9:24 AM, Jarkko Sakkinen wrote: > On Fri, 2018-02-16 at 11:45 -0800, James Bottomley wrote: >> tsscreateek -cp -alg ec -noflush > > Can you describe in high-level what this command does? I will rather > add a test to my smoke test suite than depend on TSS implementations > for various reasons. This seems like a good test case to add as > part of it. It actually does a lot under the covers, but the end result is an ECC Endorsement Key is created on the TPM. 1 - Reads the possible EK template and EK nonce from the TPM NV. This involves: nvreadpublic to check for the existence and get the index size nvread to get the data, possibly in a loop which in turn needs a getcapability to determine the chunk size for the NV read 2 - Runs createprimary 3 - Similar to #1 to read the EK certificate from NV It then validates the EK public key against the certificate (not using the TPM) to check that everything worked.
diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index 3cec403a80b3..c3e6d0248af8 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -521,12 +521,32 @@ ssize_t tpm_transmit_cmd(struct tpm_chip *chip, struct tpm_space *space, const struct tpm_output_header *header = buf; int err; ssize_t len; + unsigned int delay_msec = 20; - len = tpm_transmit(chip, space, (u8 *)buf, bufsiz, flags); - if (len < 0) - return len; + /* + * on first probe we kick off a TPM self test in the + * background This means the TPM may return RC_TESTING to any + * command that tries to use a subsystem under test, so do an + * exponential backoff wait if that happens + */ + for (;;) { + len = tpm_transmit(chip, space, (u8 *)buf, bufsiz, flags); + if (len < 0) + return len; + + err = be32_to_cpu(header->return_code); + if (err != TPM2_RC_TESTING || + (flags & TPM_TRANSMIT_NOWAIT)) + break; + + delay_msec *= 2; + if (delay_msec > TPM2_DURATION_LONG) { + dev_err(&chip->dev,"TPM: still running self tests, giving up waiting\n"); + break; + } + tpm_msleep(delay_msec); + } - err = be32_to_cpu(header->return_code); if (err != 0 && desc) dev_err(&chip->dev, "A TPM error (%d) occurred %s\n", err, desc); diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 528cffbd49d3..47c5a5206325 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -495,6 +495,7 @@ extern struct idr dev_nums_idr; enum tpm_transmit_flags { TPM_TRANSMIT_UNLOCKED = BIT(0), TPM_TRANSMIT_RAW = BIT(1), + TPM_TRANSMIT_NOWAIT = BIT(2), }; ssize_t tpm_transmit(struct tpm_chip *chip, struct tpm_space *space, diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index f6be08483ae6..169232179182 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -853,28 +853,24 @@ static const struct tpm_input_header tpm2_selftest_header = { static int tpm2_do_selftest(struct tpm_chip *chip) { int rc; - unsigned int delay_msec = 20; - long duration; struct tpm2_cmd cmd; - duration = jiffies_to_msecs( - tpm2_calc_ordinal_duration(chip, TPM2_CC_SELF_TEST)); - - while (duration > 0) { - cmd.header.in = tpm2_selftest_header; - cmd.params.selftest_in.full_test = 0; - - rc = tpm_transmit_cmd(chip, NULL, &cmd, TPM2_SELF_TEST_IN_SIZE, - 0, 0, "continue selftest"); - - if (rc != TPM2_RC_TESTING) - break; - - tpm_msleep(delay_msec); - duration -= delay_msec; - - /* wait longer the next round */ - delay_msec *= 2; + cmd.header.in = tpm2_selftest_header; + cmd.params.selftest_in.full_test = 0; + + rc = tpm_transmit_cmd(chip, NULL, &cmd, TPM2_SELF_TEST_IN_SIZE, + 0, TPM_TRANSMIT_NOWAIT, "continue selftest"); + + if (rc == TPM2_RC_TESTING) { + /* + * A return of RC_TESTING means the TPM is still + * running self tests. If one fails it will go into + * failure mode and return RC_FAILED to every command, + * so treat a still in testing return as a success + * rather than causing a driver detach. + */ + dev_info(&chip->dev,"TPM: Running self test in background\n"); + rc = TPM2_RC_SUCCESS; } return rc;