diff mbox

[04/12] fs: ceph: CURRENT_TIME with ktime_get_real_ts()

Message ID 1491613030-11599-5-git-send-email-deepa.kernel@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Deepa Dinamani April 8, 2017, 12:57 a.m. UTC
CURRENT_TIME is not y2038 safe.
The macro will be deleted and all the references to it
will be replaced by ktime_get_* apis.

struct timespec is also not y2038 safe.
Retain timespec for timestamp representation here as ceph
uses it internally everywhere.
These references will be changed to use struct timespec64
in a separate patch.

The current_fs_time() api is being changed to use vfs
struct inode* as an argument instead of struct super_block*.

Set the new mds client request r_stamp field using
ktime_get_real_ts() instead of using current_fs_time().

Also, since r_stamp is used as mtime on the server, use
timespec_trunc() to truncate the timestamp, using the right
granularity from the superblock.

This api will be transitioned to be y2038 safe along
with vfs.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/block/rbd.c   | 2 +-
 fs/ceph/mds_client.c  | 4 +++-
 net/ceph/messenger.c  | 6 ++++--
 net/ceph/osd_client.c | 4 ++--
 4 files changed, 10 insertions(+), 6 deletions(-)

Comments

Yan, Zheng June 1, 2017, 9:56 a.m. UTC | #1
On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
> CURRENT_TIME is not y2038 safe.
> The macro will be deleted and all the references to it
> will be replaced by ktime_get_* apis.
>
> struct timespec is also not y2038 safe.
> Retain timespec for timestamp representation here as ceph
> uses it internally everywhere.
> These references will be changed to use struct timespec64
> in a separate patch.
>
> The current_fs_time() api is being changed to use vfs
> struct inode* as an argument instead of struct super_block*.
>
> Set the new mds client request r_stamp field using
> ktime_get_real_ts() instead of using current_fs_time().
>
> Also, since r_stamp is used as mtime on the server, use
> timespec_trunc() to truncate the timestamp, using the right
> granularity from the superblock.
>
> This api will be transitioned to be y2038 safe along
> with vfs.
>
> Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
> Reviewed-by: Arnd Bergmann <arnd@arndb.de>
> ---
>  drivers/block/rbd.c   | 2 +-
>  fs/ceph/mds_client.c  | 4 +++-
>  net/ceph/messenger.c  | 6 ++++--
>  net/ceph/osd_client.c | 4 ++--
>  4 files changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 517838b..77204da 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>  {
>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>
> -       osd_req->r_mtime = CURRENT_TIME;
> +       ktime_get_real_ts(&osd_req->r_mtime);
>         osd_req->r_data_offset = obj_request->offset;
>  }
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index c681762..1d3fa90 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>  {
>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
> +       struct timespec ts;
>
>         if (!req)
>                 return ERR_PTR(-ENOMEM);
> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>         init_completion(&req->r_safe_completion);
>         INIT_LIST_HEAD(&req->r_unsafe_item);
>
> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
> +       ktime_get_real_ts(&ts);
> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);

This change causes our kernel_untar_tar test case to fail (inode's
ctime goes back). The reason is that there is time drift between the
time stamps got by  ktime_get_real_ts() and current_time(). We need to
revert this change until current_time() uses ktime_get_real_ts()
internally.

Regards
Yan, Zheng


>
>         req->r_op = op;
>         req->r_direct_mode = mode;
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index f76bb33..5766a6c 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -1386,8 +1386,9 @@ static void prepare_write_keepalive(struct ceph_connection *con)
>         dout("prepare_write_keepalive %p\n", con);
>         con_out_kvec_reset(con);
>         if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
> -               struct timespec now = CURRENT_TIME;
> +               struct timespec now;
>
> +               ktime_get_real_ts(&now);
>                 con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
>                 ceph_encode_timespec(&con->out_temp_keepalive2, &now);
>                 con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
> @@ -3176,8 +3177,9 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con,
>  {
>         if (interval > 0 &&
>             (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
> -               struct timespec now = CURRENT_TIME;
> +               struct timespec now;
>                 struct timespec ts;
> +               ktime_get_real_ts(&now);
>                 jiffies_to_timespec(interval, &ts);
>                 ts = timespec_add(con->last_keepalive_ack, ts);
>                 return timespec_compare(&now, &ts) >= 0;
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index e15ea9e..242d7c0 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -3574,7 +3574,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
>         ceph_oid_copy(&lreq->t.base_oid, oid);
>         ceph_oloc_copy(&lreq->t.base_oloc, oloc);
>         lreq->t.flags = CEPH_OSD_FLAG_WRITE;
> -       lreq->mtime = CURRENT_TIME;
> +       ktime_get_real_ts(&lreq->mtime);
>
>         lreq->reg_req = alloc_linger_request(lreq);
>         if (!lreq->reg_req) {
> @@ -3632,7 +3632,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
>         ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
>         ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
>         req->r_flags = CEPH_OSD_FLAG_WRITE;
> -       req->r_mtime = CURRENT_TIME;
> +       ktime_get_real_ts(&req->r_mtime);
>         osd_req_op_watch_init(req, 0, lreq->linger_id,
>                               CEPH_OSD_WATCH_OP_UNWATCH);
>
> --
> 2.7.4
>
Arnd Bergmann June 1, 2017, 10:22 a.m. UTC | #2
On Thu, Jun 1, 2017 at 11:56 AM, Yan, Zheng <ukernel@gmail.com> wrote:
> On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:

>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>> index 517838b..77204da 100644
>> --- a/drivers/block/rbd.c
>> +++ b/drivers/block/rbd.c
>> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>>  {
>>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>>
>> -       osd_req->r_mtime = CURRENT_TIME;
>> +       ktime_get_real_ts(&osd_req->r_mtime);
>>         osd_req->r_data_offset = obj_request->offset;
>>  }
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index c681762..1d3fa90 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>  {
>>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
>> +       struct timespec ts;
>>
>>         if (!req)
>>                 return ERR_PTR(-ENOMEM);
>> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>         init_completion(&req->r_safe_completion);
>>         INIT_LIST_HEAD(&req->r_unsafe_item);
>>
>> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
>> +       ktime_get_real_ts(&ts);
>> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
>
> This change causes our kernel_untar_tar test case to fail (inode's
> ctime goes back). The reason is that there is time drift between the
> time stamps got by  ktime_get_real_ts() and current_time(). We need to
> revert this change until current_time() uses ktime_get_real_ts()
> internally.

Hmm, the change was not supposed to have a user-visible effect, so
something has gone wrong, but I don't immediately see how it
relates to what you observe.

ktime_get_real_ts() and current_time() use the same time base, there
is no drift, but there is a difference in resolution, as the latter uses
the time stamp of the last jiffies update, which may be up to one jiffy
(10ms) behind the exact time we put in the request stamps here.

Do you still see problems if you use current_kernel_time() instead of
ktime_get_real_ts()?

        Arnd
Yan, Zheng June 2, 2017, 12:26 a.m. UTC | #3
On Thu, Jun 1, 2017 at 6:22 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Thu, Jun 1, 2017 at 11:56 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>> On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>
>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>> index 517838b..77204da 100644
>>> --- a/drivers/block/rbd.c
>>> +++ b/drivers/block/rbd.c
>>> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>>>  {
>>>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>>>
>>> -       osd_req->r_mtime = CURRENT_TIME;
>>> +       ktime_get_real_ts(&osd_req->r_mtime);
>>>         osd_req->r_data_offset = obj_request->offset;
>>>  }
>>>
>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>> index c681762..1d3fa90 100644
>>> --- a/fs/ceph/mds_client.c
>>> +++ b/fs/ceph/mds_client.c
>>> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>>>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>  {
>>>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
>>> +       struct timespec ts;
>>>
>>>         if (!req)
>>>                 return ERR_PTR(-ENOMEM);
>>> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>         init_completion(&req->r_safe_completion);
>>>         INIT_LIST_HEAD(&req->r_unsafe_item);
>>>
>>> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
>>> +       ktime_get_real_ts(&ts);
>>> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
>>
>> This change causes our kernel_untar_tar test case to fail (inode's
>> ctime goes back). The reason is that there is time drift between the
>> time stamps got by  ktime_get_real_ts() and current_time(). We need to
>> revert this change until current_time() uses ktime_get_real_ts()
>> internally.
>
> Hmm, the change was not supposed to have a user-visible effect, so
> something has gone wrong, but I don't immediately see how it
> relates to what you observe.
>
> ktime_get_real_ts() and current_time() use the same time base, there
> is no drift, but there is a difference in resolution, as the latter uses
> the time stamp of the last jiffies update, which may be up to one jiffy
> (10ms) behind the exact time we put in the request stamps here.
>
> Do you still see problems if you use current_kernel_time() instead of
> ktime_get_real_ts()?

The problem disappears after using current_kernel_time().

https://github.com/ceph/ceph-client/commit/2e0f648da23167034a3cf1500bc90ec60aef2417


Regards
Yan, Zheng
>
>         Arnd
Yan, Zheng June 2, 2017, 12:35 a.m. UTC | #4
On Thu, Jun 1, 2017 at 6:22 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Thu, Jun 1, 2017 at 11:56 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>> On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>
>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>> index 517838b..77204da 100644
>>> --- a/drivers/block/rbd.c
>>> +++ b/drivers/block/rbd.c
>>> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>>>  {
>>>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>>>
>>> -       osd_req->r_mtime = CURRENT_TIME;
>>> +       ktime_get_real_ts(&osd_req->r_mtime);
>>>         osd_req->r_data_offset = obj_request->offset;
>>>  }
>>>
>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>> index c681762..1d3fa90 100644
>>> --- a/fs/ceph/mds_client.c
>>> +++ b/fs/ceph/mds_client.c
>>> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>>>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>  {
>>>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
>>> +       struct timespec ts;
>>>
>>>         if (!req)
>>>                 return ERR_PTR(-ENOMEM);
>>> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>         init_completion(&req->r_safe_completion);
>>>         INIT_LIST_HEAD(&req->r_unsafe_item);
>>>
>>> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
>>> +       ktime_get_real_ts(&ts);
>>> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
>>
>> This change causes our kernel_untar_tar test case to fail (inode's
>> ctime goes back). The reason is that there is time drift between the
>> time stamps got by  ktime_get_real_ts() and current_time(). We need to
>> revert this change until current_time() uses ktime_get_real_ts()
>> internally.
>
> Hmm, the change was not supposed to have a user-visible effect, so
> something has gone wrong, but I don't immediately see how it
> relates to what you observe.
>
> ktime_get_real_ts() and current_time() use the same time base, there
> is no drift, but there is a difference in resolution, as the latter uses
> the time stamp of the last jiffies update, which may be up to one jiffy
> (10ms) behind the exact time we put in the request stamps here.
>
It happens in following sequence of events

1. create a new file, the inode's ctime is set to ktime_get_real_ts()
2. chmod the new file, the inode's ctime is set to current_time().

Inode's ctime goes back when current_time() is behind ktime_get_real_ts().

Regards
Yan, Zheng

> Do you still see problems if you use current_kernel_time() instead of
> ktime_get_real_ts()?
>
>         Arnd
John Stultz June 2, 2017, 12:36 a.m. UTC | #5
On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
> On Thu, Jun 1, 2017 at 6:22 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>> On Thu, Jun 1, 2017 at 11:56 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>> On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>
>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>> index 517838b..77204da 100644
>>>> --- a/drivers/block/rbd.c
>>>> +++ b/drivers/block/rbd.c
>>>> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>>>>  {
>>>>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>>>>
>>>> -       osd_req->r_mtime = CURRENT_TIME;
>>>> +       ktime_get_real_ts(&osd_req->r_mtime);
>>>>         osd_req->r_data_offset = obj_request->offset;
>>>>  }
>>>>
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index c681762..1d3fa90 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>>>>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>  {
>>>>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
>>>> +       struct timespec ts;
>>>>
>>>>         if (!req)
>>>>                 return ERR_PTR(-ENOMEM);
>>>> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>         init_completion(&req->r_safe_completion);
>>>>         INIT_LIST_HEAD(&req->r_unsafe_item);
>>>>
>>>> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
>>>> +       ktime_get_real_ts(&ts);
>>>> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
>>>
>>> This change causes our kernel_untar_tar test case to fail (inode's
>>> ctime goes back). The reason is that there is time drift between the
>>> time stamps got by  ktime_get_real_ts() and current_time(). We need to
>>> revert this change until current_time() uses ktime_get_real_ts()
>>> internally.
>>
>> Hmm, the change was not supposed to have a user-visible effect, so
>> something has gone wrong, but I don't immediately see how it
>> relates to what you observe.
>>
>> ktime_get_real_ts() and current_time() use the same time base, there
>> is no drift, but there is a difference in resolution, as the latter uses
>> the time stamp of the last jiffies update, which may be up to one jiffy
>> (10ms) behind the exact time we put in the request stamps here.
>>
>> Do you still see problems if you use current_kernel_time() instead of
>> ktime_get_real_ts()?
>
> The problem disappears after using current_kernel_time().
>
> https://github.com/ceph/ceph-client/commit/2e0f648da23167034a3cf1500bc90ec60aef2417

From the commit above:
"It seems there is time drift between ktime_get_real_ts() and
current_kernel_time()"

Its more of a granularity difference. current_kernel_time() returns
the cached time at the last tick, where as ktime_get_real_ts() reads
the clocksource hardware and returns the immediate time.

Filesystems usually use the cached time (similar to
CLOCK_REALTIME_COARSE), for performance reasons, as touching the
clocksource takes time.

thanks
-john
Deepa Dinamani June 2, 2017, 12:57 a.m. UTC | #6
On Thu, Jun 1, 2017 at 5:36 PM, John Stultz <john.stultz@linaro.org> wrote:
> On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>> On Thu, Jun 1, 2017 at 6:22 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>> On Thu, Jun 1, 2017 at 11:56 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>> On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>>
>>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>>> index 517838b..77204da 100644
>>>>> --- a/drivers/block/rbd.c
>>>>> +++ b/drivers/block/rbd.c
>>>>> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>>>>>  {
>>>>>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>>>>>
>>>>> -       osd_req->r_mtime = CURRENT_TIME;
>>>>> +       ktime_get_real_ts(&osd_req->r_mtime);
>>>>>         osd_req->r_data_offset = obj_request->offset;
>>>>>  }
>>>>>
>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>> index c681762..1d3fa90 100644
>>>>> --- a/fs/ceph/mds_client.c
>>>>> +++ b/fs/ceph/mds_client.c
>>>>> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>>>>>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>>  {
>>>>>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
>>>>> +       struct timespec ts;
>>>>>
>>>>>         if (!req)
>>>>>                 return ERR_PTR(-ENOMEM);
>>>>> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>>         init_completion(&req->r_safe_completion);
>>>>>         INIT_LIST_HEAD(&req->r_unsafe_item);
>>>>>
>>>>> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
>>>>> +       ktime_get_real_ts(&ts);
>>>>> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
>>>>
>>>> This change causes our kernel_untar_tar test case to fail (inode's
>>>> ctime goes back). The reason is that there is time drift between the
>>>> time stamps got by  ktime_get_real_ts() and current_time(). We need to
>>>> revert this change until current_time() uses ktime_get_real_ts()
>>>> internally.
>>>
>>> Hmm, the change was not supposed to have a user-visible effect, so
>>> something has gone wrong, but I don't immediately see how it
>>> relates to what you observe.
>>>
>>> ktime_get_real_ts() and current_time() use the same time base, there
>>> is no drift, but there is a difference in resolution, as the latter uses
>>> the time stamp of the last jiffies update, which may be up to one jiffy
>>> (10ms) behind the exact time we put in the request stamps here.
>>>
>>> Do you still see problems if you use current_kernel_time() instead of
>>> ktime_get_real_ts()?
>>
>> The problem disappears after using current_kernel_time().
>>
>> https://github.com/ceph/ceph-client/commit/2e0f648da23167034a3cf1500bc90ec60aef2417
>
> From the commit above:
> "It seems there is time drift between ktime_get_real_ts() and
> current_kernel_time()"
>
> Its more of a granularity difference. current_kernel_time() returns
> the cached time at the last tick, where as ktime_get_real_ts() reads
> the clocksource hardware and returns the immediate time.
>
> Filesystems usually use the cached time (similar to
> CLOCK_REALTIME_COARSE), for performance reasons, as touching the
> clocksource takes time.

Alternatively, it would be best for this code also to use current_time().
I had suggested this in one of the previous versions of the patch.
The implementation of current_time() will change when we switch vfs to
use 64 bit time. This will prevent such errors from happening again.
But, this also means there is more code reordering for these modules
to get a reference to inode.

-Deepa
Yan, Zheng June 2, 2017, 2:09 a.m. UTC | #7
On Fri, Jun 2, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
> On Thu, Jun 1, 2017 at 5:36 PM, John Stultz <john.stultz@linaro.org> wrote:
>> On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>> On Thu, Jun 1, 2017 at 6:22 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>>> On Thu, Jun 1, 2017 at 11:56 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>>> On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>>>
>>>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>>>> index 517838b..77204da 100644
>>>>>> --- a/drivers/block/rbd.c
>>>>>> +++ b/drivers/block/rbd.c
>>>>>> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>>>>>>  {
>>>>>>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>>>>>>
>>>>>> -       osd_req->r_mtime = CURRENT_TIME;
>>>>>> +       ktime_get_real_ts(&osd_req->r_mtime);
>>>>>>         osd_req->r_data_offset = obj_request->offset;
>>>>>>  }
>>>>>>
>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>> index c681762..1d3fa90 100644
>>>>>> --- a/fs/ceph/mds_client.c
>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>>>>>>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>>>  {
>>>>>>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
>>>>>> +       struct timespec ts;
>>>>>>
>>>>>>         if (!req)
>>>>>>                 return ERR_PTR(-ENOMEM);
>>>>>> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>>>         init_completion(&req->r_safe_completion);
>>>>>>         INIT_LIST_HEAD(&req->r_unsafe_item);
>>>>>>
>>>>>> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
>>>>>> +       ktime_get_real_ts(&ts);
>>>>>> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
>>>>>
>>>>> This change causes our kernel_untar_tar test case to fail (inode's
>>>>> ctime goes back). The reason is that there is time drift between the
>>>>> time stamps got by  ktime_get_real_ts() and current_time(). We need to
>>>>> revert this change until current_time() uses ktime_get_real_ts()
>>>>> internally.
>>>>
>>>> Hmm, the change was not supposed to have a user-visible effect, so
>>>> something has gone wrong, but I don't immediately see how it
>>>> relates to what you observe.
>>>>
>>>> ktime_get_real_ts() and current_time() use the same time base, there
>>>> is no drift, but there is a difference in resolution, as the latter uses
>>>> the time stamp of the last jiffies update, which may be up to one jiffy
>>>> (10ms) behind the exact time we put in the request stamps here.
>>>>
>>>> Do you still see problems if you use current_kernel_time() instead of
>>>> ktime_get_real_ts()?
>>>
>>> The problem disappears after using current_kernel_time().
>>>
>>> https://github.com/ceph/ceph-client/commit/2e0f648da23167034a3cf1500bc90ec60aef2417
>>
>> From the commit above:
>> "It seems there is time drift between ktime_get_real_ts() and
>> current_kernel_time()"
>>
>> Its more of a granularity difference. current_kernel_time() returns
>> the cached time at the last tick, where as ktime_get_real_ts() reads
>> the clocksource hardware and returns the immediate time.
>>
>> Filesystems usually use the cached time (similar to
>> CLOCK_REALTIME_COARSE), for performance reasons, as touching the
>> clocksource takes time.
>
> Alternatively, it would be best for this code also to use current_time().
> I had suggested this in one of the previous versions of the patch.
> The implementation of current_time() will change when we switch vfs to
> use 64 bit time. This will prevent such errors from happening again.
> But, this also means there is more code reordering for these modules
> to get a reference to inode.
>

I took a look. it's quite inconvenience to use current_time(). I
prefer to temporarily use current_kernel_time().

Regards
Yan, Zheng



> -Deepa
Arnd Bergmann June 2, 2017, 9:45 a.m. UTC | #8
On Fri, Jun 2, 2017 at 4:09 AM, Yan, Zheng <ukernel@gmail.com> wrote:
> On Fri, Jun 2, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>> On Thu, Jun 1, 2017 at 5:36 PM, John Stultz <john.stultz@linaro.org> wrote:
>>> On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>> On Thu, Jun 1, 2017 at 6:22 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>>>> On Thu, Jun 1, 2017 at 11:56 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>>>> On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>>>>
>>>>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>>>>> index 517838b..77204da 100644
>>>>>>> --- a/drivers/block/rbd.c
>>>>>>> +++ b/drivers/block/rbd.c
>>>>>>> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>>>>>>>  {
>>>>>>>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>>>>>>>
>>>>>>> -       osd_req->r_mtime = CURRENT_TIME;
>>>>>>> +       ktime_get_real_ts(&osd_req->r_mtime);
>>>>>>>         osd_req->r_data_offset = obj_request->offset;
>>>>>>>  }
>>>>>>>
>>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>>> index c681762..1d3fa90 100644
>>>>>>> --- a/fs/ceph/mds_client.c
>>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>>> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>>>>>>>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>>>>  {
>>>>>>>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
>>>>>>> +       struct timespec ts;
>>>>>>>
>>>>>>>         if (!req)
>>>>>>>                 return ERR_PTR(-ENOMEM);
>>>>>>> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>>>>         init_completion(&req->r_safe_completion);
>>>>>>>         INIT_LIST_HEAD(&req->r_unsafe_item);
>>>>>>>
>>>>>>> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
>>>>>>> +       ktime_get_real_ts(&ts);
>>>>>>> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
>>>>>>
>>>>>> This change causes our kernel_untar_tar test case to fail (inode's
>>>>>> ctime goes back). The reason is that there is time drift between the
>>>>>> time stamps got by  ktime_get_real_ts() and current_time(). We need to
>>>>>> revert this change until current_time() uses ktime_get_real_ts()
>>>>>> internally.
>>>>>
>>>>> Hmm, the change was not supposed to have a user-visible effect, so
>>>>> something has gone wrong, but I don't immediately see how it
>>>>> relates to what you observe.
>>>>>
>>>>> ktime_get_real_ts() and current_time() use the same time base, there
>>>>> is no drift, but there is a difference in resolution, as the latter uses
>>>>> the time stamp of the last jiffies update, which may be up to one jiffy
>>>>> (10ms) behind the exact time we put in the request stamps here.
>>>>>
>>>>> Do you still see problems if you use current_kernel_time() instead of
>>>>> ktime_get_real_ts()?
>>>>
>>>> The problem disappears after using current_kernel_time().
>>>>
>>>> https://github.com/ceph/ceph-client/commit/2e0f648da23167034a3cf1500bc90ec60aef2417
>>>
>>> From the commit above:
>>> "It seems there is time drift between ktime_get_real_ts() and
>>> current_kernel_time()"
>>>
>>> Its more of a granularity difference. current_kernel_time() returns
>>> the cached time at the last tick, where as ktime_get_real_ts() reads
>>> the clocksource hardware and returns the immediate time.
>>>
>>> Filesystems usually use the cached time (similar to
>>> CLOCK_REALTIME_COARSE), for performance reasons, as touching the
>>> clocksource takes time.
>>
>> Alternatively, it would be best for this code also to use current_time().
>> I had suggested this in one of the previous versions of the patch.
>> The implementation of current_time() will change when we switch vfs to
>> use 64 bit time. This will prevent such errors from happening again.
>> But, this also means there is more code reordering for these modules
>> to get a reference to inode.
>>
>
> I took a look. it's quite inconvenience to use current_time(). I
> prefer to temporarily use current_kernel_time().

I've looked at the code some more and I think there is another angle to it:
In your test case, 'tar' calls into the utimes syscall (or a member of its
family), which sets the i_ctime field in the inode to the curren time
(using current_time()), then calls __ceph_setattr(), which creates
a mds client request, and ceph_mdsc_create_request() takes another
time stamp and stores it in r_stamp.

We then store the first timestamp (only) in the in-memory inode, and the
second time stamp in the request. Depending on the state of the inode,
we may also set the ctime to a third timestamp we again take using
current_time().

The mtime and atime from user space get passed correctly through
union ceph_mds_request_args->setattr and are kept in sync between
the in-memory inode and the persistent inode data, but the ctime
in the inode never makes it to the lower protocol levels and instead
we use the r_stamp field that got set a little earlier or a little later.

I believe the bug you see is the result of the two timestamps
currently being almost guaranteed to be different in the latest
kernels.
Changing r_stamp to use current_kernel_time() will make it the
same value most of the time (as it was before Deepa's patch),
but when the timer interrupt happens between the timestamps,
the two are still different, it's just much harder to hit.

I think the proper solution should be to change __ceph_setattr()
in a way that has req->r_stamp always synchronized with i_ctime.
If we copy i_ctime to r_stamp, that will also take care of the
future issues with the planned changes to current_time().

The part I don't understand is what else r_stamp (i.e. the time
stamp in ceph_msg_data with type==
CEPH_MSG_CLIENT_REQUEST) is used for, other than setting
ctime in CEPH_MDS_OP_SETATTR.

Will this be used to update the stored i_ctime for other operations
too? If so, we would need to synchronize it with the in-memory
i_ctime for all operations that do this.

Semi-related side note: I see that the granularity for ceph_timespec
is 1000 nanoseconds, so the value is always a multiple of 1000.
If the full 32-bit data gets stored, we could use this it to store the
epoch number in the future:

static inline void ceph_decode_timespec(struct timespec64 *ts,
                                        const struct ceph_timespec *tv)
{
        u32 ns_epoch = le32_to_cpu(tv->tv_nsec);
        u32 epoch = ns_epoch % 1000;

        /* tv_sec is traditionally interpreted as unsigned
         * with time ranges 1970-2106, we extend
         * it to 1970-138069 */
        ts->tv_sec = (u64)le32_to_cpu(tv->tv_sec) +
                            (u64)epoch << 32;
        ts->tv_nsec = ns_epoch - epoch;
}

      Arnd
Yan, Zheng June 2, 2017, 10:10 a.m. UTC | #9
On Fri, Jun 2, 2017 at 5:45 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Fri, Jun 2, 2017 at 4:09 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>> On Fri, Jun 2, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>> On Thu, Jun 1, 2017 at 5:36 PM, John Stultz <john.stultz@linaro.org> wrote:
>>>> On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>>> On Thu, Jun 1, 2017 at 6:22 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>>>>> On Thu, Jun 1, 2017 at 11:56 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>>>>> On Sat, Apr 8, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>>>>>
>>>>>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>>>>>> index 517838b..77204da 100644
>>>>>>>> --- a/drivers/block/rbd.c
>>>>>>>> +++ b/drivers/block/rbd.c
>>>>>>>> @@ -1922,7 +1922,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>>>>>>>>  {
>>>>>>>>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>>>>>>>>
>>>>>>>> -       osd_req->r_mtime = CURRENT_TIME;
>>>>>>>> +       ktime_get_real_ts(&osd_req->r_mtime);
>>>>>>>>         osd_req->r_data_offset = obj_request->offset;
>>>>>>>>  }
>>>>>>>>
>>>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>>>> index c681762..1d3fa90 100644
>>>>>>>> --- a/fs/ceph/mds_client.c
>>>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>>>> @@ -1666,6 +1666,7 @@ struct ceph_mds_request *
>>>>>>>>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>>>>>  {
>>>>>>>>         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
>>>>>>>> +       struct timespec ts;
>>>>>>>>
>>>>>>>>         if (!req)
>>>>>>>>                 return ERR_PTR(-ENOMEM);
>>>>>>>> @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>>>>>>>>         init_completion(&req->r_safe_completion);
>>>>>>>>         INIT_LIST_HEAD(&req->r_unsafe_item);
>>>>>>>>
>>>>>>>> -       req->r_stamp = current_fs_time(mdsc->fsc->sb);
>>>>>>>> +       ktime_get_real_ts(&ts);
>>>>>>>> +       req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
>>>>>>>
>>>>>>> This change causes our kernel_untar_tar test case to fail (inode's
>>>>>>> ctime goes back). The reason is that there is time drift between the
>>>>>>> time stamps got by  ktime_get_real_ts() and current_time(). We need to
>>>>>>> revert this change until current_time() uses ktime_get_real_ts()
>>>>>>> internally.
>>>>>>
>>>>>> Hmm, the change was not supposed to have a user-visible effect, so
>>>>>> something has gone wrong, but I don't immediately see how it
>>>>>> relates to what you observe.
>>>>>>
>>>>>> ktime_get_real_ts() and current_time() use the same time base, there
>>>>>> is no drift, but there is a difference in resolution, as the latter uses
>>>>>> the time stamp of the last jiffies update, which may be up to one jiffy
>>>>>> (10ms) behind the exact time we put in the request stamps here.
>>>>>>
>>>>>> Do you still see problems if you use current_kernel_time() instead of
>>>>>> ktime_get_real_ts()?
>>>>>
>>>>> The problem disappears after using current_kernel_time().
>>>>>
>>>>> https://github.com/ceph/ceph-client/commit/2e0f648da23167034a3cf1500bc90ec60aef2417
>>>>
>>>> From the commit above:
>>>> "It seems there is time drift between ktime_get_real_ts() and
>>>> current_kernel_time()"
>>>>
>>>> Its more of a granularity difference. current_kernel_time() returns
>>>> the cached time at the last tick, where as ktime_get_real_ts() reads
>>>> the clocksource hardware and returns the immediate time.
>>>>
>>>> Filesystems usually use the cached time (similar to
>>>> CLOCK_REALTIME_COARSE), for performance reasons, as touching the
>>>> clocksource takes time.
>>>
>>> Alternatively, it would be best for this code also to use current_time().
>>> I had suggested this in one of the previous versions of the patch.
>>> The implementation of current_time() will change when we switch vfs to
>>> use 64 bit time. This will prevent such errors from happening again.
>>> But, this also means there is more code reordering for these modules
>>> to get a reference to inode.
>>>
>>
>> I took a look. it's quite inconvenience to use current_time(). I
>> prefer to temporarily use current_kernel_time().
>
> I've looked at the code some more and I think there is another angle to it:
> In your test case, 'tar' calls into the utimes syscall (or a member of its
> family), which sets the i_ctime field in the inode to the curren time
> (using current_time()), then calls __ceph_setattr(), which creates
> a mds client request, and ceph_mdsc_create_request() takes another
> time stamp and stores it in r_stamp.
>
> We then store the first timestamp (only) in the in-memory inode, and the
> second time stamp in the request. Depending on the state of the inode,
> we may also set the ctime to a third timestamp we again take using
> current_time().
>
> The mtime and atime from user space get passed correctly through
> union ceph_mds_request_args->setattr and are kept in sync between
> the in-memory inode and the persistent inode data, but the ctime
> in the inode never makes it to the lower protocol levels and instead
> we use the r_stamp field that got set a little earlier or a little later.
>
> I believe the bug you see is the result of the two timestamps
> currently being almost guaranteed to be different in the latest
> kernels.
> Changing r_stamp to use current_kernel_time() will make it the
> same value most of the time (as it was before Deepa's patch),
> but when the timer interrupt happens between the timestamps,
> the two are still different, it's just much harder to hit.
>
> I think the proper solution should be to change __ceph_setattr()
> in a way that has req->r_stamp always synchronized with i_ctime.
> If we copy i_ctime to r_stamp, that will also take care of the
> future issues with the planned changes to current_time().
>
I already have a patch
https://github.com/ceph/ceph-client/commit/24f54cd18e195a002ee3d2ab50dbc952fd9f82af


> The part I don't understand is what else r_stamp (i.e. the time
> stamp in ceph_msg_data with type==
> CEPH_MSG_CLIENT_REQUEST) is used for, other than setting
> ctime in CEPH_MDS_OP_SETATTR.
>
> Will this be used to update the stored i_ctime for other operations
> too? If so, we would need to synchronize it with the in-memory
> i_ctime for all operations that do this.
>

yes,  mds uses it to update ctime of modified inodes. For example,
when handling mkdir, mds set ctime of both parent inode and new inode
to r_stamp.

Regards
Yan, Zheng

> Semi-related side note: I see that the granularity for ceph_timespec
> is 1000 nanoseconds, so the value is always a multiple of 1000.
> If the full 32-bit data gets stored, we could use this it to store the
> epoch number in the future:
>
> static inline void ceph_decode_timespec(struct timespec64 *ts,
>                                         const struct ceph_timespec *tv)
> {
>         u32 ns_epoch = le32_to_cpu(tv->tv_nsec);
>         u32 epoch = ns_epoch % 1000;
>
>         /* tv_sec is traditionally interpreted as unsigned
>          * with time ranges 1970-2106, we extend
>          * it to 1970-138069 */
>         ts->tv_sec = (u64)le32_to_cpu(tv->tv_sec) +
>                             (u64)epoch << 32;
>         ts->tv_nsec = ns_epoch - epoch;
> }
>
>       Arnd
Arnd Bergmann June 2, 2017, 10:51 a.m. UTC | #10
On Fri, Jun 2, 2017 at 12:10 PM, Yan, Zheng <ukernel@gmail.com> wrote:
> On Fri, Jun 2, 2017 at 5:45 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>> On Fri, Jun 2, 2017 at 4:09 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>> On Fri, Jun 2, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>>> On Thu, Jun 1, 2017 at 5:36 PM, John Stultz <john.stultz@linaro.org> wrote:
>>>>> On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>
>> I believe the bug you see is the result of the two timestamps
>> currently being almost guaranteed to be different in the latest
>> kernels.
>> Changing r_stamp to use current_kernel_time() will make it the
>> same value most of the time (as it was before Deepa's patch),
>> but when the timer interrupt happens between the timestamps,
>> the two are still different, it's just much harder to hit.
>>
>> I think the proper solution should be to change __ceph_setattr()
>> in a way that has req->r_stamp always synchronized with i_ctime.
>> If we copy i_ctime to r_stamp, that will also take care of the
>> future issues with the planned changes to current_time().
>>
> I already have a patch
> https://github.com/ceph/ceph-client/commit/24f54cd18e195a002ee3d2ab50dbc952fd9f82af

Looks good to me. In case anyone cares:
Acked-by: Arnd Bergmann <arnd@arndb.de>

>> The part I don't understand is what else r_stamp (i.e. the time
>> stamp in ceph_msg_data with type==
>> CEPH_MSG_CLIENT_REQUEST) is used for, other than setting
>> ctime in CEPH_MDS_OP_SETATTR.
>>
>> Will this be used to update the stored i_ctime for other operations
>> too? If so, we would need to synchronize it with the in-memory
>> i_ctime for all operations that do this.
>>
>
> yes,  mds uses it to update ctime of modified inodes. For example,
> when handling mkdir, mds set ctime of both parent inode and new inode
> to r_stamp.

I see, so we may have a variation of that problem there as well: From
my reading of the code, the child inode is not in memory yet, so
that seems fine, but I could not find where the parent in-memory inode
i_ctime is updated in ceph, but it is most likely not the same as
req->r_stamp (assuming it gets updated at all).

Would it make sense require all callers of ceph_mdsc_do_request()
to update r_stamp at the same time as i_ctime to keep them in sync?

        Arnd
Yan, Zheng June 2, 2017, 11:18 a.m. UTC | #11
On Fri, Jun 2, 2017 at 6:51 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Fri, Jun 2, 2017 at 12:10 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>> On Fri, Jun 2, 2017 at 5:45 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>> On Fri, Jun 2, 2017 at 4:09 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>> On Fri, Jun 2, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>>>> On Thu, Jun 1, 2017 at 5:36 PM, John Stultz <john.stultz@linaro.org> wrote:
>>>>>> On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>
>>> I believe the bug you see is the result of the two timestamps
>>> currently being almost guaranteed to be different in the latest
>>> kernels.
>>> Changing r_stamp to use current_kernel_time() will make it the
>>> same value most of the time (as it was before Deepa's patch),
>>> but when the timer interrupt happens between the timestamps,
>>> the two are still different, it's just much harder to hit.
>>>
>>> I think the proper solution should be to change __ceph_setattr()
>>> in a way that has req->r_stamp always synchronized with i_ctime.
>>> If we copy i_ctime to r_stamp, that will also take care of the
>>> future issues with the planned changes to current_time().
>>>
>> I already have a patch
>> https://github.com/ceph/ceph-client/commit/24f54cd18e195a002ee3d2ab50dbc952fd9f82af
>
> Looks good to me. In case anyone cares:
> Acked-by: Arnd Bergmann <arnd@arndb.de>
>
>>> The part I don't understand is what else r_stamp (i.e. the time
>>> stamp in ceph_msg_data with type==
>>> CEPH_MSG_CLIENT_REQUEST) is used for, other than setting
>>> ctime in CEPH_MDS_OP_SETATTR.
>>>
>>> Will this be used to update the stored i_ctime for other operations
>>> too? If so, we would need to synchronize it with the in-memory
>>> i_ctime for all operations that do this.
>>>
>>
>> yes,  mds uses it to update ctime of modified inodes. For example,
>> when handling mkdir, mds set ctime of both parent inode and new inode
>> to r_stamp.
>
> I see, so we may have a variation of that problem there as well: From
> my reading of the code, the child inode is not in memory yet, so
> that seems fine, but I could not find where the parent in-memory inode
> i_ctime is updated in ceph, but it is most likely not the same as
> req->r_stamp (assuming it gets updated at all).

i_ctime is updated when handling request reply, by ceph_fill_file_time().
__ceph_setattr() can update the in-memory inode's ctime after request
reply is received. The difference between ktime_get_real_ts() and
current_time() can be larger than round-trip time of request. So it's
still possible that __ceph_setattr() make ctime go back.

Regards
Yan, Zheng


>
> Would it make sense require all callers of ceph_mdsc_do_request()
> to update r_stamp at the same time as i_ctime to keep them in sync?
>
>         Arnd
Arnd Bergmann June 2, 2017, 11:33 a.m. UTC | #12
On Fri, Jun 2, 2017 at 1:18 PM, Yan, Zheng <ukernel@gmail.com> wrote:
> On Fri, Jun 2, 2017 at 6:51 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>> On Fri, Jun 2, 2017 at 12:10 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>> On Fri, Jun 2, 2017 at 5:45 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>>> On Fri, Jun 2, 2017 at 4:09 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>>> On Fri, Jun 2, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>>>>> On Thu, Jun 1, 2017 at 5:36 PM, John Stultz <john.stultz@linaro.org> wrote:
>>>>>>> On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>>
>>>> I believe the bug you see is the result of the two timestamps
>>>> currently being almost guaranteed to be different in the latest
>>>> kernels.
>>>> Changing r_stamp to use current_kernel_time() will make it the
>>>> same value most of the time (as it was before Deepa's patch),
>>>> but when the timer interrupt happens between the timestamps,
>>>> the two are still different, it's just much harder to hit.
>>>>
>>>> I think the proper solution should be to change __ceph_setattr()
>>>> in a way that has req->r_stamp always synchronized with i_ctime.
>>>> If we copy i_ctime to r_stamp, that will also take care of the
>>>> future issues with the planned changes to current_time().
>>>>
>>> I already have a patch
>>> https://github.com/ceph/ceph-client/commit/24f54cd18e195a002ee3d2ab50dbc952fd9f82af
>>
>> Looks good to me. In case anyone cares:
>> Acked-by: Arnd Bergmann <arnd@arndb.de>
>>
>>>> The part I don't understand is what else r_stamp (i.e. the time
>>>> stamp in ceph_msg_data with type==
>>>> CEPH_MSG_CLIENT_REQUEST) is used for, other than setting
>>>> ctime in CEPH_MDS_OP_SETATTR.
>>>>
>>>> Will this be used to update the stored i_ctime for other operations
>>>> too? If so, we would need to synchronize it with the in-memory
>>>> i_ctime for all operations that do this.
>>>>
>>>
>>> yes,  mds uses it to update ctime of modified inodes. For example,
>>> when handling mkdir, mds set ctime of both parent inode and new inode
>>> to r_stamp.
>>
>> I see, so we may have a variation of that problem there as well: From
>> my reading of the code, the child inode is not in memory yet, so
>> that seems fine, but I could not find where the parent in-memory inode
>> i_ctime is updated in ceph, but it is most likely not the same as
>> req->r_stamp (assuming it gets updated at all).
>
> i_ctime is updated when handling request reply, by ceph_fill_file_time().
> __ceph_setattr() can update the in-memory inode's ctime after request
> reply is received. The difference between ktime_get_real_ts() and
> current_time() can be larger than round-trip time of request. So it's
> still possible that __ceph_setattr() make ctime go back.

But the __ceph_setattr() problem should be fixed by your patch, right?

What I meant is another related problem in ceph_mkdir() where the
i_ctime field of the parent inode is different between the persistent
representation in the mds and the in-memory representation.

    Arnd

>> Would it make sense require all callers of ceph_mdsc_do_request()
>> to update r_stamp at the same time as i_ctime to keep them in sync?
Yan, Zheng June 2, 2017, 12:18 p.m. UTC | #13
On Fri, Jun 2, 2017 at 7:33 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Fri, Jun 2, 2017 at 1:18 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>> On Fri, Jun 2, 2017 at 6:51 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>> On Fri, Jun 2, 2017 at 12:10 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>> On Fri, Jun 2, 2017 at 5:45 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>>>> On Fri, Jun 2, 2017 at 4:09 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>>>> On Fri, Jun 2, 2017 at 8:57 AM, Deepa Dinamani <deepa.kernel@gmail.com> wrote:
>>>>>>> On Thu, Jun 1, 2017 at 5:36 PM, John Stultz <john.stultz@linaro.org> wrote:
>>>>>>>> On Thu, Jun 1, 2017 at 5:26 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>>>
>>>>> I believe the bug you see is the result of the two timestamps
>>>>> currently being almost guaranteed to be different in the latest
>>>>> kernels.
>>>>> Changing r_stamp to use current_kernel_time() will make it the
>>>>> same value most of the time (as it was before Deepa's patch),
>>>>> but when the timer interrupt happens between the timestamps,
>>>>> the two are still different, it's just much harder to hit.
>>>>>
>>>>> I think the proper solution should be to change __ceph_setattr()
>>>>> in a way that has req->r_stamp always synchronized with i_ctime.
>>>>> If we copy i_ctime to r_stamp, that will also take care of the
>>>>> future issues with the planned changes to current_time().
>>>>>
>>>> I already have a patch
>>>> https://github.com/ceph/ceph-client/commit/24f54cd18e195a002ee3d2ab50dbc952fd9f82af
>>>
>>> Looks good to me. In case anyone cares:
>>> Acked-by: Arnd Bergmann <arnd@arndb.de>
>>>
>>>>> The part I don't understand is what else r_stamp (i.e. the time
>>>>> stamp in ceph_msg_data with type==
>>>>> CEPH_MSG_CLIENT_REQUEST) is used for, other than setting
>>>>> ctime in CEPH_MDS_OP_SETATTR.
>>>>>
>>>>> Will this be used to update the stored i_ctime for other operations
>>>>> too? If so, we would need to synchronize it with the in-memory
>>>>> i_ctime for all operations that do this.
>>>>>
>>>>
>>>> yes,  mds uses it to update ctime of modified inodes. For example,
>>>> when handling mkdir, mds set ctime of both parent inode and new inode
>>>> to r_stamp.
>>>
>>> I see, so we may have a variation of that problem there as well: From
>>> my reading of the code, the child inode is not in memory yet, so
>>> that seems fine, but I could not find where the parent in-memory inode
>>> i_ctime is updated in ceph, but it is most likely not the same as
>>> req->r_stamp (assuming it gets updated at all).
>>
>> i_ctime is updated when handling request reply, by ceph_fill_file_time().
>> __ceph_setattr() can update the in-memory inode's ctime after request
>> reply is received. The difference between ktime_get_real_ts() and
>> current_time() can be larger than round-trip time of request. So it's
>> still possible that __ceph_setattr() make ctime go back.
>
> But the __ceph_setattr() problem should be fixed by your patch, right?
>
> What I meant is another related problem in ceph_mkdir() where the
> i_ctime field of the parent inode is different between the persistent
> representation in the mds and the in-memory representation.
>

I don't see any problem in mkdir case. Parent inode's i_ctime in mds is set to
r_stamp. When client receives request reply, it set its in-memory inode's ctime
to the same time stamp.

Regards
Yan, Zheng

>     Arnd
>
>>> Would it make sense require all callers of ceph_mdsc_do_request()
>>> to update r_stamp at the same time as i_ctime to keep them in sync?
Arnd Bergmann June 2, 2017, 2:18 p.m. UTC | #14
On Fri, Jun 2, 2017 at 2:18 PM, Yan, Zheng <ukernel@gmail.com> wrote:
> On Fri, Jun 2, 2017 at 7:33 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>> On Fri, Jun 2, 2017 at 1:18 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>> What I meant is another related problem in ceph_mkdir() where the
>> i_ctime field of the parent inode is different between the persistent
>> representation in the mds and the in-memory representation.
>>
>
> I don't see any problem in mkdir case. Parent inode's i_ctime in mds is set to
> r_stamp. When client receives request reply, it set its in-memory inode's ctime
> to the same time stamp.

Ok, I see it now, thanks for the clarification. Most other file systems do this
the other way round and update all fields in the in-memory inode structure
first and then write that to persistent storage, so I was getting confused about
the order of events here.

If I understand it all right, we have three different behaviors in ceph now,
though the differences are very minor and probably don't ever matter:

- in setattr(), we update ctime in the in-memory inode first and then send
  the same time to the mds, and expect to set it again when the reply comes.

- in ceph_write_iter write() and mmap/page_mkwrite(), we call
  file_update_time() to set i_mtime and i_ctime to the same
  timestamp first once a write is observed by the fs and then take
  two other timestamps that we send to the mds, and update the
  in-memory inode a second time when the reply comes. ctime
  is never older than mtime here, as far as I can tell, but it may
  be newer when the timer interrupt happens between taking the
  two stamps.

- in all other calls, we only update the inode (and/or parent inode)
  after the reply arrives.

       Arnd
Yan, Zheng June 4, 2017, 7:54 a.m. UTC | #15
On Fri, Jun 2, 2017 at 10:18 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Fri, Jun 2, 2017 at 2:18 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>> On Fri, Jun 2, 2017 at 7:33 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>> On Fri, Jun 2, 2017 at 1:18 PM, Yan, Zheng <ukernel@gmail.com> wrote:
>>> What I meant is another related problem in ceph_mkdir() where the
>>> i_ctime field of the parent inode is different between the persistent
>>> representation in the mds and the in-memory representation.
>>>
>>
>> I don't see any problem in mkdir case. Parent inode's i_ctime in mds is set to
>> r_stamp. When client receives request reply, it set its in-memory inode's ctime
>> to the same time stamp.
>
> Ok, I see it now, thanks for the clarification. Most other file systems do this
> the other way round and update all fields in the in-memory inode structure
> first and then write that to persistent storage, so I was getting confused about
> the order of events here.
>
> If I understand it all right, we have three different behaviors in ceph now,
> though the differences are very minor and probably don't ever matter:
>
> - in setattr(), we update ctime in the in-memory inode first and then send
>   the same time to the mds, and expect to set it again when the reply comes.
>
> - in ceph_write_iter write() and mmap/page_mkwrite(), we call
>   file_update_time() to set i_mtime and i_ctime to the same
>   timestamp first once a write is observed by the fs and then take
>   two other timestamps that we send to the mds, and update the
>   in-memory inode a second time when the reply comes. ctime
>   is never older than mtime here, as far as I can tell, but it may
>   be newer when the timer interrupt happens between taking the
>   two stamps.

We don't use request to send i_mtime/i_ctime to mds in this case.
Instead, we use cap flush message. i_mtime/i_ctime are directly
encoded in cap flush message. When mds receives the cap flush message,
it writes i_mtime/i_ctime to persistent storage and sends a cap flush
ack message to client. (when client receives the cap flush ack
message, it does not update i_mtime/i_ctime). There is no issue as you
described.

>
> - in all other calls, we only update the inode (and/or parent inode)
>   after the reply arrives.

There are two cases. 1. Client updates in-memory inode's ctime, it
sends the new ctime to mds through cap flush message. 2. client set
mds request's r_stamp and send the request to mds. MDS updates
relavent inodes' ctime and sends reply to client. Client updates
in-memory inodes' ctime according to the reply.

Regards
Yan, Zheng

>
>        Arnd
diff mbox

Patch

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 517838b..77204da 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1922,7 +1922,7 @@  static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
 
-	osd_req->r_mtime = CURRENT_TIME;
+	ktime_get_real_ts(&osd_req->r_mtime);
 	osd_req->r_data_offset = obj_request->offset;
 }
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c681762..1d3fa90 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1666,6 +1666,7 @@  struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 {
 	struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+	struct timespec ts;
 
 	if (!req)
 		return ERR_PTR(-ENOMEM);
@@ -1684,7 +1685,8 @@  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	init_completion(&req->r_safe_completion);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 
-	req->r_stamp = current_fs_time(mdsc->fsc->sb);
+	ktime_get_real_ts(&ts);
+	req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
 
 	req->r_op = op;
 	req->r_direct_mode = mode;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index f76bb33..5766a6c 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1386,8 +1386,9 @@  static void prepare_write_keepalive(struct ceph_connection *con)
 	dout("prepare_write_keepalive %p\n", con);
 	con_out_kvec_reset(con);
 	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
-		struct timespec now = CURRENT_TIME;
+		struct timespec now;
 
+		ktime_get_real_ts(&now);
 		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
 		ceph_encode_timespec(&con->out_temp_keepalive2, &now);
 		con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
@@ -3176,8 +3177,9 @@  bool ceph_con_keepalive_expired(struct ceph_connection *con,
 {
 	if (interval > 0 &&
 	    (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
-		struct timespec now = CURRENT_TIME;
+		struct timespec now;
 		struct timespec ts;
+		ktime_get_real_ts(&now);
 		jiffies_to_timespec(interval, &ts);
 		ts = timespec_add(con->last_keepalive_ack, ts);
 		return timespec_compare(&now, &ts) >= 0;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e15ea9e..242d7c0 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3574,7 +3574,7 @@  ceph_osdc_watch(struct ceph_osd_client *osdc,
 	ceph_oid_copy(&lreq->t.base_oid, oid);
 	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
 	lreq->t.flags = CEPH_OSD_FLAG_WRITE;
-	lreq->mtime = CURRENT_TIME;
+	ktime_get_real_ts(&lreq->mtime);
 
 	lreq->reg_req = alloc_linger_request(lreq);
 	if (!lreq->reg_req) {
@@ -3632,7 +3632,7 @@  int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
 	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
 	req->r_flags = CEPH_OSD_FLAG_WRITE;
-	req->r_mtime = CURRENT_TIME;
+	ktime_get_real_ts(&req->r_mtime);
 	osd_req_op_watch_init(req, 0, lreq->linger_id,
 			      CEPH_OSD_WATCH_OP_UNWATCH);