Message ID | 20241003123933.2589036-3-vadfed@meta.com |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Series | eth: fbnic: add timestamping support |
On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
> +/* FBNIC timing & PTP implementation
> + * Datapath uses truncated 40b timestamps for scheduling and event reporting.
> + * We need to promote those to full 64b, hence we periodically cache the top
> + * 32bit of the HW time counter. Since this makes our time reporting non-atomic
> + * we leave the HW clock free running and adjust time offsets in SW as needed.
> + * Time offset is 64bit - we need a seq counter for 32bit machines.
> + * Time offset and the cache of top bits are independent so we don't need
> + * a coherent snapshot of both - READ_ONCE()/WRITE_ONCE() + writer side lock
> + * are enough.
> + */
> +

If you're going to implement adjustments only in software anyway, can
you use a timecounter+cyclecounter instead of re-implementing?

> +/* Period of refresh of top bits of timestamp, give ourselves an 8x margin.
> + * This should translate to once a minute.
> + * The use of nsecs_to_jiffies() should be safe for a <=40b nsec value.
> + */
> +#define FBNIC_TS_HIGH_REFRESH_JIF	nsecs_to_jiffies((1ULL << 40) / 16)
> +
> +static struct fbnic_dev *fbnic_from_ptp_info(struct ptp_clock_info *ptp)
> +{
> +	return container_of(ptp, struct fbnic_dev, ptp_info);
> +}
> +
> +/* This function is "slow" because we could try guessing which high part
> + * is correct based on low instead of re-reading, and skip reading @hi
> + * twice altogether if @lo is far enough from 0.
> + */
> +static u64 __fbnic_time_get_slow(struct fbnic_dev *fbd)
> +{
> +	u32 hi, lo;
> +
> +	lockdep_assert_held(&fbd->time_lock);
> +
> +	do {
> +		hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
> +		lo = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
> +	} while (hi != fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI));
> +

How long does it take hi to overflow? You may be able to get away
without looping.

I think another way to implement this is to read lo, then hi, then lo
again, and if lo2 is smaller than lo, you know hi overflowed and you
can re-read hi.

> +static int fbnic_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
> +{
> +	int scale = 16; /* scaled_ppm has 16 fractional places */
> +	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
> +	u64 scaled_delta, dclk_period;
> +	unsigned long flags;
> +	s64 delta;
> +	int sgn;
> +
> +	sgn = scaled_ppm >= 0 ? 1 : -1;
> +	scaled_ppm *= sgn;
> +
> +	/* d_clock is 600 MHz; which in Q16.32 fixed point ns is: */
> +	dclk_period = (((u64)1000000000) << 32) / FBNIC_CLOCK_FREQ;
> +
> +	while (scaled_ppm > U64_MAX / dclk_period) {
> +		scaled_ppm >>= 1;
> +		scale--;
> +	}
> +
> +	scaled_delta = (u64)scaled_ppm * dclk_period;
> +	delta = div_u64(scaled_delta, 1000 * 1000) >> scale;
> +	delta *= sgn;

Please use adjust_by_scaled_ppm or diff_by_scaled_ppm. It makes use of
mul_u64_u64_div_u64 where feasible to do the temporary multiplication
step as 128 bit arithmetic.

> +
> +	spin_lock_irqsave(&fbd->time_lock, flags);
> +	__fbnic_time_set_addend(fbd, dclk_period + delta);
> +	fbnic_wr32(fbd, FBNIC_PTP_ADJUST, FBNIC_PTP_ADJUST_ADDEND_SET);
> +
> +	/* Flush, make sure FBNIC_PTP_ADD_VAL_* is stable for at least 4 clks */
> +	fbnic_rd32(fbd, FBNIC_PTP_SPARE);
> +	spin_unlock_irqrestore(&fbd->time_lock, flags);
> +
> +	return fbnic_present(fbd) ? 0 : -EIO;
> +}
> +
> +static int fbnic_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
> +{
> +	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
> +	struct fbnic_net *fbn;
> +	unsigned long flags;
> +
> +	fbn = netdev_priv(fbd->netdev);
> +
> +	spin_lock_irqsave(&fbd->time_lock, flags);
> +	u64_stats_update_begin(&fbn->time_seq);
> +	WRITE_ONCE(fbn->time_offset, READ_ONCE(fbn->time_offset) + delta);
> +	u64_stats_update_end(&fbn->time_seq);
> +	spin_unlock_irqrestore(&fbd->time_lock, flags);
> +
> +	return 0;
> +}
> +
> +static int
> +fbnic_ptp_gettimex64(struct ptp_clock_info *ptp, struct timespec64 *ts,
> +		     struct ptp_system_timestamp *sts)
> +{
> +	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
> +	struct fbnic_net *fbn;
> +	unsigned long flags;
> +	u64 time_ns;
> +	u32 hi, lo;
> +
> +	fbn = netdev_priv(fbd->netdev);
> +
> +	spin_lock_irqsave(&fbd->time_lock, flags);
> +
> +	do {
> +		hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
> +		ptp_read_system_prets(sts);
> +		lo = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
> +		ptp_read_system_postts(sts);
> +		/* Similarly to the comment above __fbnic_time_get_slow()
> +		 * - this can be optimized if needed.
> +		 */
> +	} while (hi != fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI));
> +
> +	time_ns = ((u64)hi << 32 | lo) + fbn->time_offset;
> +	spin_unlock_irqrestore(&fbd->time_lock, flags);
> +
> +	if (!fbnic_present(fbd))
> +		return -EIO;
> +
> +	*ts = ns_to_timespec64(time_ns);
> +
> +	return 0;
> +}
> +
> +static int
> +fbnic_ptp_settime64(struct ptp_clock_info *ptp, const struct timespec64 *ts)
> +{
> +	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
> +	struct fbnic_net *fbn;
> +	unsigned long flags;
> +	u64 dev_ns, host_ns;
> +	int ret;
> +
> +	fbn = netdev_priv(fbd->netdev);
> +
> +	host_ns = timespec64_to_ns(ts);
> +
> +	spin_lock_irqsave(&fbd->time_lock, flags);
> +
> +	dev_ns = __fbnic_time_get_slow(fbd);
> +
> +	if (fbnic_present(fbd)) {
> +		u64_stats_update_begin(&fbn->time_seq);
> +		WRITE_ONCE(fbn->time_offset, host_ns - dev_ns);
> +		u64_stats_update_end(&fbn->time_seq);
> +		ret = 0;
> +	} else {
> +		ret = -EIO;
> +	}
> +	spin_unlock_irqrestore(&fbd->time_lock, flags);
> +
> +	return ret;
> +}
> +

Since all your operations are using a software offset and leaving the
timer free-running, I think this really would make more sense using a
timecounter and cyclecounter.

> +static const struct ptp_clock_info fbnic_ptp_info = {
> +	.owner			= THIS_MODULE,
> +	/* 1,000,000,000 - 1 PPB to ensure increment is positive
> +	 * after max negative adjustment.
> +	 */
> +	.max_adj		= 999999999,
> +	.do_aux_work		= fbnic_ptp_do_aux_work,
> +	.adjfine		= fbnic_ptp_adjfine,
> +	.adjtime		= fbnic_ptp_adjtime,
> +	.gettimex64		= fbnic_ptp_gettimex64,
> +	.settime64		= fbnic_ptp_settime64,
> +};
> +
> +static void fbnic_ptp_reset(struct fbnic_dev *fbd)
> +{
> +	struct fbnic_net *fbn = netdev_priv(fbd->netdev);
> +	u64 dclk_period;
> +
> +	fbnic_wr32(fbd, FBNIC_PTP_CTRL,
> +		   FBNIC_PTP_CTRL_EN |
> +		   FIELD_PREP(FBNIC_PTP_CTRL_TICK_IVAL, 1));
> +
> +	/* d_clock is 600 MHz; which in Q16.32 fixed point ns is: */
> +	dclk_period = (((u64)1000000000) << 32) / FBNIC_CLOCK_FREQ;
> +
> +	__fbnic_time_set_addend(fbd, dclk_period);
> +
> +	fbnic_wr32(fbd, FBNIC_PTP_INIT_HI, 0);
> +	fbnic_wr32(fbd, FBNIC_PTP_INIT_LO, 0);
> +
> +	fbnic_wr32(fbd, FBNIC_PTP_ADJUST, FBNIC_PTP_ADJUST_INIT);
> +
> +	fbnic_wr32(fbd, FBNIC_PTP_CTRL,
> +		   FBNIC_PTP_CTRL_EN |
> +		   FBNIC_PTP_CTRL_TQS_OUT_EN |
> +		   FIELD_PREP(FBNIC_PTP_CTRL_MAC_OUT_IVAL, 3) |
> +		   FIELD_PREP(FBNIC_PTP_CTRL_TICK_IVAL, 1));
> +
> +	fbnic_rd32(fbd, FBNIC_PTP_SPARE);
> +
> +	fbn->time_offset = 0;
> +	fbn->time_high = 0;

Not entirely sure how it works for you, but we found that most users
expect to minimize the time loss or changes due to a reset, and we
re-apply the last known configuration during a reset instead of
letting the clock reset back to base state.
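For reference, the adjfine() conversion being pointed at above would
collapse to roughly the sketch below when built on adjust_by_scaled_ppm()
(an illustrative rewrite reusing this patch's helpers, not code from the
thread): the helper scales the Q16.32 base period by scaled_ppm and
handles the sign, the 16 fractional bits, and intermediate overflow via
mul_u64_u64_div_u64(), so the manual U64_MAX guard loop goes away.

static int fbnic_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
{
	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
	u64 dclk_period, addend;
	unsigned long flags;

	/* d_clock is 600 MHz; which in Q16.32 fixed point ns is: */
	dclk_period = (((u64)1000000000) << 32) / FBNIC_CLOCK_FREQ;

	/* Scale the Q16.32 period by scaled_ppm in one step */
	addend = adjust_by_scaled_ppm(dclk_period, scaled_ppm);

	spin_lock_irqsave(&fbd->time_lock, flags);
	__fbnic_time_set_addend(fbd, addend);
	fbnic_wr32(fbd, FBNIC_PTP_ADJUST, FBNIC_PTP_ADJUST_ADDEND_SET);

	/* Flush, make sure FBNIC_PTP_ADD_VAL_* is stable for at least 4 clks */
	fbnic_rd32(fbd, FBNIC_PTP_SPARE);
	spin_unlock_irqrestore(&fbd->time_lock, flags);

	return fbnic_present(fbd) ? 0 : -EIO;
}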
On 05/10/2024 00:05, Jacob Keller wrote:
> 
> 
> On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
>> +/* FBNIC timing & PTP implementation
>> + * Datapath uses truncated 40b timestamps for scheduling and event reporting.
>> + * We need to promote those to full 64b, hence we periodically cache the top
>> + * 32bit of the HW time counter. Since this makes our time reporting non-atomic
>> + * we leave the HW clock free running and adjust time offsets in SW as needed.
>> + * Time offset is 64bit - we need a seq counter for 32bit machines.
>> + * Time offset and the cache of top bits are independent so we don't need
>> + * a coherent snapshot of both - READ_ONCE()/WRITE_ONCE() + writer side lock
>> + * are enough.
>> + */
>> +
> 
> If you're going to implement adjustments only in software anyway, can
> you use a timecounter+cyclecounter instead of re-implementing?

Thanks for pointing this out, I'll make it with timecounter/cyclecounter

> 
>> +/* Period of refresh of top bits of timestamp, give ourselves an 8x margin.
>> + * This should translate to once a minute.
>> + * The use of nsecs_to_jiffies() should be safe for a <=40b nsec value.
>> + */
>> +#define FBNIC_TS_HIGH_REFRESH_JIF	nsecs_to_jiffies((1ULL << 40) / 16)
>> +
>> +static struct fbnic_dev *fbnic_from_ptp_info(struct ptp_clock_info *ptp)
>> +{
>> +	return container_of(ptp, struct fbnic_dev, ptp_info);
>> +}
>> +
>> +/* This function is "slow" because we could try guessing which high part
>> + * is correct based on low instead of re-reading, and skip reading @hi
>> + * twice altogether if @lo is far enough from 0.
>> + */
>> +static u64 __fbnic_time_get_slow(struct fbnic_dev *fbd)
>> +{
>> +	u32 hi, lo;
>> +
>> +	lockdep_assert_held(&fbd->time_lock);
>> +
>> +	do {
>> +		hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
>> +		lo = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
>> +	} while (hi != fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI));
>> +
> 
> How long does it take hi to overflow? You may be able to get away
> without looping.

According to the comment above, it may take up to 8 minutes to
overflow, but the updates to the cache should be done every minute.
We do not expect this cycle to happen often.

> I think another way to implement this is to read lo, then hi, then lo
> again, and if lo2 is smaller than lo, you know hi overflowed and you
> can re-read hi.

That's an option too, I'll think of it, thanks!

> 
>> +static int fbnic_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
>> +{
>> +	int scale = 16; /* scaled_ppm has 16 fractional places */
>> +	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
>> +	u64 scaled_delta, dclk_period;
>> +	unsigned long flags;
>> +	s64 delta;
>> +	int sgn;
>> +
>> +	sgn = scaled_ppm >= 0 ? 1 : -1;
>> +	scaled_ppm *= sgn;
>> +
>> +	/* d_clock is 600 MHz; which in Q16.32 fixed point ns is: */
>> +	dclk_period = (((u64)1000000000) << 32) / FBNIC_CLOCK_FREQ;
>> +
>> +	while (scaled_ppm > U64_MAX / dclk_period) {
>> +		scaled_ppm >>= 1;
>> +		scale--;
>> +	}
>> +
>> +	scaled_delta = (u64)scaled_ppm * dclk_period;
>> +	delta = div_u64(scaled_delta, 1000 * 1000) >> scale;
>> +	delta *= sgn;
> 
> 
> Please use adjust_by_scaled_ppm or diff_by_scaled_ppm. It makes use of
> mul_u64_u64_div_u64 where feasible to do the temporary multiplication
> step as 128 bit arithmetic.
> 

Got it, will use it in the next version.

>> +
>> +	spin_lock_irqsave(&fbd->time_lock, flags);
>> +	__fbnic_time_set_addend(fbd, dclk_period + delta);
>> +	fbnic_wr32(fbd, FBNIC_PTP_ADJUST, FBNIC_PTP_ADJUST_ADDEND_SET);
>> +
>> +	/* Flush, make sure FBNIC_PTP_ADD_VAL_* is stable for at least 4 clks */
>> +	fbnic_rd32(fbd, FBNIC_PTP_SPARE);
>> +	spin_unlock_irqrestore(&fbd->time_lock, flags);
>> +
>> +	return fbnic_present(fbd) ? 0 : -EIO;
>> +}
>> +
>> +static int fbnic_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
>> +{
>> +	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
>> +	struct fbnic_net *fbn;
>> +	unsigned long flags;
>> +
>> +	fbn = netdev_priv(fbd->netdev);
>> +
>> +	spin_lock_irqsave(&fbd->time_lock, flags);
>> +	u64_stats_update_begin(&fbn->time_seq);
>> +	WRITE_ONCE(fbn->time_offset, READ_ONCE(fbn->time_offset) + delta);
>> +	u64_stats_update_end(&fbn->time_seq);
>> +	spin_unlock_irqrestore(&fbd->time_lock, flags);
>> +
>> +	return 0;
>> +}
>> +
>> +static int
>> +fbnic_ptp_gettimex64(struct ptp_clock_info *ptp, struct timespec64 *ts,
>> +		     struct ptp_system_timestamp *sts)
>> +{
>> +	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
>> +	struct fbnic_net *fbn;
>> +	unsigned long flags;
>> +	u64 time_ns;
>> +	u32 hi, lo;
>> +
>> +	fbn = netdev_priv(fbd->netdev);
>> +
>> +	spin_lock_irqsave(&fbd->time_lock, flags);
>> +
>> +	do {
>> +		hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
>> +		ptp_read_system_prets(sts);
>> +		lo = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
>> +		ptp_read_system_postts(sts);
>> +		/* Similarly to the comment above __fbnic_time_get_slow()
>> +		 * - this can be optimized if needed.
>> +		 */
>> +	} while (hi != fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI));
>> +
>> +	time_ns = ((u64)hi << 32 | lo) + fbn->time_offset;
>> +	spin_unlock_irqrestore(&fbd->time_lock, flags);
>> +
>> +	if (!fbnic_present(fbd))
>> +		return -EIO;
>> +
>> +	*ts = ns_to_timespec64(time_ns);
>> +
>> +	return 0;
>> +}
>> +
>> +static int
>> +fbnic_ptp_settime64(struct ptp_clock_info *ptp, const struct timespec64 *ts)
>> +{
>> +	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
>> +	struct fbnic_net *fbn;
>> +	unsigned long flags;
>> +	u64 dev_ns, host_ns;
>> +	int ret;
>> +
>> +	fbn = netdev_priv(fbd->netdev);
>> +
>> +	host_ns = timespec64_to_ns(ts);
>> +
>> +	spin_lock_irqsave(&fbd->time_lock, flags);
>> +
>> +	dev_ns = __fbnic_time_get_slow(fbd);
>> +
>> +	if (fbnic_present(fbd)) {
>> +		u64_stats_update_begin(&fbn->time_seq);
>> +		WRITE_ONCE(fbn->time_offset, host_ns - dev_ns);
>> +		u64_stats_update_end(&fbn->time_seq);
>> +		ret = 0;
>> +	} else {
>> +		ret = -EIO;
>> +	}
>> +	spin_unlock_irqrestore(&fbd->time_lock, flags);
>> +
>> +	return ret;
>> +}
>> +
> 
> Since all your operations are using a software offset and leaving the
> timer free-running, I think this really would make more sense using a
> timecounter and cyclecounter.
> 

Yeah, looks like it's better to use common interfaces.

>> +static const struct ptp_clock_info fbnic_ptp_info = {
>> +	.owner			= THIS_MODULE,
>> +	/* 1,000,000,000 - 1 PPB to ensure increment is positive
>> +	 * after max negative adjustment.
>> +	 */
>> +	.max_adj		= 999999999,
>> +	.do_aux_work		= fbnic_ptp_do_aux_work,
>> +	.adjfine		= fbnic_ptp_adjfine,
>> +	.adjtime		= fbnic_ptp_adjtime,
>> +	.gettimex64		= fbnic_ptp_gettimex64,
>> +	.settime64		= fbnic_ptp_settime64,
>> +};
>> +
>> +static void fbnic_ptp_reset(struct fbnic_dev *fbd)
>> +{
>> +	struct fbnic_net *fbn = netdev_priv(fbd->netdev);
>> +	u64 dclk_period;
>> +
>> +	fbnic_wr32(fbd, FBNIC_PTP_CTRL,
>> +		   FBNIC_PTP_CTRL_EN |
>> +		   FIELD_PREP(FBNIC_PTP_CTRL_TICK_IVAL, 1));
>> +
>> +	/* d_clock is 600 MHz; which in Q16.32 fixed point ns is: */
>> +	dclk_period = (((u64)1000000000) << 32) / FBNIC_CLOCK_FREQ;
>> +
>> +	__fbnic_time_set_addend(fbd, dclk_period);
>> +
>> +	fbnic_wr32(fbd, FBNIC_PTP_INIT_HI, 0);
>> +	fbnic_wr32(fbd, FBNIC_PTP_INIT_LO, 0);
>> +
>> +	fbnic_wr32(fbd, FBNIC_PTP_ADJUST, FBNIC_PTP_ADJUST_INIT);
>> +
>> +	fbnic_wr32(fbd, FBNIC_PTP_CTRL,
>> +		   FBNIC_PTP_CTRL_EN |
>> +		   FBNIC_PTP_CTRL_TQS_OUT_EN |
>> +		   FIELD_PREP(FBNIC_PTP_CTRL_MAC_OUT_IVAL, 3) |
>> +		   FIELD_PREP(FBNIC_PTP_CTRL_TICK_IVAL, 1));
>> +
>> +	fbnic_rd32(fbd, FBNIC_PTP_SPARE);
>> +
>> +	fbn->time_offset = 0;
>> +	fbn->time_high = 0;
> 
> Not entirely sure how it works for you, but we found that most users
> expect to minimize the time loss or changes due to a reset, and we
> re-apply the last known configuration during a reset instead of
> letting the clock reset back to base state.

In case of reset the counter will be re-initialized, which means the
stored offset will be invalid anyway. It's better to reset the values
back to base state because it will make it easier for the app to
decide to make a step rather than adjust the frequency. And it will
make the clocks align faster.
On Mon, 7 Oct 2024 14:07:17 +0100 Vadim Fedorenko wrote:
> On 05/10/2024 00:05, Jacob Keller wrote:
> > On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
> >> +/* FBNIC timing & PTP implementation
> >> + * Datapath uses truncated 40b timestamps for scheduling and event reporting.
> >> + * We need to promote those to full 64b, hence we periodically cache the top
> >> + * 32bit of the HW time counter. Since this makes our time reporting non-atomic
> >> + * we leave the HW clock free running and adjust time offsets in SW as needed.
> >> + * Time offset is 64bit - we need a seq counter for 32bit machines.
> >> + * Time offset and the cache of top bits are independent so we don't need
> >> + * a coherent snapshot of both - READ_ONCE()/WRITE_ONCE() + writer side lock
> >> + * are enough.
> >> + */
> >> +
> > 
> > If you're going to implement adjustments only in software anyway, can
> > you use a timecounter+cyclecounter instead of re-implementing?
> 
> Thanks for pointing this out, I'll make it with timecounter/cyclecounter

Please don't, the clock is syntonized, we only do simple offsetting.

> >> +/* Period of refresh of top bits of timestamp, give ourselves an 8x margin.
> >> + * This should translate to once a minute.
> >> + * The use of nsecs_to_jiffies() should be safe for a <=40b nsec value.
> >> + */
> >> +#define FBNIC_TS_HIGH_REFRESH_JIF	nsecs_to_jiffies((1ULL << 40) / 16)
> >> +
> >> +static struct fbnic_dev *fbnic_from_ptp_info(struct ptp_clock_info *ptp)
> >> +{
> >> +	return container_of(ptp, struct fbnic_dev, ptp_info);
> >> +}
> >> +
> >> +/* This function is "slow" because we could try guessing which high part
> >> + * is correct based on low instead of re-reading, and skip reading @hi
> >> + * twice altogether if @lo is far enough from 0.
> >> + */
> >> +static u64 __fbnic_time_get_slow(struct fbnic_dev *fbd)
> >> +{
> >> +	u32 hi, lo;
> >> +
> >> +	lockdep_assert_held(&fbd->time_lock);
> >> +
> >> +	do {
> >> +		hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
> >> +		lo = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
> >> +	} while (hi != fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI));
> >> +
> > 
> > How long does it take hi to overflow? You may be able to get away
> > without looping.
> 
> According to the comment above, it may take up to 8 minutes to
> overflow, but the updates to the cache should be done every minute.
> We do not expect this cycle to happen often.
> 
> > I think another way to implement this is to read lo, then hi, then lo
> > again, and if lo2 is smaller than lo, you know hi overflowed and you
> > can re-read hi.
> 
> That's an option too, I'll think of it, thanks!

The triple read is less neat in case hi jumps by more than 1.
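For context, the lo/hi/lo read order being debated would look roughly
like the sketch below, written against this patch's register accessors
(fbnic_time_get_lo_hi_lo() is a hypothetical name, not code from the
thread). The comment marks the case Jakub's objection is about.

static u64 fbnic_time_get_lo_hi_lo(struct fbnic_dev *fbd)
{
	u32 lo, hi, lo2;

	lo = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
	hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
	lo2 = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
	if (lo2 < lo) {
		/* @lo wrapped between the two reads, so @hi may predate
		 * the wrap; re-read it after the wrap and pair it with
		 * @lo2. If the reads were stalled for more than one full
		 * @lo cycle (~4.3 s), @hi could have advanced by more
		 * than 1 without lo2 < lo being true - that is the case
		 * this scheme does not catch.
		 */
		hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
		lo = lo2;
	}

	return (u64)hi << 32 | lo;
}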
On 10/7/2024 4:09 PM, Jakub Kicinski wrote: > On Mon, 7 Oct 2024 14:07:17 +0100 Vadim Fedorenko wrote: >> On 05/10/2024 00:05, Jacob Keller wrote: >>> On 10/3/2024 5:39 AM, Vadim Fedorenko wrote: >>>> +/* FBNIC timing & PTP implementation >>>> + * Datapath uses truncated 40b timestamps for scheduling and event reporting. >>>> + * We need to promote those to full 64b, hence we periodically cache the top >>>> + * 32bit of the HW time counter. Since this makes our time reporting non-atomic >>>> + * we leave the HW clock free running and adjust time offsets in SW as needed. >>>> + * Time offset is 64bit - we need a seq counter for 32bit machines. >>>> + * Time offset and the cache of top bits are independent so we don't need >>>> + * a coherent snapshot of both - READ_ONCE()/WRITE_ONCE() + writer side lock >>>> + * are enough. >>>> + */ >>>> + >>> >>> If you're going to implement adjustments only in software anyways, can >>> you use a timecounter+cyclecounter instead of re-implementing? >> >> Thanks for pointing this out, I'll make it with timecounter/cyclecounter > > Please don't, the clock is synthonized, we only do simple offsetting. > I still think it makes sense to re-use the logic for converting cycles to full 64bit time values if possible. If you're already doing offset adjustment, you still have to apply the same logic to every timestamp, which is exactly what a timecounter does for you. You can even use a timecounter and cyclecounter without using its ability to do syntonizing, by just setting the cyclecounter values appropriately, and leaving the syntonizing to the hardware mechanism. I think the result is much easier to understand and follow than re-implementing a different mechanism for offset correction with bespoke code.
On 10/7/2024 4:09 PM, Jakub Kicinski wrote:
>>> I think another way to implement this is to read lo, then hi, then lo
>>> again, and if lo2 is smaller than lo, you know hi overflowed and you
>>> can re-read hi.
>>
>> That's an option too, I'll think of it, thanks!
> 
> The triple read is less neat in case hi jumps by more than 1.
> 

Wouldn't that require the lower bits to accumulate more than an entire
cycle for that to happen?
On Mon, 7 Oct 2024 16:49:45 -0700 Jacob Keller wrote:
> >> Thanks for pointing this out, I'll make it with timecounter/cyclecounter
> > 
> > Please don't, the clock is syntonized, we only do simple offsetting.
> > 
> I still think it makes sense to re-use the logic for converting cycles
> to full 64bit time values if possible.
> 
> If you're already doing offset adjustment, you still have to apply the
> same logic to every timestamp, which is exactly what a timecounter does
> for you.

"exactly what a timecounter does for you" is an overstatement.
timecounter tracks the overflows by itself and syntonizes.
We have the 64b value, just need to combine the top bits.

> You can even use a timecounter and cyclecounter without using its
> ability to do syntonizing, by just setting the cyclecounter values
> appropriately, and leaving the syntonizing to the hardware mechanism.
> 
> I think the result is much easier to understand and follow than
> re-implementing a different mechanism for offset correction with
> bespoke code.

This is fast path code, I hope we can get a simple addition and an
overflow check right.
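The "simple addition and an overflow check" on the datapath fast path
could look like the sketch below. fbnic_ts40_to_ns() is a hypothetical
helper (the 40b-stamp consumers land in later patches of the series),
and it assumes the stamps being converted are always newer than the
cached top bits, which the ~1 minute refresh period is meant to
guarantee.

static u64 fbnic_ts40_to_ns(struct fbnic_net *fbn, u64 ts40)
{
	unsigned int s;
	u64 time_ns;
	s64 offset;
	u8 ts_top;
	u32 high;

	do {
		s = u64_stats_fetch_begin(&fbn->time_seq);
		offset = READ_ONCE(fbn->time_offset);
	} while (u64_stats_fetch_retry(&fbn->time_seq, s));

	high = READ_ONCE(fbn->time_high);

	/* Bits 63..40 come from the periodically cached top bits */
	time_ns = (u64)(high >> 8) << 40;

	/* Compare bits 39..32 of the stamp against the cache; a large
	 * circular distance means the counter crossed a 2^40 ns boundary
	 * after the cache was taken, a small one just means the stamp is
	 * a little older or newer than the cache.
	 */
	ts_top = ts40 >> 32;
	if (ts_top < (u8)high && (u8)high - ts_top > U8_MAX / 2)
		time_ns += 1ULL << 40;

	return time_ns + ts40 + offset;
}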
diff --git a/drivers/net/ethernet/meta/fbnic/Makefile b/drivers/net/ethernet/meta/fbnic/Makefile
index ed4533a73c57..8615d516a274 100644
--- a/drivers/net/ethernet/meta/fbnic/Makefile
+++ b/drivers/net/ethernet/meta/fbnic/Makefile
@@ -18,4 +18,5 @@ fbnic-y := fbnic_devlink.o \
 	   fbnic_phylink.o \
 	   fbnic_rpc.o \
 	   fbnic_tlv.o \
-	   fbnic_txrx.o
+	   fbnic_txrx.o \
+	   fbnic_time.o
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h
index 0f9e8d79461c..ca59261f0155 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic.h
@@ -6,6 +6,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/ptp_clock_kernel.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
@@ -49,6 +50,16 @@ struct fbnic_dev {
 	/* Number of TCQs/RCQs available on hardware */
 	u16 max_num_queues;
 
+	/* Lock protecting writes to @time_high, @time_offset of fbnic_netdev,
+	 * and the HW time CSR machinery.
+	 */
+	spinlock_t time_lock;
+	/* Externally accessible PTP clock, may be NULL */
+	struct ptp_clock *ptp;
+	struct ptp_clock_info ptp_info;
+	/* Last @time_high refresh time in jiffies (to catch stalls) */
+	unsigned long last_read;
+
 	/* Local copy of hardware statistics */
 	struct fbnic_hw_stats hw_stats;
 };
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
index 21db509acbc1..290b924b7749 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
@@ -413,6 +413,44 @@ enum {
 #define FBNIC_TMI_DROP_CTRL		0x04401 /* 0x11004 */
 #define FBNIC_TMI_DROP_CTRL_EN		CSR_BIT(0)
 #define FBNIC_CSR_END_TMI		0x0443f /* CSR section delimiter */
+
+/* Precision Time Protocol Registers */
+#define FBNIC_CSR_START_PTP		0x04800 /* CSR section delimiter */
+#define FBNIC_PTP_REG_BASE		0x04800 /* 0x12000 */
+
+#define FBNIC_PTP_CTRL			0x04800 /* 0x12000 */
+#define FBNIC_PTP_CTRL_EN		CSR_BIT(0)
+#define FBNIC_PTP_CTRL_MONO_EN		CSR_BIT(4)
+#define FBNIC_PTP_CTRL_TQS_OUT_EN	CSR_BIT(8)
+#define FBNIC_PTP_CTRL_MAC_OUT_IVAL	CSR_GENMASK(16, 12)
+#define FBNIC_PTP_CTRL_TICK_IVAL	CSR_GENMASK(23, 20)
+
+#define FBNIC_PTP_ADJUST		0x04801 /* 0x12004 */
+#define FBNIC_PTP_ADJUST_INIT		CSR_BIT(0)
+#define FBNIC_PTP_ADJUST_SUB_NUDGE	CSR_BIT(8)
+#define FBNIC_PTP_ADJUST_ADD_NUDGE	CSR_BIT(16)
+#define FBNIC_PTP_ADJUST_ADDEND_SET	CSR_BIT(24)
+
+#define FBNIC_PTP_INIT_HI		0x04802 /* 0x12008 */
+#define FBNIC_PTP_INIT_LO		0x04803 /* 0x1200c */
+
+#define FBNIC_PTP_NUDGE_NS		0x04804 /* 0x12010 */
+#define FBNIC_PTP_NUDGE_SUBNS		0x04805 /* 0x12014 */
+
+#define FBNIC_PTP_ADD_VAL_NS		0x04806 /* 0x12018 */
+#define FBNIC_PTP_ADD_VAL_NS_MASK	CSR_GENMASK(15, 0)
+#define FBNIC_PTP_ADD_VAL_SUBNS		0x04807 /* 0x1201c */
+
+#define FBNIC_PTP_CTR_VAL_HI		0x04808 /* 0x12020 */
+#define FBNIC_PTP_CTR_VAL_LO		0x04809 /* 0x12024 */
+
+#define FBNIC_PTP_MONO_PTP_CTR_HI	0x0480a /* 0x12028 */
+#define FBNIC_PTP_MONO_PTP_CTR_LO	0x0480b /* 0x1202c */
+
+#define FBNIC_PTP_CDC_FIFO_STATUS	0x0480c /* 0x12030 */
+#define FBNIC_PTP_SPARE			0x0480d /* 0x12034 */
+#define FBNIC_CSR_END_PTP		0x0480d /* CSR section delimiter */
+
 /* Rx Buffer Registers */
 #define FBNIC_CSR_START_RXB		0x08000 /* CSR section delimiter */
 enum {
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
index a400616a24d4..6e6d8988db54 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
@@ -42,18 +42,24 @@ int __fbnic_open(struct fbnic_net *fbn)
 		goto free_resources;
 	}
 
-	err = fbnic_fw_init_heartbeat(fbd, false);
+	err = fbnic_time_start(fbn);
 	if (err)
 		goto release_ownership;
 
+	err = fbnic_fw_init_heartbeat(fbd, false);
+	if (err)
+		goto time_stop;
+
 	err = fbnic_pcs_irq_enable(fbd);
 	if (err)
-		goto release_ownership;
+		goto time_stop;
 
 	/* Pull the BMC config and initialize the RPC */
 	fbnic_bmc_rpc_init(fbd);
 	fbnic_rss_reinit(fbd, fbn);
 
 	return 0;
+time_stop:
+	fbnic_time_stop(fbn);
 release_ownership:
 	fbnic_fw_xmit_ownership_msg(fbn->fbd, false);
 free_resources:
@@ -82,6 +88,7 @@ static int fbnic_stop(struct net_device *netdev)
 	fbnic_down(fbn);
 	fbnic_pcs_irq_disable(fbn->fbd);
 
+	fbnic_time_stop(fbn);
 	fbnic_fw_xmit_ownership_msg(fbn->fbd, false);
 
 	fbnic_free_resources(fbn);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h
index 6c27da09a612..f530e3235634 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h
@@ -33,6 +33,15 @@ struct fbnic_net {
 	u8 fec;
 	u8 link_mode;
 
+	/* Cached top bits of the HW time counter for 40b -> 64b conversion */
+	u32 time_high;
+	/* Protect readers of @time_offset, writers take @time_lock. */
+	struct u64_stats_sync time_seq;
+	/* Offset in ns between free running NIC PHC and time set via PTP
+	 * clock callbacks
+	 */
+	s64 time_offset;
+
 	u16 num_tx_queues;
 	u16 num_rx_queues;
 
@@ -60,6 +69,12 @@ void fbnic_reset_queues(struct fbnic_net *fbn,
 			unsigned int tx, unsigned int rx);
 void fbnic_set_ethtool_ops(struct net_device *dev);
 
+int fbnic_ptp_setup(struct fbnic_dev *fbd);
+void fbnic_ptp_destroy(struct fbnic_dev *fbd);
+void fbnic_time_init(struct fbnic_net *fbn);
+int fbnic_time_start(struct fbnic_net *fbn);
+void fbnic_time_stop(struct fbnic_net *fbn);
+
 void __fbnic_set_rx_mode(struct net_device *netdev);
 void fbnic_clear_rx_mode(struct net_device *netdev);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
index a4809fe0fc24..32e5b4cc55bd 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
@@ -300,14 +300,20 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto init_failure_mode;
 	}
 
+	err = fbnic_ptp_setup(fbd);
+	if (err)
+		goto ifm_free_netdev;
+
 	err = fbnic_netdev_register(netdev);
 	if (err) {
 		dev_err(&pdev->dev, "Netdev registration failed: %d\n", err);
-		goto ifm_free_netdev;
+		goto ifm_destroy_ptp;
 	}
 
 	return 0;
 
+ifm_destroy_ptp:
+	fbnic_ptp_destroy(fbd);
 ifm_free_netdev:
 	fbnic_netdev_free(fbd);
 init_failure_mode:
@@ -342,6 +348,7 @@ static void fbnic_remove(struct pci_dev *pdev)
 	fbnic_netdev_unregister(netdev);
 	cancel_delayed_work_sync(&fbd->service_task);
 
+	fbnic_ptp_destroy(fbd);
 	fbnic_netdev_free(fbd);
 }
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_time.c b/drivers/net/ethernet/meta/fbnic/fbnic_time.c
new file mode 100644
index 000000000000..f79b1388cb6e
--- /dev/null
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_time.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bitfield.h>
+#include <linux/jiffies.h>
+#include <linux/limits.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/timer.h>
+
+#include "fbnic.h"
+#include "fbnic_csr.h"
+#include "fbnic_netdev.h"
+
+/* FBNIC timing & PTP implementation
+ * Datapath uses truncated 40b timestamps for scheduling and event reporting.
+ * We need to promote those to full 64b, hence we periodically cache the top
+ * 32bit of the HW time counter. Since this makes our time reporting non-atomic
+ * we leave the HW clock free running and adjust time offsets in SW as needed.
+ * Time offset is 64bit - we need a seq counter for 32bit machines.
+ * Time offset and the cache of top bits are independent so we don't need
+ * a coherent snapshot of both - READ_ONCE()/WRITE_ONCE() + writer side lock
+ * are enough.
+ */
+
+/* Period of refresh of top bits of timestamp, give ourselves an 8x margin.
+ * This should translate to once a minute.
+ * The use of nsecs_to_jiffies() should be safe for a <=40b nsec value.
+ */
+#define FBNIC_TS_HIGH_REFRESH_JIF	nsecs_to_jiffies((1ULL << 40) / 16)
+
+static struct fbnic_dev *fbnic_from_ptp_info(struct ptp_clock_info *ptp)
+{
+	return container_of(ptp, struct fbnic_dev, ptp_info);
+}
+
+/* This function is "slow" because we could try guessing which high part
+ * is correct based on low instead of re-reading, and skip reading @hi
+ * twice altogether if @lo is far enough from 0.
+ */
+static u64 __fbnic_time_get_slow(struct fbnic_dev *fbd)
+{
+	u32 hi, lo;
+
+	lockdep_assert_held(&fbd->time_lock);
+
+	do {
+		hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
+		lo = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
+	} while (hi != fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI));
+
+	return (u64)hi << 32 | lo;
+}
+
+static void __fbnic_time_set_addend(struct fbnic_dev *fbd, u64 addend)
+{
+	lockdep_assert_held(&fbd->time_lock);
+
+	fbnic_wr32(fbd, FBNIC_PTP_ADD_VAL_NS,
+		   FIELD_PREP(FBNIC_PTP_ADD_VAL_NS_MASK, addend >> 32));
+	fbnic_wr32(fbd, FBNIC_PTP_ADD_VAL_SUBNS, (u32)addend);
+}
+
+static void fbnic_ptp_fresh_check(struct fbnic_dev *fbd)
+{
+	if (time_is_after_jiffies(fbd->last_read +
+				  FBNIC_TS_HIGH_REFRESH_JIF * 3 / 2))
+		return;
+
+	dev_warn(fbd->dev, "NIC timestamp refresh stall, delayed by %lu sec\n",
+		 (jiffies - fbd->last_read - FBNIC_TS_HIGH_REFRESH_JIF) / HZ);
+}
+
+static void fbnic_ptp_refresh_time(struct fbnic_dev *fbd,
+				   struct fbnic_net *fbn)
+{
+	unsigned long flags;
+	u32 hi;
+
+	spin_lock_irqsave(&fbd->time_lock, flags);
+	hi = fbnic_rd32(fbn->fbd, FBNIC_PTP_CTR_VAL_HI);
+	if (!fbnic_present(fbd))
+		goto out; /* Don't bother handling, reset is pending */
+	WRITE_ONCE(fbn->time_high, hi);
+	fbd->last_read = jiffies;
+out:
+	spin_unlock_irqrestore(&fbd->time_lock, flags);
+}
+
+static long fbnic_ptp_do_aux_work(struct ptp_clock_info *ptp)
+{
+	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
+	struct fbnic_net *fbn;
+
+	fbn = netdev_priv(fbd->netdev);
+
+	fbnic_ptp_fresh_check(fbd);
+	fbnic_ptp_refresh_time(fbd, fbn);
+
+	return FBNIC_TS_HIGH_REFRESH_JIF;
+}
+
+static int fbnic_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+	int scale = 16; /* scaled_ppm has 16 fractional places */
+	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
+	u64 scaled_delta, dclk_period;
+	unsigned long flags;
+	s64 delta;
+	int sgn;
+
+	sgn = scaled_ppm >= 0 ? 1 : -1;
+	scaled_ppm *= sgn;
+
+	/* d_clock is 600 MHz; which in Q16.32 fixed point ns is: */
+	dclk_period = (((u64)1000000000) << 32) / FBNIC_CLOCK_FREQ;
+
+	while (scaled_ppm > U64_MAX / dclk_period) {
+		scaled_ppm >>= 1;
+		scale--;
+	}
+
+	scaled_delta = (u64)scaled_ppm * dclk_period;
+	delta = div_u64(scaled_delta, 1000 * 1000) >> scale;
+	delta *= sgn;
+
+	spin_lock_irqsave(&fbd->time_lock, flags);
+	__fbnic_time_set_addend(fbd, dclk_period + delta);
+	fbnic_wr32(fbd, FBNIC_PTP_ADJUST, FBNIC_PTP_ADJUST_ADDEND_SET);
+
+	/* Flush, make sure FBNIC_PTP_ADD_VAL_* is stable for at least 4 clks */
+	fbnic_rd32(fbd, FBNIC_PTP_SPARE);
+	spin_unlock_irqrestore(&fbd->time_lock, flags);
+
+	return fbnic_present(fbd) ? 0 : -EIO;
+}
+
+static int fbnic_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
+	struct fbnic_net *fbn;
+	unsigned long flags;
+
+	fbn = netdev_priv(fbd->netdev);
+
+	spin_lock_irqsave(&fbd->time_lock, flags);
+	u64_stats_update_begin(&fbn->time_seq);
+	WRITE_ONCE(fbn->time_offset, READ_ONCE(fbn->time_offset) + delta);
+	u64_stats_update_end(&fbn->time_seq);
+	spin_unlock_irqrestore(&fbd->time_lock, flags);
+
+	return 0;
+}
+
+static int
+fbnic_ptp_gettimex64(struct ptp_clock_info *ptp, struct timespec64 *ts,
+		     struct ptp_system_timestamp *sts)
+{
+	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
+	struct fbnic_net *fbn;
+	unsigned long flags;
+	u64 time_ns;
+	u32 hi, lo;
+
+	fbn = netdev_priv(fbd->netdev);
+
+	spin_lock_irqsave(&fbd->time_lock, flags);
+
+	do {
+		hi = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI);
+		ptp_read_system_prets(sts);
+		lo = fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_LO);
+		ptp_read_system_postts(sts);
+		/* Similarly to the comment above __fbnic_time_get_slow()
+		 * - this can be optimized if needed.
+		 */
+	} while (hi != fbnic_rd32(fbd, FBNIC_PTP_CTR_VAL_HI));
+
+	time_ns = ((u64)hi << 32 | lo) + fbn->time_offset;
+	spin_unlock_irqrestore(&fbd->time_lock, flags);
+
+	if (!fbnic_present(fbd))
+		return -EIO;
+
+	*ts = ns_to_timespec64(time_ns);
+
+	return 0;
+}
+
+static int
+fbnic_ptp_settime64(struct ptp_clock_info *ptp, const struct timespec64 *ts)
+{
+	struct fbnic_dev *fbd = fbnic_from_ptp_info(ptp);
+	struct fbnic_net *fbn;
+	unsigned long flags;
+	u64 dev_ns, host_ns;
+	int ret;
+
+	fbn = netdev_priv(fbd->netdev);
+
+	host_ns = timespec64_to_ns(ts);
+
+	spin_lock_irqsave(&fbd->time_lock, flags);
+
+	dev_ns = __fbnic_time_get_slow(fbd);
+
+	if (fbnic_present(fbd)) {
+		u64_stats_update_begin(&fbn->time_seq);
+		WRITE_ONCE(fbn->time_offset, host_ns - dev_ns);
+		u64_stats_update_end(&fbn->time_seq);
+		ret = 0;
+	} else {
+		ret = -EIO;
+	}
+	spin_unlock_irqrestore(&fbd->time_lock, flags);
+
+	return ret;
+}
+
+static const struct ptp_clock_info fbnic_ptp_info = {
+	.owner			= THIS_MODULE,
+	/* 1,000,000,000 - 1 PPB to ensure increment is positive
+	 * after max negative adjustment.
+	 */
+	.max_adj		= 999999999,
+	.do_aux_work		= fbnic_ptp_do_aux_work,
+	.adjfine		= fbnic_ptp_adjfine,
+	.adjtime		= fbnic_ptp_adjtime,
+	.gettimex64		= fbnic_ptp_gettimex64,
+	.settime64		= fbnic_ptp_settime64,
+};
+
+static void fbnic_ptp_reset(struct fbnic_dev *fbd)
+{
+	struct fbnic_net *fbn = netdev_priv(fbd->netdev);
+	u64 dclk_period;
+
+	fbnic_wr32(fbd, FBNIC_PTP_CTRL,
+		   FBNIC_PTP_CTRL_EN |
+		   FIELD_PREP(FBNIC_PTP_CTRL_TICK_IVAL, 1));
+
+	/* d_clock is 600 MHz; which in Q16.32 fixed point ns is: */
+	dclk_period = (((u64)1000000000) << 32) / FBNIC_CLOCK_FREQ;
+
+	__fbnic_time_set_addend(fbd, dclk_period);
+
+	fbnic_wr32(fbd, FBNIC_PTP_INIT_HI, 0);
+	fbnic_wr32(fbd, FBNIC_PTP_INIT_LO, 0);
+
+	fbnic_wr32(fbd, FBNIC_PTP_ADJUST, FBNIC_PTP_ADJUST_INIT);
+
+	fbnic_wr32(fbd, FBNIC_PTP_CTRL,
+		   FBNIC_PTP_CTRL_EN |
+		   FBNIC_PTP_CTRL_TQS_OUT_EN |
+		   FIELD_PREP(FBNIC_PTP_CTRL_MAC_OUT_IVAL, 3) |
+		   FIELD_PREP(FBNIC_PTP_CTRL_TICK_IVAL, 1));
+
+	fbnic_rd32(fbd, FBNIC_PTP_SPARE);
+
+	fbn->time_offset = 0;
+	fbn->time_high = 0;
+}
+
+void fbnic_time_init(struct fbnic_net *fbn)
+{
+	/* This is not really a statistic, but the locking primitive fits
+	 * our use case perfectly, we need atomic 8-byte READ_ONCE() /
+	 * WRITE_ONCE() behavior.
+	 */
+	u64_stats_init(&fbn->time_seq);
+}
+
+int fbnic_time_start(struct fbnic_net *fbn)
+{
+	fbnic_ptp_refresh_time(fbn->fbd, fbn);
+	/* Assume that fbnic_ptp_do_aux_work() will never be called if not
+	 * scheduled here
+	 */
+	return ptp_schedule_worker(fbn->fbd->ptp, FBNIC_TS_HIGH_REFRESH_JIF);
+}
+
+void fbnic_time_stop(struct fbnic_net *fbn)
+{
+	ptp_cancel_worker_sync(fbn->fbd->ptp);
+	fbnic_ptp_fresh_check(fbn->fbd);
+}
+
+int fbnic_ptp_setup(struct fbnic_dev *fbd)
+{
+	struct device *dev = fbd->dev;
+	unsigned long flags;
+
+	spin_lock_init(&fbd->time_lock);
+
+	spin_lock_irqsave(&fbd->time_lock, flags);	/* Appease lockdep */
+	fbnic_ptp_reset(fbd);
+	spin_unlock_irqrestore(&fbd->time_lock, flags);
+
+	memcpy(&fbd->ptp_info, &fbnic_ptp_info, sizeof(fbnic_ptp_info));
+
+	fbd->ptp = ptp_clock_register(&fbd->ptp_info, dev);
+	if (IS_ERR(fbd->ptp))
+		dev_err(dev, "Failed to register PTP: %pe\n", fbd->ptp);
+
+	return PTR_ERR_OR_ZERO(fbd->ptp);
+}
+
+void fbnic_ptp_destroy(struct fbnic_dev *fbd)
+{
+	if (!fbd->ptp)
+		return;
+	ptp_clock_unregister(fbd->ptp);
+}
Create PHC device and provide callbacks needed for ptp_clock device.

Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
---
 drivers/net/ethernet/meta/fbnic/Makefile      |   3 +-
 drivers/net/ethernet/meta/fbnic/fbnic.h       |  11 +
 drivers/net/ethernet/meta/fbnic/fbnic_csr.h   |  38 +++
 .../net/ethernet/meta/fbnic/fbnic_netdev.c    |  11 +-
 .../net/ethernet/meta/fbnic/fbnic_netdev.h    |  15 +
 drivers/net/ethernet/meta/fbnic/fbnic_pci.c   |   9 +-
 drivers/net/ethernet/meta/fbnic/fbnic_time.c  | 311 ++++++++++++++++++
 7 files changed, 394 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/ethernet/meta/fbnic/fbnic_time.c