diff mbox series

[net-next,2/2] net: dsa: microchip: Add SGMII port support to KSZ9477 switch

Message ID 20241109015633.82638-3-Tristram.Ha@microchip.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net: dsa: microchip: Add SGMII port support to KSZ9477 switch | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: linux@armlinux.org.uk
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 783 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-11-09--03-00 (tests: 784)

Commit Message

Tristram.Ha@microchip.com Nov. 9, 2024, 1:56 a.m. UTC
From: Tristram Ha <tristram.ha@microchip.com>

The SGMII module of KSZ9477 switch can be setup in 3 ways: 0 for direct
connect, 1 for 1000BaseT/1000BaseX SFP, and 2 for 10/100/1000BaseT SFP.

SFP is typically used so the default is 1.  The driver can detect
10/100/1000BaseT SFP and change the mode to 2.  For direct connect the
device tree can use fixed-link for the SGMII port as this link will
never be disconnected.

The SGMII module can only support basic link status of the SFP, so the
port can be simulated as having a regular internal PHY when SFP cage
logic is not used.

One issue for the 1000BaseX SFP is there is no link down interrupt, so
the driver has to use polling to detect link off when the link is up.

Note the SGMII interrupt cannot be masked in hardware.  Also the module
is not reset when the switch is reset.  It is important to reset the
module properly to make sure interrupt is not triggered prematurely.

A PCS driver for the SGMII port is added to accommodate the SFP cage
logic used in the phylink code.  It is used to confirm the link is up
and process the SGMII interrupt.

Signed-off-by: Tristram Ha <tristram.ha@microchip.com>
---
 drivers/net/dsa/microchip/ksz9477.c     | 450 +++++++++++++++++++++++-
 drivers/net/dsa/microchip/ksz9477.h     |   7 +-
 drivers/net/dsa/microchip/ksz9477_reg.h |   6 +
 drivers/net/dsa/microchip/ksz_common.c  |  94 ++++-
 drivers/net/dsa/microchip/ksz_common.h  |  21 ++
 5 files changed, 571 insertions(+), 7 deletions(-)

Comments

Andrew Lunn Nov. 9, 2024, 3:13 p.m. UTC | #1
On Fri, Nov 08, 2024 at 05:56:33PM -0800, Tristram.Ha@microchip.com wrote:
> From: Tristram Ha <tristram.ha@microchip.com>
> 
> The SGMII module of KSZ9477 switch can be setup in 3 ways: 0 for direct
> connect, 1 for 1000BaseT/1000BaseX SFP, and 2 for 10/100/1000BaseT SFP.

This naming is rather odd. First off, i would drop 'SFP'. It does not
have to be an SFP on the other end, it could be another switch for
example. 1 is PHY_INTERFACE_MODE_1000BASEX and 2 is
PHY_INTERFACE_MODE_SGMII.

> SFP is typically used so the default is 1.  The driver can detect
> 10/100/1000BaseT SFP and change the mode to 2.

phylink will tell you want mode to use. I would ignore what the
hardware detects, so this driver is just the same as every other
driver, making it easier to maintain.

	Andrew
Andrew Lunn Nov. 9, 2024, 3:43 p.m. UTC | #2
> +static void port_sgmii_r(struct ksz_device *dev, uint port, u16 devid, u16 reg,
> +			 u16 *buf, u16 len)
> +{
> +	u32 data;
> +
> +	port_sgmii_s(dev, port, devid, reg, len);
> +	while (len) {
> +		ksz_pread32(dev, port, REG_PORT_SGMII_DATA__4, &data);
> +		*buf++ = (u16)data;
> +		len--;
> +	}
> +}
> +
> +static void port_sgmii_w(struct ksz_device *dev, uint port, u16 devid, u16 reg,
> +			 u16 *buf, u16 len)
> +{
> +	u32 data;
> +
> +	port_sgmii_s(dev, port, devid, reg, len);
> +	while (len) {
> +		data = *buf++;
> +		ksz_pwrite32(dev, port, REG_PORT_SGMII_DATA__4, data);
> +		len--;
> +	}
> +}

This kind of looks like a C45 only MDIO bus.

#define MMD_DEVICE_ID_VENDOR_MII	0x1F

#define SR_MII				MMD_DEVICE_ID_VENDOR_MII

This is identical to MDIO_MMD_VEND2.

#define SR_MII_RESET			BIT(15)
#define SR_MII_LOOPBACK			BIT(14)
#define SR_MII_SPEED_100MBIT		BIT(13)
#define SR_MII_AUTO_NEG_ENABLE		BIT(12)
#define SR_MII_POWER_DOWN		BIT(11)
#define SR_MII_AUTO_NEG_RESTART		BIT(9)
#define SR_MII_FULL_DUPLEX		BIT(8)
#define SR_MII_SPEED_1000MBIT		BIT(6)

A standard BMCR.

#define MMD_SR_MII_STATUS		0x0001
#define MMD_SR_MII_ID_1			0x0002
#define MMD_SR_MII_ID_2			0x0003
#define MMD_SR_MII_AUTO_NEGOTIATION	0x0004

Same as:

#define MII_BMSR                0x01    /* Basic mode status register  */
#define MII_PHYSID1             0x02    /* PHYS ID 1                   */
#define MII_PHYSID2             0x03    /* PHYS ID 2                   */
#define MII_ADVERTISE           0x04    /* Advertisement control reg   */

So i think your first patch should be to replace all these with the
standard macros.

That will then help make it clearer how much is generic, could the
existing helpers be used, probably with a wrapper to make your C22
device mapped to C45 MDIO_MMD_VEND2 look like a C22 device?

	Andrew
Vladimir Oltean Nov. 10, 2024, 3:50 p.m. UTC | #3
On Sat, 9 Nov 2024 at 03:56, <Tristram.Ha@microchip.com> wrote:
> diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
> index f73833e24622..8163342d778a 100644
> --- a/drivers/net/dsa/microchip/ksz_common.c
> +++ b/drivers/net/dsa/microchip/ksz_common.c
> @@ -354,10 +354,30 @@ static void ksz9477_phylink_mac_link_up(struct phylink_config *config,
>                                         int speed, int duplex, bool tx_pause,
>                                         bool rx_pause);
>
> +static struct phylink_pcs *
> +ksz_phylink_mac_select_pcs(struct phylink_config *config,
> +                          phy_interface_t interface)
> +{
> +       struct dsa_port *dp = dsa_phylink_to_port(config);
> +       struct ksz_device *dev = dp->ds->priv;
> +       struct ksz_port *p = &dev->ports[dp->index];
> +
> +       if (!p->sgmii)
> +               return ERR_PTR(-EOPNOTSUPP);

Since commit 7530ea26c810 ("net: phylink: remove "using_mac_select_pcs""),
returning ERR_PTR(-EOPNOTSUPP) here would actually be fatal. This error
code no longer carries any special meaning.

It would be a good idea to Cc Russell King for phylink changes.

> +       switch (interface) {
> +       case PHY_INTERFACE_MODE_SGMII:
> +       case PHY_INTERFACE_MODE_1000BASEX:
> +               return &p->pcs_priv->pcs;
> +       default:
> +               return NULL;
> +       }
> +}
> +
>  static const struct phylink_mac_ops ksz9477_phylink_mac_ops = {
>         .mac_config     = ksz_phylink_mac_config,
>         .mac_link_down  = ksz_phylink_mac_link_down,
>         .mac_link_up    = ksz9477_phylink_mac_link_up,
> +       .mac_select_pcs = ksz_phylink_mac_select_pcs,
>  };
Tristram.Ha@microchip.com Nov. 12, 2024, 2:43 a.m. UTC | #4
> Subject: Re: [PATCH net-next 2/2] net: dsa: microchip: Add SGMII port support to
> KSZ9477 switch
> 
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content
> is safe
> 
> On Sat, 9 Nov 2024 at 03:56, <Tristram.Ha@microchip.com> wrote:
> > diff --git a/drivers/net/dsa/microchip/ksz_common.c
> b/drivers/net/dsa/microchip/ksz_common.c
> > index f73833e24622..8163342d778a 100644
> > --- a/drivers/net/dsa/microchip/ksz_common.c
> > +++ b/drivers/net/dsa/microchip/ksz_common.c
> > @@ -354,10 +354,30 @@ static void ksz9477_phylink_mac_link_up(struct
> phylink_config *config,
> >                                         int speed, int duplex, bool tx_pause,
> >                                         bool rx_pause);
> >
> > +static struct phylink_pcs *
> > +ksz_phylink_mac_select_pcs(struct phylink_config *config,
> > +                          phy_interface_t interface)
> > +{
> > +       struct dsa_port *dp = dsa_phylink_to_port(config);
> > +       struct ksz_device *dev = dp->ds->priv;
> > +       struct ksz_port *p = &dev->ports[dp->index];
> > +
> > +       if (!p->sgmii)
> > +               return ERR_PTR(-EOPNOTSUPP);
> 
> Since commit 7530ea26c810 ("net: phylink: remove "using_mac_select_pcs""),
> returning ERR_PTR(-EOPNOTSUPP) here would actually be fatal. This error
> code no longer carries any special meaning.
> 
> It would be a good idea to Cc Russell King for phylink changes.

Thanks.  Will update the code.
Tristram.Ha@microchip.com Nov. 12, 2024, 2:55 a.m. UTC | #5
> On Fri, Nov 08, 2024 at 05:56:33PM -0800, Tristram.Ha@microchip.com wrote:
> > From: Tristram Ha <tristram.ha@microchip.com>
> >
> > The SGMII module of KSZ9477 switch can be setup in 3 ways: 0 for direct
> > connect, 1 for 1000BaseT/1000BaseX SFP, and 2 for 10/100/1000BaseT SFP.
> 
> This naming is rather odd. First off, i would drop 'SFP'. It does not
> have to be an SFP on the other end, it could be another switch for
> example. 1 is PHY_INTERFACE_MODE_1000BASEX and 2 is
> PHY_INTERFACE_MODE_SGMII.
> 
> > SFP is typically used so the default is 1.  The driver can detect
> > 10/100/1000BaseT SFP and change the mode to 2.
> 
> phylink will tell you want mode to use. I would ignore what the
> hardware detects, so this driver is just the same as every other
> driver, making it easier to maintain.

There are some issues I found that will need your advises.

The phylink SFP code categorizes SFP using fiber cable as
PHY_INTERFACE_MODE_1000BASEX and SFP using a regular RJ45 connector as 
PHY_INTERFACE_MODE_SGMII, which has a PHY that can be accessed through
I2C connection with a PHY driver.  Now when SGMII SFP is used the phylink
cannot be created because it fails the validation in
phylink_sfp_config_phy().  The reason is the phydev has empty supported
and advertising data fields as it is just created.  Even if the provided
PCS driver can change the supported field to pass the initial validation
it cannot change the advertising field as it is readonly to the driver,
and this fails the second validation with phylink_sfp_select_interface().
This problem is so obvious I thought I must be doing something wrong.
This problem occurs in Linux 6.1 so it is not a new problem and nobody
encountered it?

This problem does not happen with SFP using fiber cable as a different
function is used to initialize the phylink.  As the SFP code already
retrieves the basic support information from the SFP EEPROM and stores
it in sfp_support it can be copied to supported and advertising to pass
the validation.

I mentioned the SGMII module operates differently for two types of SFP:
SGMII and 1000BASEX.  The 1000Base-T SFP operates the same as 1000Base-SX
fiber SFP, and the driver would like it to be assigned
PHY_INTERFACE_MODE_1000BASEX, but it is always assigned
PHY_INTERFACE_MODE_SGMII in sfp_select_interface because 1000baseT_Full
is compared before 1000baseX_Full.

Now I am not sure if those SFPs I tested have correct EEPROM.  Some
no-brand ones return 0xff value when the PHY driver reads the link status
from them and so that driver cannot tell when the link is down.  Other
than that those SFPs operate correctly in forwarding traffic.

It seems there is no way to assign an interupt to that PHY and so polling
is always used.

The SFP using fiber cable does not have the above issues but has its own
issue.  The SFP cage can detect the cable is being plugged in as the
light is detected.  The PCS driver is then consulted to confirm the link.
However as the cable is not completely plugged in the driver can report
the link is not up.  After two calls are done the port can be left into
unconnected state forever even after the cable is plugged in.  The driver
can detect there is something different about the link value and can try
to read it later to get a firm reading.  This just means the driver needs
to poll anyway.  But if interrupt is used and the driver uses it to
report link this issue is moot.

I just feel the SFP code offered in the phylink model does not help to
operate SFP well in the KSZ9477 switch, and it does not need that code to
provide basic SGMII port connection.
Tristram.Ha@microchip.com Nov. 12, 2024, 2:58 a.m. UTC | #6
> > +static void port_sgmii_r(struct ksz_device *dev, uint port, u16 devid, u16 reg,
> > +                      u16 *buf, u16 len)
> > +{
> > +     u32 data;
> > +
> > +     port_sgmii_s(dev, port, devid, reg, len);
> > +     while (len) {
> > +             ksz_pread32(dev, port, REG_PORT_SGMII_DATA__4, &data);
> > +             *buf++ = (u16)data;
> > +             len--;
> > +     }
> > +}
> > +
> > +static void port_sgmii_w(struct ksz_device *dev, uint port, u16 devid, u16 reg,
> > +                      u16 *buf, u16 len)
> > +{
> > +     u32 data;
> > +
> > +     port_sgmii_s(dev, port, devid, reg, len);
> > +     while (len) {
> > +             data = *buf++;
> > +             ksz_pwrite32(dev, port, REG_PORT_SGMII_DATA__4, data);
> > +             len--;
> > +     }
> > +}
> 
> This kind of looks like a C45 only MDIO bus.
> 
> #define MMD_DEVICE_ID_VENDOR_MII        0x1F
> 
> #define SR_MII                          MMD_DEVICE_ID_VENDOR_MII
> 
> This is identical to MDIO_MMD_VEND2.
> 
> #define SR_MII_RESET                    BIT(15)
> #define SR_MII_LOOPBACK                 BIT(14)
> #define SR_MII_SPEED_100MBIT            BIT(13)
> #define SR_MII_AUTO_NEG_ENABLE          BIT(12)
> #define SR_MII_POWER_DOWN               BIT(11)
> #define SR_MII_AUTO_NEG_RESTART         BIT(9)
> #define SR_MII_FULL_DUPLEX              BIT(8)
> #define SR_MII_SPEED_1000MBIT           BIT(6)
> 
> A standard BMCR.
> 
> #define MMD_SR_MII_STATUS               0x0001
> #define MMD_SR_MII_ID_1                 0x0002
> #define MMD_SR_MII_ID_2                 0x0003
> #define MMD_SR_MII_AUTO_NEGOTIATION     0x0004
> 
> Same as:
> 
> #define MII_BMSR                0x01    /* Basic mode status register  */
> #define MII_PHYSID1             0x02    /* PHYS ID 1                   */
> #define MII_PHYSID2             0x03    /* PHYS ID 2                   */
> #define MII_ADVERTISE           0x04    /* Advertisement control reg   */
> 
> So i think your first patch should be to replace all these with the
> standard macros.
> 
> That will then help make it clearer how much is generic, could the
> existing helpers be used, probably with a wrapper to make your C22
> device mapped to C45 MDIO_MMD_VEND2 look like a C22 device?

I just realize there are already 1000BASEX definitions in mii.h that can
be used.  I will try to use generic names as much as possible.
Andrew Lunn Nov. 12, 2024, 1:50 p.m. UTC | #7
On Tue, Nov 12, 2024 at 02:55:29AM +0000, Tristram.Ha@microchip.com wrote:
> > On Fri, Nov 08, 2024 at 05:56:33PM -0800, Tristram.Ha@microchip.com wrote:
> > > From: Tristram Ha <tristram.ha@microchip.com>
> > >
> > > The SGMII module of KSZ9477 switch can be setup in 3 ways: 0 for direct
> > > connect, 1 for 1000BaseT/1000BaseX SFP, and 2 for 10/100/1000BaseT SFP.
> > 
> > This naming is rather odd. First off, i would drop 'SFP'. It does not
> > have to be an SFP on the other end, it could be another switch for
> > example. 1 is PHY_INTERFACE_MODE_1000BASEX and 2 is
> > PHY_INTERFACE_MODE_SGMII.
> > 
> > > SFP is typically used so the default is 1.  The driver can detect
> > > 10/100/1000BaseT SFP and change the mode to 2.
> > 
> > phylink will tell you want mode to use. I would ignore what the
> > hardware detects, so this driver is just the same as every other
> > driver, making it easier to maintain.
> 
> There are some issues I found that will need your advises.
> 
> The phylink SFP code categorizes SFP using fiber cable as
> PHY_INTERFACE_MODE_1000BASEX and SFP using a regular RJ45 connector as 
> PHY_INTERFACE_MODE_SGMII, which has a PHY that can be accessed through
> I2C connection with a PHY driver.

Not quite correct, i think. If MDIO over I2C does not work, it will
still decide on 1000BaseX vs SGMII from the SFP eeprom contents. There
are some SFPs where the PHY is not accessible, and we have to live
with however it is configured.

> Now when SGMII SFP is used the phylink
> cannot be created because it fails the validation in
> phylink_sfp_config_phy().

Please stop using 'SGMII SFP'. It should just be SGMII. The MAC should
not care what is on the other end, it could be a PHY, and SFP, or a
switch, all using Cisco SGMII.

> The reason is the phydev has empty supported
> and advertising data fields as it is just created.

Do you mean the phydev for the PHY in the SFP? Or do you have a second
phydev here? I'm confused.

> I mentioned the SGMII module operates differently for two types of SFP:
> SGMII and 1000BASEX.  The 1000Base-T SFP operates the same as 1000Base-SX
> fiber SFP, and the driver would like it to be assigned
> PHY_INTERFACE_MODE_1000BASEX, but it is always assigned
> PHY_INTERFACE_MODE_SGMII in sfp_select_interface because 1000baseT_Full
> is compared before 1000baseX_Full.
> 
> Now I am not sure if those SFPs I tested have correct EEPROM.  Some
> no-brand ones return 0xff value when the PHY driver reads the link status
> from them and so that driver cannot tell when the link is down.  Other
> than that those SFPs operate correctly in forwarding traffic.

There is no standardisation of how you access the PHY in an SFP. So
each manufacture can do their own thing. However, there are a small
number of PHYs actually used inside SFPs, and we have support for
those common ones.

> It seems there is no way to assign an interupt to that PHY and so polling
> is always used.

Correct, the interface between the SFP and the SFP cage does not have
an interrupt pin the PHY can use.

> The SFP using fiber cable does not have the above issues but has its own
> issue.  The SFP cage can detect the cable is being plugged in as the
> light is detected.  The PCS driver is then consulted to confirm the link.
> However as the cable is not completely plugged in the driver can report
> the link is not up.  After two calls are done the port can be left into
> unconnected state forever even after the cable is plugged in.  The driver
> can detect there is something different about the link value and can try
> to read it later to get a firm reading.  This just means the driver needs
> to poll anyway.  But if interrupt is used and the driver uses it to
> report link this issue is moot.

Have you set pcs.poll? phylink will then poll the PCS every
second. You can report PCS status any time.

> I just feel the SFP code offered in the phylink model does not help to
> operate SFP well in the KSZ9477 switch, and it does not need that code to
> provide basic SGMII port connection.

We need to understand what is different about the KSZ9477 switch that
phylink does not work for it, yet phylink does work for mv88e6xxx,
sja1105, rzn1, qca, ocelot, and other MAC drivers.

	Andrew
Tristram.Ha@microchip.com Nov. 13, 2024, 2:12 a.m. UTC | #8
> On Tue, Nov 12, 2024 at 02:55:29AM +0000, Tristram.Ha@microchip.com wrote:
> > > On Fri, Nov 08, 2024 at 05:56:33PM -0800, Tristram.Ha@microchip.com wrote:
> > > > From: Tristram Ha <tristram.ha@microchip.com>
> > > >
> > > > The SGMII module of KSZ9477 switch can be setup in 3 ways: 0 for direct
> > > > connect, 1 for 1000BaseT/1000BaseX SFP, and 2 for 10/100/1000BaseT SFP.
> > >
> > > This naming is rather odd. First off, i would drop 'SFP'. It does not
> > > have to be an SFP on the other end, it could be another switch for
> > > example. 1 is PHY_INTERFACE_MODE_1000BASEX and 2 is
> > > PHY_INTERFACE_MODE_SGMII.
> > >
> > > > SFP is typically used so the default is 1.  The driver can detect
> > > > 10/100/1000BaseT SFP and change the mode to 2.
> > >
> > > phylink will tell you want mode to use. I would ignore what the
> > > hardware detects, so this driver is just the same as every other
> > > driver, making it easier to maintain.
> >
> > There are some issues I found that will need your advises.
> >
> > The phylink SFP code categorizes SFP using fiber cable as
> > PHY_INTERFACE_MODE_1000BASEX and SFP using a regular RJ45 connector as
> > PHY_INTERFACE_MODE_SGMII, which has a PHY that can be accessed through
> > I2C connection with a PHY driver.
> 
> Not quite correct, i think. If MDIO over I2C does not work, it will
> still decide on 1000BaseX vs SGMII from the SFP eeprom contents. There
> are some SFPs where the PHY is not accessible, and we have to live
> with however it is configured.
> 
> > Now when SGMII SFP is used the phylink
> > cannot be created because it fails the validation in
> > phylink_sfp_config_phy().
> 
> Please stop using 'SGMII SFP'. It should just be SGMII. The MAC should
> not care what is on the other end, it could be a PHY, and SFP, or a
> switch, all using Cisco SGMII.
> 
> > The reason is the phydev has empty supported
> > and advertising data fields as it is just created.
> 
> Do you mean the phydev for the PHY in the SFP? Or do you have a second
> phydev here? I'm confused.

I am a little confused.  There may be regular PHY using SGMII with MDIO
access just like a RGMII PHY, but we are dealing specifically SFP here.
The KSZ9477 switch board has a SFP cage where different SFP can be
plugged in.  The SFP driver has to be enabled in kernel configuration
under Hardware Monitoring.  The driver then can read the EEPROM of the
SFP and access its PHY if available and provide support to the phylink
driver.

When the SFP EEPROM says it does not support 1000Base-T then the SFP bus
code does not consider the SFP has a PHY and skips creating a MDIO bus
for it and phylink_sfp_config_optical() is called to create the phylink.

When the SFP says it supports 1000Base-T sfp_add_phy() is called by the
SFP state machine and phylink_sfp_connect_phy() and
phylink_sfp_config_phy() are run.  It is in the last function that the
validation fails as the just created phy device does not initialize its
supported and advertising fields yet.  The phy device has the
opportunity later to fill them up if the phylink creation goes through,
but that never happens.

A fix is to fill those fields with sfp_support like this:

@@ -3228,6 +3228,11 @@ static int phylink_sfp_config_phy(struct
    struct phylink_link_state config;
    int ret;

+    /* The newly created PHY device has empty settings. */
+    if (linkmode_empty(phy->supported)) {
+        linkmode_copy(phy->supported, pl->sfp_support);
+        linkmode_copy(phy->advertising, pl->sfp_support);
+    }
    linkmode_copy(support, phy->supported);

    memset(&config, 0, sizeof(config));

The provided PCS driver from the DSA driver has an opportunity to change
support with its validation check, but that does not look right as
generally those checks remove certain bits from the link mode, but this
requires completely copying new ones.  And this still does not work as
the advertising field passed to the PCS driver has a const modifier.

> > I mentioned the SGMII module operates differently for two types of SFP:
> > SGMII and 1000BASEX.  The 1000Base-T SFP operates the same as 1000Base-SX
> > fiber SFP, and the driver would like it to be assigned
> > PHY_INTERFACE_MODE_1000BASEX, but it is always assigned
> > PHY_INTERFACE_MODE_SGMII in sfp_select_interface because 1000baseT_Full
> > is compared before 1000baseX_Full.
> >
> > Now I am not sure if those SFPs I tested have correct EEPROM.  Some
> > no-brand ones return 0xff value when the PHY driver reads the link status
> > from them and so that driver cannot tell when the link is down.  Other
> > than that those SFPs operate correctly in forwarding traffic.
> 
> There is no standardisation of how you access the PHY in an SFP. So
> each manufacture can do their own thing. However, there are a small
> number of PHYs actually used inside SFPs, and we have support for
> those common ones.

Now back to the discussion of the different modes used by the SGMII
module.  I think a better term like SerDes can be used to help
understanding the operation, although I still cannot narrow down the
precise definitions from looking at the internet.  SGMII mode is
said to support 10/100/1000Mbit.  This is the default setting, so
plugging such SFP allows the port to communicate without any register
programming.  The other mode is SerDes, which is fixed at 1000Mbit.  This
is typically used by SFP using fiber optics.  This requires changing a
register to make the port works.  It seems those 1000Base-T SFPs all run
in SerDes mode, at least from all SFPs I tried.

The issue is then phylink assigns SGMII phy mode to such SFP as its
EEPROM just says 1000Base-T support and not 1000BASEX phy mode so that
the DSA driver can program the register correspondingly.  Because of that
the driver still needs to rely on its own detection to find out which
mode to use.
 
> Have you set pcs.poll? phylink will then poll the PCS every
> second. You can report PCS status any time.

I know about PCS polling.  The SFP cage driver can provide link_up and
link_down indications to the phylink driver.  I thought this feature can
be used without activating polling and when interrupt is not used.  But
that link_up indication can result with the port not connected as I
mentioned.

Anyway the SFP driver has its own state machine doing polling, so it is
not like resources are not used.

One more issue is if a SFP is not plugged in eventually the SFP driver
says "please wait, module slow to respond."  It may look like an error
to regular users.  The next error message definitely looks like it:
"failed to read EEPROM: -EREMOTEIO."

Do you know if anything can be done like defining some GPIO pins like
setting up the tx_disable pin?
Vladimir Oltean Nov. 13, 2024, 2:42 p.m. UTC | #9
On Wed, Nov 13, 2024 at 02:12:36AM +0000, Tristram.Ha@microchip.com wrote:
> When the SFP says it supports 1000Base-T sfp_add_phy() is called by the
> SFP state machine and phylink_sfp_connect_phy() and
> phylink_sfp_config_phy() are run.  It is in the last function that the
> validation fails as the just created phy device does not initialize its
> supported and advertising fields yet.  The phy device has the
> opportunity later to fill them up if the phylink creation goes through,
> but that never happens.
> 
> A fix is to fill those fields with sfp_support like this:
> 
> @@ -3228,6 +3228,11 @@ static int phylink_sfp_config_phy(struct
>     struct phylink_link_state config;
>     int ret;
> 
> +    /* The newly created PHY device has empty settings. */
> +    if (linkmode_empty(phy->supported)) {
> +        linkmode_copy(phy->supported, pl->sfp_support);
> +        linkmode_copy(phy->advertising, pl->sfp_support);
> +    }
>     linkmode_copy(support, phy->supported);
> 
>     memset(&config, 0, sizeof(config));
> 
> The provided PCS driver from the DSA driver has an opportunity to change
> support with its validation check, but that does not look right as
> generally those checks remove certain bits from the link mode, but this
> requires completely copying new ones.  And this still does not work as
> the advertising field passed to the PCS driver has a const modifier.

I think I know what's happening, it's unfortunate it pushed you towards
wrong conclusions.

The "fix" you posted is wrong, and no, the PCS driver should not expand
the supported mask, just restrict it as you said. The phydev->supported
mask normally comes from the phy_probe() logic:

	/* Start out supporting everything. Eventually,
	 * a controller will attach, and may modify one
	 * or both of these values
	 */
	if (phydrv->features) {
		linkmode_copy(phydev->supported, phydrv->features);
		genphy_c45_read_eee_abilities(phydev);
	}
	else if (phydrv->get_features)
		err = phydrv->get_features(phydev);
	else if (phydev->is_c45)
		err = genphy_c45_pma_read_abilities(phydev);
	else
		err = genphy_read_abilities(phydev);

The SFP bus code depends strictly on sfp_sm_probe_phy() -> phy_device_register()
actually loading a precise device driver for the PHY synchronously via
phy_bus_match(). There is another lazy loading mechanism later in
phy_attach_direct(), for the Generic PHY driver:

	/* Assume that if there is no driver, that it doesn't
	 * exist, and we should use the genphy driver.
	 */

but that is too late for this code path, because as you say,
phylink_sfp_config_phy() is coded up to only call phylink_attach_phy()
if phylink_validate() succeeds. But phylink_validate() will only see a
valid phydev->supported mask with the Generic PHY driver if we let that
driver attach in phylink_attach_phy() in the first place.

Personally, I think SFP modules with embedded PHYs strictly require the
matching driver to be available to the kernel, due to that odd way in
which the Generic PHY driver is loaded, but I will let the PHY library
experts share their opinion as well.

You would be better off improving the error message, see what PHY ID you
get, then find and load the driver for it:

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 7dbcbf0a4ee2..8be473a7d262 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -1817,9 +1817,12 @@ static int sfp_sm_probe_phy(struct sfp *sfp, int addr, bool is_c45)
 
 	err = sfp_add_phy(sfp->sfp_bus, phy);
 	if (err) {
+		dev_err(sfp->dev,
+			"sfp_add_phy() for PHY %s (ID 0x%.8lx) failed: %pe, maybe PHY driver not loaded?\n",
+			phydev_name(phy), (unsigned long)phy->phy_id,
+			ERR_PTR(err));
 		phy_device_remove(phy);
 		phy_device_free(phy);
-		dev_err(sfp->dev, "sfp_add_phy failed: %pe\n", ERR_PTR(err));
 		return err;
 	}
 

Chances are it's one of CONFIG_MARVELL_PHY or CONFIG_AQUANTIA_PHY.
Andrew Lunn Nov. 14, 2024, 1:43 a.m. UTC | #10
> When the SFP EEPROM says it does not support 1000Base-T then the SFP bus
> code does not consider the SFP has a PHY and skips creating a MDIO bus
> for it and phylink_sfp_config_optical() is called to create the phylink.

There are many SFPs out there with broken EEPROM contents. Do the SFPs
you have say they are not 1000Base-T but actually are? If so, they are
broken, and need a quirk adding.

Russell King keeps a database of SFP EEPROM contents. Send him the
output of `ethtool -m eth42 raw on hex on` 

> Now back to the discussion of the different modes used by the SGMII
> module.  I think a better term like SerDes can be used to help
> understanding the operation, although I still cannot narrow down the
> precise definitions from looking at the internet.  SGMII mode is
> said to support 10/100/1000Mbit.  This is the default setting, so
> plugging such SFP allows the port to communicate without any register
> programming.  The other mode is SerDes, which is fixed at 1000Mbit.  This
> is typically used by SFP using fiber optics.  This requires changing a
> register to make the port works.  It seems those 1000Base-T SFPs all run
> in SerDes mode, at least from all SFPs I tried.

There is a comment in the code:

/* Probe a SFP for a PHY device if the module supports copper - the PHY
 * normally sits at I2C bus address 0x56, and may either be a clause 22
 * or clause 45 PHY.
 *
 * Clause 22 copper SFP modules normally operate in Cisco SGMII mode with
 * negotiation enabled, but some may be in 1000base-X - which is for the
 * PHY driver to determine.

So the default is SGMII for copper SFPs, but there are a few oddballs
using 1000BaseX. The Marvell PHY driver should figure this out, and
the phylink will tell you want mode to use.


> The issue is then phylink assigns SGMII phy mode to such SFP as its
> EEPROM just says 1000Base-T support and not 1000BASEX phy mode so that
> the DSA driver can program the register correspondingly.  Because of that
> the driver still needs to rely on its own detection to find out which
> mode to use.
>  
> > Have you set pcs.poll? phylink will then poll the PCS every
> > second. You can report PCS status any time.
> 
> I know about PCS polling.  The SFP cage driver can provide link_up and
> link_down indications to the phylink driver.

The SPF cage provides LOS, Loss of Signal. This basically means there
is light coming into the SFP, but not much more. It is not a
trustworthy signal on its own. Phylink combines this with the PCS
status, does the PCS also have link. You need the combination.

> One more issue is if a SFP is not plugged in eventually the SFP driver
> says "please wait, module slow to respond."

Something is wrong with your GPIOs. Phylink thinks there is a module
inserted, when in fact there is not. Add #define DEBUG 1 to the very
top of sfp.c, so you can see the state transitions. I guess there is
something wrong with the MODDEF0 GPIO.

	Andrew
Tristram.Ha@microchip.com Nov. 15, 2024, 1:53 a.m. UTC | #11
> On Wed, Nov 13, 2024 at 02:12:36AM +0000, Tristram.Ha@microchip.com wrote:
> > When the SFP says it supports 1000Base-T sfp_add_phy() is called by the
> > SFP state machine and phylink_sfp_connect_phy() and
> > phylink_sfp_config_phy() are run.  It is in the last function that the
> > validation fails as the just created phy device does not initialize its
> > supported and advertising fields yet.  The phy device has the
> > opportunity later to fill them up if the phylink creation goes through,
> > but that never happens.
> >
> > A fix is to fill those fields with sfp_support like this:
> >
> > @@ -3228,6 +3228,11 @@ static int phylink_sfp_config_phy(struct
> >     struct phylink_link_state config;
> >     int ret;
> >
> > +    /* The newly created PHY device has empty settings. */
> > +    if (linkmode_empty(phy->supported)) {
> > +        linkmode_copy(phy->supported, pl->sfp_support);
> > +        linkmode_copy(phy->advertising, pl->sfp_support);
> > +    }
> >     linkmode_copy(support, phy->supported);
> >
> >     memset(&config, 0, sizeof(config));
> >
> > The provided PCS driver from the DSA driver has an opportunity to change
> > support with its validation check, but that does not look right as
> > generally those checks remove certain bits from the link mode, but this
> > requires completely copying new ones.  And this still does not work as
> > the advertising field passed to the PCS driver has a const modifier.
> 
> I think I know what's happening, it's unfortunate it pushed you towards
> wrong conclusions.
> 
> The "fix" you posted is wrong, and no, the PCS driver should not expand
> the supported mask, just restrict it as you said. The phydev->supported
> mask normally comes from the phy_probe() logic:
> 
>         /* Start out supporting everything. Eventually,
>          * a controller will attach, and may modify one
>          * or both of these values
>          */
>         if (phydrv->features) {
>                 linkmode_copy(phydev->supported, phydrv->features);
>                 genphy_c45_read_eee_abilities(phydev);
>         }
>         else if (phydrv->get_features)
>                 err = phydrv->get_features(phydev);
>         else if (phydev->is_c45)
>                 err = genphy_c45_pma_read_abilities(phydev);
>         else
>                 err = genphy_read_abilities(phydev);
> 
> The SFP bus code depends strictly on sfp_sm_probe_phy() -> phy_device_register()
> actually loading a precise device driver for the PHY synchronously via
> phy_bus_match(). There is another lazy loading mechanism later in
> phy_attach_direct(), for the Generic PHY driver:
> 
>         /* Assume that if there is no driver, that it doesn't
>          * exist, and we should use the genphy driver.
>          */
> 
> but that is too late for this code path, because as you say,
> phylink_sfp_config_phy() is coded up to only call phylink_attach_phy()
> if phylink_validate() succeeds. But phylink_validate() will only see a
> valid phydev->supported mask with the Generic PHY driver if we let that
> driver attach in phylink_attach_phy() in the first place.
> 
> Personally, I think SFP modules with embedded PHYs strictly require the
> matching driver to be available to the kernel, due to that odd way in
> which the Generic PHY driver is loaded, but I will let the PHY library
> experts share their opinion as well.
> 
> You would be better off improving the error message, see what PHY ID you
> get, then find and load the driver for it:
> 
> diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
> index 7dbcbf0a4ee2..8be473a7d262 100644
> --- a/drivers/net/phy/sfp.c
> +++ b/drivers/net/phy/sfp.c
> @@ -1817,9 +1817,12 @@ static int sfp_sm_probe_phy(struct sfp *sfp, int addr, bool
> is_c45)
> 
>         err = sfp_add_phy(sfp->sfp_bus, phy);
>         if (err) {
> +               dev_err(sfp->dev,
> +                       "sfp_add_phy() for PHY %s (ID 0x%.8lx) failed: %pe, maybe PHY driver
> not loaded?\n",
> +                       phydev_name(phy), (unsigned long)phy->phy_id,
> +                       ERR_PTR(err));
>                 phy_device_remove(phy);
>                 phy_device_free(phy);
> -               dev_err(sfp->dev, "sfp_add_phy failed: %pe\n", ERR_PTR(err));
>                 return err;
>         }
> 
> 
> Chances are it's one of CONFIG_MARVELL_PHY or CONFIG_AQUANTIA_PHY.

Indeed adding the Marvell PHY driver fixed the problem.

There is nothing special about the Marvell PHY driver.  It is just
phy_probe() is called during PHY device creation just as you said.

It may not be right to use sfp_support, but all three (sfp_support,
supported, advertising) have about the same value at that point after the
PHY driver is invoked: 0x62ff and 0x60f0.

I mentioned before that some SFPs have faulty implementation where part
of the returned PHY register value is 0xff.  For example, PHY register 0
is 0x11ff, PHY register 1 is 0x79ff, and PHY register 2 is 0x01ff.  The
Marvell PHY id is 0x01410cc0, and I saw there is a special PHY id
0x01ff0cc0 defined for Finisar to accommodate this situation.  Were those
SFPs made by Finisar originally?

Some of those PHY registers are correct, so I do not know if those wrong
registers are intentional or not, but the link status register always
has 0xff value and that cannot be right.
Tristram.Ha@microchip.com Nov. 15, 2024, 2 a.m. UTC | #12
> > When the SFP EEPROM says it does not support 1000Base-T then the SFP bus
> > code does not consider the SFP has a PHY and skips creating a MDIO bus
> > for it and phylink_sfp_config_optical() is called to create the phylink.
> 
> There are many SFPs out there with broken EEPROM contents. Do the SFPs
> you have say they are not 1000Base-T but actually are? If so, they are
> broken, and need a quirk adding.
> 
> Russell King keeps a database of SFP EEPROM contents. Send him the
> output of `ethtool -m eth42 raw on hex on`
> 
> > Now back to the discussion of the different modes used by the SGMII
> > module.  I think a better term like SerDes can be used to help
> > understanding the operation, although I still cannot narrow down the
> > precise definitions from looking at the internet.  SGMII mode is
> > said to support 10/100/1000Mbit.  This is the default setting, so
> > plugging such SFP allows the port to communicate without any register
> > programming.  The other mode is SerDes, which is fixed at 1000Mbit.  This
> > is typically used by SFP using fiber optics.  This requires changing a
> > register to make the port works.  It seems those 1000Base-T SFPs all run
> > in SerDes mode, at least from all SFPs I tried.
> 
> There is a comment in the code:
> 
> /* Probe a SFP for a PHY device if the module supports copper - the PHY
>  * normally sits at I2C bus address 0x56, and may either be a clause 22
>  * or clause 45 PHY.
>  *
>  * Clause 22 copper SFP modules normally operate in Cisco SGMII mode with
>  * negotiation enabled, but some may be in 1000base-X - which is for the
>  * PHY driver to determine.
> 
> So the default is SGMII for copper SFPs, but there are a few oddballs
> using 1000BaseX. The Marvell PHY driver should figure this out, and
> the phylink will tell you want mode to use.

Adding Marvell PHY driver fixes the main issue I raised.  But the PHY
support still shows only SGMII interface inside the PHY driver, and the
ethtool module-info command shows 1000BASE-T.  The only good brand-name
SFPs I have are all 10/100/1000 type.  I am going to get other brands to
see if they get better or always use SerDes mode.

That leaves the one situation where the SGMII port is connected directly
to a MAC or each other.  A customer once tried to do that and the SGMII
register write was changed to support that, but I do not know if that
project became a real product.  Microchip does have a board connecting
the SGMII ports of two chips together, but that does not run Linux, and
there is no special configuration to make the ports work.

As there is no SFP or PHY to tell phylink which interface to use I think
the only solution is to declare fixed-link or some other phy modes in the
device tree?

I currently use fixed-link to handle this situation.  There is another
new issue though.

The SGMII port in another chip can use 2.5G.  The driver uses fixed PHY
to get the MAC running.  But the fixed PHY driver can only support speed
up to 1000.  There is no issue to adding higher speeds to that driver, but
I think that is not advised?
Andrew Lunn Nov. 15, 2024, 6:08 p.m. UTC | #13
> That leaves the one situation where the SGMII port is connected directly
> to a MAC or each other.  A customer once tried to do that and the SGMII
> register write was changed to support that, but I do not know if that
> project became a real product.

This is often done to cascade switches. In this setup, going down to
100Mbps or 10Mbps makes no sense, so 1000BaseX is used, not
SGMII. Today, fixed-link is used in this situation, combined with
setting phy-mode to 1000basex. Russell King has said in the past that
phylink could probably support this without fixed-link.

> The SGMII port in another chip can use 2.5G.  The driver uses fixed PHY
> to get the MAC running.  But the fixed PHY driver can only support speed
> up to 1000.  There is no issue to adding higher speeds to that driver, but
> I think that is not advised?

Well, 2.5G is obviously not SGMII. It is 2500BaseX. There are also
many broken 2500BaseX implementations out there, which are SGMII cores
overclocked, and the signalling disabled, because SGMII signalling at
2.5G makes no sense, you want 2500BaseX signalling.

phylink fixed link also is not limited to 1G, it can do any speed.

	Andrew
diff mbox series

Patch

diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c
index 0ba658a72d8f..1a53980b6e80 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -2,7 +2,7 @@ 
 /*
  * Microchip KSZ9477 switch driver main logic
  *
- * Copyright (C) 2017-2019 Microchip Technology Inc.
+ * Copyright (C) 2017-2024 Microchip Technology Inc.
  */
 
 #include <linux/kernel.h>
@@ -12,6 +12,8 @@ 
 #include <linux/phy.h>
 #include <linux/if_bridge.h>
 #include <linux/if_vlan.h>
+#include <linux/irqdomain.h>
+#include <linux/phylink.h>
 #include <net/dsa.h>
 #include <net/switchdev.h>
 
@@ -161,6 +163,382 @@  static int ksz9477_wait_alu_sta_ready(struct ksz_device *dev)
 					10, 1000);
 }
 
+static void port_sgmii_s(struct ksz_device *dev, uint port, u16 devid, u16 reg,
+			 u16 len)
+{
+	u32 data;
+
+	data = devid & PORT_SGMII_DEVICE_ID_M;
+	data <<= PORT_SGMII_DEVICE_ID_S;
+	data |= reg;
+	if (len > 1)
+		data |= PORT_SGMII_AUTO_INCR;
+	ksz_pwrite32(dev, port, REG_PORT_SGMII_ADDR__4, data);
+}
+
+static void port_sgmii_r(struct ksz_device *dev, uint port, u16 devid, u16 reg,
+			 u16 *buf, u16 len)
+{
+	u32 data;
+
+	port_sgmii_s(dev, port, devid, reg, len);
+	while (len) {
+		ksz_pread32(dev, port, REG_PORT_SGMII_DATA__4, &data);
+		*buf++ = (u16)data;
+		len--;
+	}
+}
+
+static void port_sgmii_w(struct ksz_device *dev, uint port, u16 devid, u16 reg,
+			 u16 *buf, u16 len)
+{
+	u32 data;
+
+	port_sgmii_s(dev, port, devid, reg, len);
+	while (len) {
+		data = *buf++;
+		ksz_pwrite32(dev, port, REG_PORT_SGMII_DATA__4, data);
+		len--;
+	}
+}
+
+static int port_sgmii_detect(struct ksz_device *dev, uint p)
+{
+	struct ksz_port *port = &dev->ports[p];
+	int ret = 0;
+
+	if (dev->sgmii_mode) {
+		u16 buf[6];
+		int i = 0;
+
+		do {
+			port_sgmii_r(dev, p, SR_MII, 0, buf, 6);
+			i++;
+		} while (!buf[5] && i < 10);
+		if (buf[5] & SR_MII_REMOTE_ACK) {
+			if (buf[5] & (SR_MII_REMOTE_HALF_DUPLEX |
+				      SR_MII_REMOTE_FULL_DUPLEX))
+				port->fiber = 1;
+			else if (dev->sgmii_mode == 1)
+				dev->sgmii_mode = 2;
+			ret = 1;
+		} else if (dev->sgmii_mode == 1) {
+			port->fiber = 1;
+			ret = 1;
+		}
+	} else {
+		/* Need to be told to run in direct mode. */
+		port->fiber = 1;
+		ret = 1;
+	}
+	return ret;
+}
+
+static void port_sgmii_reset(struct ksz_device *dev, uint p)
+{
+	u16 ctrl;
+
+	port_sgmii_r(dev, p, SR_MII, MMD_SR_MII_CTRL, &ctrl, 1);
+	ctrl |= SR_MII_RESET;
+	port_sgmii_w(dev, p, SR_MII, MMD_SR_MII_CTRL, &ctrl, 1);
+}
+
+static void port_sgmii_setup(struct ksz_device *dev, uint p, bool pcs,
+			     bool master, bool autoneg, bool intr, int speed,
+			     int duplex)
+{
+	u16 ctrl;
+	u16 cfg;
+	u16 adv;
+
+	cfg = 0;
+	if (pcs)
+		cfg |= SR_MII_PCS_SGMII << SR_MII_PCS_MODE_S;
+	if (master) {
+		cfg |= SR_MII_TX_CFG_PHY_MASTER;
+		cfg |= SR_MII_SGMII_LINK_UP;
+	}
+	if (intr)
+		cfg |= SR_MII_AUTO_NEG_COMPLETE_INTR;
+	port_sgmii_w(dev, p, SR_MII, MMD_SR_MII_AUTO_NEG_CTRL, &cfg, 1);
+	port_sgmii_r(dev, p, SR_MII, MMD_SR_MII_CTRL, &ctrl, 1);
+	if (master || !autoneg) {
+		switch (speed) {
+		case SPEED_100:
+			ctrl |= SR_MII_SPEED_100MBIT;
+			break;
+		case SPEED_1000:
+			ctrl |= SR_MII_SPEED_1000MBIT;
+			break;
+		}
+	}
+	if (!autoneg) {
+		ctrl &= ~SR_MII_AUTO_NEG_ENABLE;
+		port_sgmii_w(dev, p, SR_MII, MMD_SR_MII_CTRL, &ctrl, 1);
+		return;
+	} else if (!(ctrl & SR_MII_AUTO_NEG_ENABLE)) {
+		ctrl |= SR_MII_AUTO_NEG_ENABLE;
+		port_sgmii_w(dev, p, SR_MII, MMD_SR_MII_CTRL, &ctrl, 1);
+	}
+
+	/* Need to write to advertise register to send correct signal. */
+	/* Default value is 0x0020. */
+	adv = SR_MII_AUTO_NEG_ASYM_PAUSE_RX << SR_MII_AUTO_NEG_PAUSE_S;
+	if (duplex == DUPLEX_FULL)
+		adv |= SR_MII_AUTO_NEG_FULL_DUPLEX;
+	else
+		adv |= SR_MII_AUTO_NEG_HALF_DUPLEX;
+	port_sgmii_w(dev, p, SR_MII, MMD_SR_MII_AUTO_NEGOTIATION, &adv, 1);
+	if (master && autoneg) {
+		ctrl |= SR_MII_AUTO_NEG_RESTART;
+		port_sgmii_w(dev, p, SR_MII, MMD_SR_MII_CTRL, &ctrl, 1);
+	}
+}
+
+#define PORT_LINK_UP		BIT(0)
+#define PORT_LINK_CHANGE	BIT(1)
+#define PORT_LINK_INVALID	BIT(2)
+
+static int sgmii_port_get_speed(struct ksz_device *dev, uint p)
+{
+	struct ksz_port *info = &dev->ports[p];
+	int ret = 0;
+	u16 status;
+	u16 speed;
+	u16 data;
+	u8 link;
+
+	port_sgmii_r(dev, p, SR_MII, MMD_SR_MII_STATUS, &status, 1);
+	port_sgmii_r(dev, p, SR_MII, MMD_SR_MII_STATUS, &status, 1);
+	port_sgmii_r(dev, p, SR_MII, MMD_SR_MII_AUTO_NEG_STATUS, &data, 1);
+
+	/* Typical register values in different modes.
+	 * 10/100/1000: 1f0001 = 01ad  1f0005 = 4000  1f8002 = 0008
+	 *              1f0001 = 01bd  1f0005 = d000  1f8002 = 001a
+	 * 1000:        1f0001 = 018d  1f0005 = 0000  1f8002 = 0000
+	 *              1f0001 = 01ad  1f0005 = 40a0  1f8002 = 0001
+	 *              1f0001 = 01ad  1f0005 = 41a0  1f8002 = 0001
+	 * fiber:       1f0001 = 0189  1f0005 = 0000  1f8002 = 0000
+	 *              1f0001 = 01ad  1f0005 = 41a0  1f8002 = 0001
+	 */
+
+	if (data & SR_MII_AUTO_NEG_COMPLETE_INTR) {
+		data &= ~SR_MII_AUTO_NEG_COMPLETE_INTR;
+		port_sgmii_w(dev, p, SR_MII, MMD_SR_MII_AUTO_NEG_STATUS, &data,
+			     1);
+	}
+
+	/* Running in fiber mode. */
+	if (info->fiber && !data) {
+		u16 link_up = PORT_LINK_STATUS;
+
+		if (dev->sgmii_mode == 1)
+			link_up |= PORT_AUTO_NEG_ACKNOWLEDGE;
+		if ((status & link_up) == link_up)
+			data = SR_MII_STAT_LINK_UP |
+			       (SR_MII_STAT_1000_MBPS << SR_MII_STAT_S) |
+			       SR_MII_STAT_FULL_DUPLEX;
+	}
+	if (data & SR_MII_STAT_LINK_UP) {
+		ret = PORT_LINK_UP;
+	} else if (info->interface == PHY_INTERFACE_MODE_1000BASEX &&
+		 status == 0x018d) {
+		info->sgmii_link = 0xff;
+		return PORT_LINK_INVALID;
+	}
+
+	link = (data & ~SR_MII_AUTO_NEG_COMPLETE_INTR);
+	if (info->sgmii_link == link)
+		return ret;
+
+	if (data & SR_MII_STAT_LINK_UP) {
+		u16 ctrl;
+
+		/* Need to update control register with same link setting. */
+		ctrl = SR_MII_AUTO_NEG_ENABLE;
+		speed = (data >> SR_MII_STAT_S) & SR_MII_STAT_M;
+		if (speed == SR_MII_STAT_1000_MBPS)
+			ctrl |= SR_MII_SPEED_1000MBIT;
+		else if (speed == SR_MII_STAT_100_MBPS)
+			ctrl |= SR_MII_SPEED_100MBIT;
+		if (data & SR_MII_STAT_FULL_DUPLEX)
+			ctrl |= SR_MII_FULL_DUPLEX;
+		port_sgmii_w(dev, p, SR_MII, MMD_SR_MII_CTRL, &ctrl, 1);
+
+		speed = (data >> SR_MII_STAT_S) & SR_MII_STAT_M;
+		info->phydev.speed = SPEED_10;
+		if (speed == SR_MII_STAT_1000_MBPS)
+			info->phydev.speed = SPEED_1000;
+		else if (speed == SR_MII_STAT_100_MBPS)
+			info->phydev.speed = SPEED_100;
+
+		info->phydev.duplex = 0;
+		if (data & SR_MII_STAT_FULL_DUPLEX)
+			info->phydev.duplex = 1;
+	}
+	ret |= PORT_LINK_CHANGE;
+	info->sgmii_link = link;
+	info->phydev.link = (ret & PORT_LINK_UP);
+	return ret;
+}
+
+static bool sgmii_need_polling(struct ksz_device *dev, struct ksz_port *p)
+{
+	/* SGMII mode 2 has link up and link down interrupts. */
+	if (dev->sgmii_mode == 2 && p->sgmii_has_intr)
+		return false;
+
+	/* SGMII mode 1 has link up interrupt but not link down interrupt. */
+	if (dev->sgmii_mode == 1 && p->sgmii_has_intr && !p->phydev.link)
+		return false;
+
+	/* SGMII mode 0 for direct connect has no link change. */
+	if (dev->sgmii_mode == 0)
+		return false;
+	return true;
+}
+
+static void sgmii_check_work(struct work_struct *work)
+{
+	struct ksz_device *dev = container_of(work, struct ksz_device,
+					      sgmii_check.work);
+	u8 port = dev->info->sgmii_port - 1;
+	struct ksz_port *p = &dev->ports[port];
+	int ret;
+
+	ret = sgmii_port_get_speed(dev, port);
+	if (ret & PORT_LINK_CHANGE) {
+		struct phy_device *phydev;
+
+		/* When simulating PHY. */
+		p->phydev.interrupts = PHY_INTERRUPT_ENABLED;
+		phydev = mdiobus_get_phy(dev->ds->user_mii_bus, port);
+		if (phydev)
+			phy_trigger_machine(phydev);
+
+		/* When using SFP code. */
+		dsa_port_phylink_mac_change(dev->ds, port, p->phydev.link);
+	}
+	if (sgmii_need_polling(dev, p))
+		schedule_delayed_work(&dev->sgmii_check,
+				      msecs_to_jiffies(500));
+}
+
+static irqreturn_t ksz9477_sgmii_irq_thread_fn(int irq, void *dev_id)
+{
+	struct ksz_pcs *priv = dev_id;
+	struct ksz_device *dev = priv->dev;
+	u8 port = priv->port;
+	u16 data16 = 0;
+	int ret;
+
+	port_sgmii_w(dev, port, SR_MII, MMD_SR_MII_AUTO_NEG_STATUS, &data16, 1);
+	ret = sgmii_port_get_speed(dev, port);
+	if (ret & PORT_LINK_CHANGE) {
+		struct ksz_port *p = &dev->ports[port];
+		struct phy_device *phydev;
+
+		/* When simulating PHY. */
+		p->phydev.interrupts = PHY_INTERRUPT_ENABLED;
+		phydev = mdiobus_get_phy(dev->ds->user_mii_bus, port);
+		if (phydev)
+			phy_trigger_machine(phydev);
+
+		/* When using SFP code. */
+		dsa_port_phylink_mac_change(dev->ds, port, p->phydev.link);
+
+		/* No interrupt for link down. */
+		if (sgmii_need_polling(dev, p))
+			schedule_delayed_work(&dev->sgmii_check,
+					      msecs_to_jiffies(500));
+	}
+	return IRQ_HANDLED;
+}
+
+static void sgmii_initial_setup(struct ksz_device *dev, int port)
+{
+	struct ksz_port *p = &dev->ports[port];
+	/* Assume SGMII mode is 2. */
+	bool autoneg = true;
+	bool master = false;
+	bool intr = false;
+	bool pcs = true;
+	int irq, ret;
+
+	/* Only setup SGMII port once. */
+	if (!p->sgmii || p->sgmii_setup)
+		return;
+
+	irq = irq_find_mapping(p->pirq.domain, PORT_SGMII_INT_LOC);
+	if (irq > 0) {
+		ret = request_threaded_irq(irq, NULL,
+					   ksz9477_sgmii_irq_thread_fn,
+					   IRQF_ONESHOT, "SGMII",
+					   p->pcs_priv);
+		if (!ret) {
+			intr = true;
+			p->sgmii_has_intr = 1;
+		}
+	}
+
+	/* Make invalid so the correct value is set. */
+	p->sgmii_link = 0xff;
+
+	INIT_DELAYED_WORK(&dev->sgmii_check, sgmii_check_work);
+	if (dev->sgmii_mode == 0) {
+		master = true;
+		autoneg = false;
+	} else if (dev->sgmii_mode == 1) {
+		pcs = false;
+		master = true;
+	}
+	port_sgmii_setup(dev, port, pcs, master, autoneg, intr, SPEED_1000,
+			 DUPLEX_FULL);
+
+	p->sgmii_setup = 1;
+	sgmii_port_get_speed(dev, port);
+
+	/* Need to check link down if using fiber SFP. */
+	if (sgmii_need_polling(dev, p))
+		schedule_delayed_work(&dev->sgmii_check,
+				      msecs_to_jiffies(500));
+}
+
+void ksz9477_pcs_get_state(struct phylink_pcs *pcs,
+			   struct phylink_link_state *state)
+{
+	struct ksz_pcs *priv = container_of(pcs, struct ksz_pcs, pcs);
+	struct ksz_device *dev = priv->dev;
+	struct ksz_port *p = &dev->ports[priv->port];
+	int ret;
+
+	ret = sgmii_port_get_speed(dev, priv->port);
+	if (!(ret & PORT_LINK_UP))
+		state->link = false;
+	state->duplex = p->phydev.duplex;
+	state->speed = p->phydev.speed;
+	if (state->interface == PHY_INTERFACE_MODE_SGMII) {
+		state->an_complete = state->link;
+	} else if (state->interface == PHY_INTERFACE_MODE_1000BASEX) {
+		if (ret == PORT_LINK_INVALID)
+			schedule_delayed_work(&dev->sgmii_check,
+					      msecs_to_jiffies(200));
+	}
+}
+
+void ksz9477_pcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
+			 phy_interface_t interface, int speed, int duplex)
+{
+	struct ksz_pcs *priv = container_of(pcs, struct ksz_pcs, pcs);
+	struct ksz_device *dev = priv->dev;
+	struct ksz_port *p = &dev->ports[priv->port];
+
+	/* No interrupt for link down. */
+	if (sgmii_need_polling(dev, p))
+		schedule_delayed_work(&dev->sgmii_check,
+				      msecs_to_jiffies(500));
+}
+
 int ksz9477_reset_switch(struct ksz_device *dev)
 {
 	u8 data8;
@@ -345,7 +723,7 @@  int ksz9477_r_phy(struct ksz_device *dev, u16 addr, u16 reg, u16 *data)
 	 * A fixed PHY can be setup in the device tree, but this function is
 	 * still called for that port during initialization.
 	 * For RGMII PHY there is no way to access it so the fixed PHY should
-	 * be used.  For SGMII PHY the supporting code will be added later.
+	 * be used.  SGMII PHY is simulated as a regular PHY.
 	 */
 	if (!dev->info->internal_phy[addr]) {
 		struct ksz_port *p = &dev->ports[addr];
@@ -355,7 +733,10 @@  int ksz9477_r_phy(struct ksz_device *dev, u16 addr, u16 reg, u16 *data)
 			val = 0x1140;
 			break;
 		case MII_BMSR:
-			val = 0x796d;
+			if (p->phydev.link)
+				val = 0x796d;
+			else
+				val = 0x7949;
 			break;
 		case MII_PHYSID1:
 			val = 0x0022;
@@ -368,6 +749,10 @@  int ksz9477_r_phy(struct ksz_device *dev, u16 addr, u16 reg, u16 *data)
 			break;
 		case MII_LPA:
 			val = 0xc5e1;
+			if (p->phydev.speed == SPEED_10)
+				val &= ~0x0180;
+			if (p->phydev.duplex == 0)
+				val &= ~0x0140;
 			break;
 		case MII_CTRL1000:
 			val = 0x0700;
@@ -378,6 +763,24 @@  int ksz9477_r_phy(struct ksz_device *dev, u16 addr, u16 reg, u16 *data)
 			else
 				val = 0;
 			break;
+		case MII_ESTATUS:
+			val = 0x3000;
+			break;
+
+		/* This register holds the PHY interrupt status. */
+		case MII_TPISTATUS:
+			val = (LINK_DOWN_INT | LINK_UP_INT) << 8;
+			if (p->phydev.interrupts == PHY_INTERRUPT_ENABLED) {
+				if (p->phydev.link)
+					val |= LINK_UP_INT;
+				else
+					val |= LINK_DOWN_INT;
+			}
+			p->phydev.interrupts = 0;
+			break;
+		default:
+			val = 0;
+			break;
 		}
 	} else {
 		ret = ksz_pread16(dev, addr, 0x100 + (reg << 1), &val);
@@ -973,11 +1376,27 @@  static phy_interface_t ksz9477_get_interface(struct ksz_device *dev, int port)
 void ksz9477_get_caps(struct ksz_device *dev, int port,
 		      struct phylink_config *config)
 {
+	struct ksz_port *p = &dev->ports[port];
+
 	config->mac_capabilities = MAC_10 | MAC_100 | MAC_ASYM_PAUSE |
 				   MAC_SYM_PAUSE;
 
 	if (dev->info->gbit_capable[port])
 		config->mac_capabilities |= MAC_1000FD;
+
+	if (p->sgmii) {
+		struct phy_device *phydev;
+
+		phydev = mdiobus_get_phy(dev->ds->user_mii_bus, port);
+
+		/* Change this port interface to SGMII. */
+		if (phydev)
+			phydev->interface = PHY_INTERFACE_MODE_SGMII;
+		__set_bit(PHY_INTERFACE_MODE_1000BASEX,
+			  config->supported_interfaces);
+		__set_bit(PHY_INTERFACE_MODE_SGMII,
+			  config->supported_interfaces);
+	}
 }
 
 int ksz9477_set_ageing_time(struct ksz_device *dev, unsigned int msecs)
@@ -1054,6 +1473,8 @@  void ksz9477_port_setup(struct ksz_device *dev, int port, bool cpu_port)
 		     PORT_FORCE_TX_FLOW_CTRL | PORT_FORCE_RX_FLOW_CTRL,
 		     !dev->info->internal_phy[port]);
 
+	sgmii_initial_setup(dev, port);
+
 	if (cpu_port)
 		member = dsa_user_ports(ds);
 	else
@@ -1132,6 +1553,10 @@  void ksz9477_config_cpu_port(struct dsa_switch *ds)
 			continue;
 		ksz_port_stp_state_set(ds, i, BR_STATE_DISABLED);
 	}
+
+	/* Switch reset does not reset SGMII module. */
+	if (dev->info->sgmii_port > 0)
+		port_sgmii_reset(dev, dev->info->sgmii_port - 1);
 }
 
 int ksz9477_enable_stp_addr(struct ksz_device *dev)
@@ -1201,6 +1626,23 @@  int ksz9477_setup(struct dsa_switch *ds)
 	/* enable global MIB counter freeze function */
 	ksz_cfg(dev, REG_SW_MAC_CTRL_6, SW_MIB_COUNTER_FREEZE, true);
 
+	if (dev->info->sgmii_port > 0) {
+		struct ksz_port *p = &dev->ports[dev->info->sgmii_port - 1];
+		struct ksz_pcs *pcs_priv;
+
+		p->sgmii = port_sgmii_detect(dev, dev->info->sgmii_port - 1);
+		if (p->sgmii) {
+			pcs_priv = devm_kzalloc(dev->dev,
+						sizeof(struct ksz_pcs),
+						GFP_KERNEL);
+			if (!pcs_priv)
+				return -ENOMEM;
+			p->pcs_priv = pcs_priv;
+			pcs_priv->dev = dev;
+			pcs_priv->port = dev->info->sgmii_port - 1;
+		}
+	}
+
 	/* Make sure PME (WoL) is not enabled. If requested, it will
 	 * be enabled by ksz_wol_pre_shutdown(). Otherwise, some PMICs
 	 * do not like PME events changes before shutdown.
@@ -1319,6 +1761,8 @@  int ksz9477_switch_init(struct ksz_device *dev)
 
 void ksz9477_switch_exit(struct ksz_device *dev)
 {
+	if (delayed_work_pending(&dev->sgmii_check))
+		cancel_delayed_work_sync(&dev->sgmii_check);
 	ksz9477_reset_switch(dev);
 }
 
diff --git a/drivers/net/dsa/microchip/ksz9477.h b/drivers/net/dsa/microchip/ksz9477.h
index d2166b0d881e..24a42d5ee023 100644
--- a/drivers/net/dsa/microchip/ksz9477.h
+++ b/drivers/net/dsa/microchip/ksz9477.h
@@ -2,7 +2,7 @@ 
 /*
  * Microchip KSZ9477 series Header file
  *
- * Copyright (C) 2017-2022 Microchip Technology Inc.
+ * Copyright (C) 2017-2024 Microchip Technology Inc.
  */
 
 #ifndef __KSZ9477_H
@@ -97,4 +97,9 @@  void ksz9477_acl_match_process_l2(struct ksz_device *dev, int port,
 				  u16 ethtype, u8 *src_mac, u8 *dst_mac,
 				  unsigned long cookie, u32 prio);
 
+void ksz9477_pcs_get_state(struct phylink_pcs *pcs,
+			   struct phylink_link_state *state);
+void ksz9477_pcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
+			 phy_interface_t interface, int speed, int duplex);
+
 #endif
diff --git a/drivers/net/dsa/microchip/ksz9477_reg.h b/drivers/net/dsa/microchip/ksz9477_reg.h
index 04235c22bf40..0c15c0b16b42 100644
--- a/drivers/net/dsa/microchip/ksz9477_reg.h
+++ b/drivers/net/dsa/microchip/ksz9477_reg.h
@@ -805,6 +805,7 @@ 
 #define REG_PORT_INT_STATUS		0x001B
 #define REG_PORT_INT_MASK		0x001F
 
+#define PORT_SGMII_INT_LOC		3
 #define PORT_SGMII_INT			BIT(3)
 #define PORT_PTP_INT			BIT(2)
 #define PORT_PHY_INT			BIT(1)
@@ -1025,6 +1026,11 @@ 
 #define SR_MII_AUTO_NEG_FULL_DUPLEX	BIT(5)
 
 #define MMD_SR_MII_REMOTE_CAPABILITY	0x0005
+
+#define SR_MII_REMOTE_ACK		BIT(14)
+#define SR_MII_REMOTE_HALF_DUPLEX	BIT(6)
+#define SR_MII_REMOTE_FULL_DUPLEX	BIT(5)
+
 #define MMD_SR_MII_AUTO_NEG_EXP		0x0006
 #define MMD_SR_MII_AUTO_NEG_EXT		0x000F
 
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index f73833e24622..8163342d778a 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -354,10 +354,30 @@  static void ksz9477_phylink_mac_link_up(struct phylink_config *config,
 					int speed, int duplex, bool tx_pause,
 					bool rx_pause);
 
+static struct phylink_pcs *
+ksz_phylink_mac_select_pcs(struct phylink_config *config,
+			   phy_interface_t interface)
+{
+	struct dsa_port *dp = dsa_phylink_to_port(config);
+	struct ksz_device *dev = dp->ds->priv;
+	struct ksz_port *p = &dev->ports[dp->index];
+
+	if (!p->sgmii)
+		return ERR_PTR(-EOPNOTSUPP);
+	switch (interface) {
+	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_1000BASEX:
+		return &p->pcs_priv->pcs;
+	default:
+		return NULL;
+	}
+}
+
 static const struct phylink_mac_ops ksz9477_phylink_mac_ops = {
 	.mac_config	= ksz_phylink_mac_config,
 	.mac_link_down	= ksz_phylink_mac_link_down,
 	.mac_link_up	= ksz9477_phylink_mac_link_up,
+	.mac_select_pcs	= ksz_phylink_mac_select_pcs,
 };
 
 static const struct ksz_dev_ops ksz9477_dev_ops = {
@@ -395,6 +415,8 @@  static const struct ksz_dev_ops ksz9477_dev_ops = {
 	.reset = ksz9477_reset_switch,
 	.init = ksz9477_switch_init,
 	.exit = ksz9477_switch_exit,
+	.pcs_get_state = ksz9477_pcs_get_state,
+	.pcs_link_up = ksz9477_pcs_link_up,
 };
 
 static const struct phylink_mac_ops lan937x_phylink_mac_ops = {
@@ -1033,8 +1055,7 @@  static const struct regmap_range ksz9477_valid_regs[] = {
 	regmap_reg_range(0x701b, 0x701b),
 	regmap_reg_range(0x701f, 0x7020),
 	regmap_reg_range(0x7030, 0x7030),
-	regmap_reg_range(0x7200, 0x7203),
-	regmap_reg_range(0x7206, 0x7207),
+	regmap_reg_range(0x7200, 0x7207),
 	regmap_reg_range(0x7300, 0x7301),
 	regmap_reg_range(0x7400, 0x7401),
 	regmap_reg_range(0x7403, 0x7403),
@@ -1554,6 +1575,7 @@  const struct ksz_chip_data ksz_switch_chips[] = {
 		.internal_phy	= {true, true, true, true,
 				   true, false, false},
 		.gbit_capable	= {true, true, true, true, true, true, true},
+		.sgmii_port = 7,
 		.wr_table = &ksz9477_register_set,
 		.rd_table = &ksz9477_register_set,
 	},
@@ -1972,6 +1994,45 @@  static void ksz_phylink_get_caps(struct dsa_switch *ds, int port,
 		dev->dev_ops->get_caps(dev, port, config);
 }
 
+static void ksz_pcs_get_state(struct phylink_pcs *pcs,
+			      struct phylink_link_state *state)
+{
+	struct ksz_pcs *priv = container_of(pcs, struct ksz_pcs, pcs);
+	struct ksz_device *dev = priv->dev;
+
+	if (dev->dev_ops->pcs_get_state)
+		dev->dev_ops->pcs_get_state(pcs, state);
+}
+
+static int ksz_pcs_config(struct phylink_pcs *pcs, unsigned int mode,
+			  phy_interface_t interface,
+			  const unsigned long *advertising,
+			  bool permit_pause_to_mac)
+{
+	return 0;
+}
+
+static void ksz_pcs_an_restart(struct phylink_pcs *pcs)
+{
+}
+
+static void ksz_pcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
+			    phy_interface_t interface, int speed, int duplex)
+{
+	struct ksz_pcs *priv = container_of(pcs, struct ksz_pcs, pcs);
+	struct ksz_device *dev = priv->dev;
+
+	if (dev->dev_ops->pcs_link_up)
+		dev->dev_ops->pcs_link_up(pcs, mode, interface, speed, duplex);
+}
+
+static const struct phylink_pcs_ops ksz_pcs_ops = {
+	.pcs_get_state = ksz_pcs_get_state,
+	.pcs_config = ksz_pcs_config,
+	.pcs_an_restart = ksz_pcs_an_restart,
+	.pcs_link_up = ksz_pcs_link_up,
+};
+
 void ksz_r_mib_stats64(struct ksz_device *dev, int port)
 {
 	struct ethtool_pause_stats *pstats;
@@ -2021,7 +2082,7 @@  void ksz_r_mib_stats64(struct ksz_device *dev, int port)
 
 	spin_unlock(&mib->stats64_lock);
 
-	if (dev->info->phy_errata_9477) {
+	if (dev->info->phy_errata_9477 && dev->info->sgmii_port != port + 1) {
 		ret = ksz9477_errata_monitor(dev, port, raw->tx_late_col);
 		if (ret)
 			dev_err(dev->dev, "Failed to monitor transmission halt\n");
@@ -2531,6 +2592,12 @@  static int ksz_setup(struct dsa_switch *ds)
 			return ret;
 	}
 
+	if (dev->info->sgmii_port > 0) {
+		p = &dev->ports[dev->info->sgmii_port - 1];
+		if (p->pcs_priv)
+			p->pcs_priv->pcs.ops = &ksz_pcs_ops;
+	}
+
 	/* Start with learning disabled on standalone user ports, and enabled
 	 * on the CPU port. In lack of other finer mechanisms, learning on the
 	 * CPU port will avoid flooding bridge local addresses on the network
@@ -3355,6 +3422,17 @@  static void ksz_phylink_mac_config(struct phylink_config *config,
 	if (dev->info->internal_phy[port])
 		return;
 
+	/* No need to configure XMII control register when using SGMII. */
+	if (state->interface == PHY_INTERFACE_MODE_SGMII ||
+	    state->interface == PHY_INTERFACE_MODE_1000BASEX) {
+		struct ksz_port *p = &dev->ports[port];
+
+		p->interface = state->interface;
+		return;
+	}
+	if (dev->info->sgmii_port == port + 1)
+		return;
+
 	if (phylink_autoneg_inband(mode)) {
 		dev_err(dev->dev, "In-band AN not supported!\n");
 		return;
@@ -4783,6 +4861,9 @@  int ksz_switch_register(struct ksz_device *dev)
 		dev->ports[i].num = i;
 	}
 
+	/* Default SGMII mode is detecting which type of SFP is used. */
+	dev->sgmii_mode = 1;
+
 	/* set the real number of ports */
 	dev->ds->num_ports = dev->info->port_cnt;
 
@@ -4813,6 +4894,13 @@  int ksz_switch_register(struct ksz_device *dev)
 				of_get_phy_mode(port,
 						&dev->ports[port_num].interface);
 
+				/* SGMII port can be used without using SFP. */
+				if (dev->info->sgmii_port == port_num + 1) {
+					if (of_phy_is_fixed_link(port) &&
+					    !dev->ports[port_num].interface)
+						dev->sgmii_mode = 0;
+				}
+
 				ksz_parse_rgmii_delay(dev, port_num, port);
 			}
 			of_node_put(ports);
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index bec846e20682..12b043bb766a 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -86,6 +86,7 @@  struct ksz_chip_data {
 	bool supports_rgmii[KSZ_MAX_NUM_PORTS];
 	bool internal_phy[KSZ_MAX_NUM_PORTS];
 	bool gbit_capable[KSZ_MAX_NUM_PORTS];
+	u8 sgmii_port;
 	const struct regmap_access_table *wr_table;
 	const struct regmap_access_table *rd_table;
 };
@@ -114,6 +115,13 @@  struct ksz_switch_macaddr {
 	refcount_t refcount;
 };
 
+struct ksz_pcs {
+	struct phylink_pcs pcs;
+	struct ksz_device *dev;
+	int irq;
+	u8 port;
+};
+
 struct ksz_port {
 	bool remove_tag;		/* Remove Tag flag set, for ksz8795 only */
 	bool learning;
@@ -125,6 +133,10 @@  struct ksz_port {
 	u32 force:1;
 	u32 read:1;			/* read MIB counters in background */
 	u32 freeze:1;			/* MIB counter freeze is enabled */
+	u32 sgmii:1;			/* port is SGMII */
+	u32 sgmii_has_intr:1;
+	u32 sgmii_link:8;
+	u32 sgmii_setup:1;
 
 	struct ksz_port_mib mib;
 	phy_interface_t interface;
@@ -134,6 +146,7 @@  struct ksz_port {
 	void *acl_priv;
 	struct ksz_irq pirq;
 	u8 num;
+	struct ksz_pcs *pcs_priv;
 #if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ_PTP)
 	struct hwtstamp_config tstamp_config;
 	bool hwts_tx_en;
@@ -156,6 +169,7 @@  struct ksz_device {
 	struct mutex alu_mutex;		/* ALU access */
 	struct mutex vlan_mutex;	/* vlan access */
 	const struct ksz_dev_ops *dev_ops;
+	const struct phylink_pcs_ops *pcs_ops;
 
 	struct device *dev;
 	struct regmap *regmap[__KSZ_NUM_REGMAPS];
@@ -181,6 +195,8 @@  struct ksz_device {
 	struct ksz_port *ports;
 	struct delayed_work mib_read;
 	unsigned long mib_read_interval;
+	struct delayed_work sgmii_check;
+	u8 sgmii_mode;
 	u16 mirror_rx;
 	u16 mirror_tx;
 	u16 port_mask;
@@ -379,6 +395,11 @@  struct ksz_dev_ops {
 	int (*reset)(struct ksz_device *dev);
 	int (*init)(struct ksz_device *dev);
 	void (*exit)(struct ksz_device *dev);
+
+	void (*pcs_get_state)(struct phylink_pcs *pcs,
+			      struct phylink_link_state *state);
+	void (*pcs_link_up)(struct phylink_pcs *pcs, unsigned int mode,
+			    phy_interface_t interface, int speed, int duplex);
 };
 
 struct ksz_device *ksz_switch_alloc(struct device *base, void *priv);