[RFC,V2,3/3] Ixgbevf: Add migration support for ixgbevf driver

Message ID 1448372298-28386-4-git-send-email-tianyu.lan@intel.com (mailing list archive)
State New, archived

Commit Message

Lan Tianyu Nov. 24, 2015, 1:38 p.m. UTC
This patch adds migration support to the ixgbevf driver. A faked
PCI migration capability table is used to communicate with Qemu,
sharing the migration status and the mailbox irq vector index.

Qemu notifies the VF by sending an MSI-X message that triggers the
mailbox vector during migration, and stores the migration status in
the PCI_VF_MIGRATION_VMM_STATUS reg in the new capability table.
The mailbox irq is triggered just before the stop-and-copy stage
and again after migration on the target machine.

The VF driver brings the netdev down when it detects a migration
and tells Qemu it is ready for migration by writing the
PCI_VF_MIGRATION_VF_STATUS reg. After migration, it brings the
netdev back up.

Qemu is in charge of migrating the PCI config space regs and the
MSI-X config.

This patch only handles the normal case, where net traffic flows
and the mailbox irq is enabled. In the other cases (the driver
isn't loaded, or the adapter is suspended or closed), the mailbox
irq won't be triggered, and the VF driver disables the mechanism
via the PCI_VF_MIGRATION_CAP reg. These cases will be addressed
later.
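
For reference, a minimal sketch of the register layout this protocol
assumes. The capability ID, offsets, and values below are illustrative
only; the real definitions come from the earlier patches of this
series and are not part of this mail:

/* faked migration capability, byte offsets from the cap base */
#define PCI_CAP_ID_MIGRATION		0x1f	/* hypothetical cap ID */
#define PCI_VF_MIGRATION_CAP		0x04	/* enable/disable, set by VF */
#define PCI_VF_MIGRATION_VMM_STATUS	0x05	/* status, written by Qemu */
#define PCI_VF_MIGRATION_VF_STATUS	0x06	/* ready flag, written by VF */
#define PCI_VF_MIGRATION_IRQ		0x07	/* mailbox MSI-X vector index */

/* register values used by this patch */
#define PCI_VF_MIGRATION_DISABLE	0x00
#define PCI_VF_MIGRATION_ENABLE		0x01
#define VMM_MIGRATION_END		0x00
#define VMM_MIGRATION_START		0x01
#define PCI_VF_READY_FOR_MIGRATION	0x01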

Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |   5 ++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 102 ++++++++++++++++++++++
 2 files changed, 107 insertions(+)

Comments

Michael S. Tsirkin Nov. 24, 2015, 9:20 p.m. UTC | #1
On Tue, Nov 24, 2015 at 09:38:18PM +0800, Lan Tianyu wrote:
> This patch adds migration support to the ixgbevf driver. A faked
> PCI migration capability table is used to communicate with Qemu,
> sharing the migration status and the mailbox irq vector index.
>
> Qemu notifies the VF by sending an MSI-X message that triggers the
> mailbox vector during migration, and stores the migration status in
> the PCI_VF_MIGRATION_VMM_STATUS reg in the new capability table.
> The mailbox irq is triggered just before the stop-and-copy stage
> and again after migration on the target machine.
>
> The VF driver brings the netdev down when it detects a migration
> and tells Qemu it is ready for migration by writing the
> PCI_VF_MIGRATION_VF_STATUS reg. After migration, it brings the
> netdev back up.
>
> Qemu is in charge of migrating the PCI config space regs and the
> MSI-X config.
>
> This patch only handles the normal case, where net traffic flows
> and the mailbox irq is enabled. In the other cases (the driver
> isn't loaded, or the adapter is suspended or closed), the mailbox
> irq won't be triggered, and the VF driver disables the mechanism
> via the PCI_VF_MIGRATION_CAP reg. These cases will be addressed
> later.
> 
> Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>

I have to say, I was much more interested in the idea
of tracking dirty memory. I have some thoughts about
that one - did you give up on it then?



Alexander Duyck Nov. 25, 2015, 5:39 a.m. UTC | #2
On Tue, Nov 24, 2015 at 1:20 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Tue, Nov 24, 2015 at 09:38:18PM +0800, Lan Tianyu wrote:
>> This patch adds migration support to the ixgbevf driver. A faked
>> PCI migration capability table is used to communicate with Qemu,
>> sharing the migration status and the mailbox irq vector index.
>>
>> Qemu notifies the VF by sending an MSI-X message that triggers the
>> mailbox vector during migration, and stores the migration status in
>> the PCI_VF_MIGRATION_VMM_STATUS reg in the new capability table.
>> The mailbox irq is triggered just before the stop-and-copy stage
>> and again after migration on the target machine.
>>
>> The VF driver brings the netdev down when it detects a migration
>> and tells Qemu it is ready for migration by writing the
>> PCI_VF_MIGRATION_VF_STATUS reg. After migration, it brings the
>> netdev back up.
>>
>> Qemu is in charge of migrating the PCI config space regs and the
>> MSI-X config.
>>
>> This patch only handles the normal case, where net traffic flows
>> and the mailbox irq is enabled. In the other cases (the driver
>> isn't loaded, or the adapter is suspended or closed), the mailbox
>> irq won't be triggered, and the VF driver disables the mechanism
>> via the PCI_VF_MIGRATION_CAP reg. These cases will be addressed
>> later.
>>
>> Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
>
> I have to say, I was much more interested in the idea
> of tracking dirty memory. I have some thoughts about
> that one - did you give up on it then?

The tracking of dirty pages still needs to be addressed, unless the
interface is being downed before migration even starts, which, based
on other comments, I am assuming is not the case.

I still feel that having a means of marking a page as being dirty when
it is unmapped would be the best way to go.  That way you only have to
update the DMA API instead of messing with each and every driver
trying to add code to force the page to be dirtied.

- Alex
Lan Tianyu Nov. 25, 2015, 5:39 a.m. UTC | #3
On 2015/11/25 05:20, Michael S. Tsirkin wrote:
> I have to say, I was much more interested in the idea
> of tracking dirty memory. I have some thoughts about
> that one - did you give up on it then?

No, our final target is to keep the VF active during migration, and
tracking dirty memory is essential for that. But this doesn't seem
easy to do upstream in the short term. As a starting point, we stop
the VF before migration.

After deeper thought, even the stop-the-VF approach still needs to
track DMA-dirtied memory, to make sure receive buffers written
before the VF is stopped get migrated. It's easier to do that by
dummy-writing the data buffer when a packet is received.
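
Roughly like this in the RX clean path; just a sketch of the idea
(ixgbevf_dirty_rx_buffer() is a made-up helper, not a tested patch):

#include <linux/skbuff.h>

/* Touch each received buffer from the CPU so the page shows up
 * in the hypervisor's dirty log before the VF is stopped.
 */
static inline void ixgbevf_dirty_rx_buffer(struct sk_buff *skb)
{
	volatile u8 *data = skb->data;

	/* a one-byte read-modify-write dirties the page without
	 * changing its contents
	 */
	data[0] = data[0];
}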
Michael S. Tsirkin Nov. 25, 2015, 12:28 p.m. UTC | #4
On Wed, Nov 25, 2015 at 01:39:32PM +0800, Lan Tianyu wrote:
> On 2015/11/25 05:20, Michael S. Tsirkin wrote:
> > I have to say, I was much more interested in the idea
> > of tracking dirty memory. I have some thoughts about
> > that one - did you give up on it then?
> 
> No, our final target is to keep the VF active during migration, and
> tracking dirty memory is essential for that. But this doesn't seem
> easy to do upstream in the short term. As a starting point, we stop
> the VF before migration.

Frankly, I don't really see what this short term hack buys us,
and if it goes in, we'll have to maintain it forever.

Also, assuming you just want to do ifdown/ifup for some reason, it's
easy enough to do using a guest agent, in a completely generic way.


> After deeper thought, even the stop-the-VF approach still needs to
> track DMA-dirtied memory, to make sure receive buffers written
> before the VF is stopped get migrated. It's easier to do that by
> dummy-writing the data buffer when a packet is received.
Lan Tianyu Nov. 25, 2015, 4:02 p.m. UTC | #5
On 11/25/2015 8:28 PM, Michael S. Tsirkin wrote:
> Frankly, I don't really see what this short term hack buys us,
> and if it goes in, we'll have to maintain it forever.
>

The framework for notifying the VF of the migration status won't
change regardless of whether the VF is stopped before migration.
We hope to reach agreement on this first. Tracking dirty memory
still needs more discussion, and we will continue working on it.
Stopping the VF may help work around the issue and make tracking
easier.


> Also, assuming you just want to do ifdown/ifup for some reason, it's
> easy enough to do using a guest agent, in a completely generic way.
>

Just ifdown/ifup is not enough for migration. Some PCI settings
need to be restored before doing ifup on the target machine.
Michael S. Tsirkin Nov. 25, 2015, 4:22 p.m. UTC | #6
On Thu, Nov 26, 2015 at 12:02:33AM +0800, Lan, Tianyu wrote:
> On 11/25/2015 8:28 PM, Michael S. Tsirkin wrote:
> >Frankly, I don't really see what this short term hack buys us,
> >and if it goes in, we'll have to maintain it forever.
> >
> 
> The framework for notifying the VF of the migration status won't
> change regardless of whether the VF is stopped before migration.
> We hope to reach agreement on this first.

Well, it's bi-directional: the framework won't work if it's
uni-directional.
Further, if for now you use this interface only to stop the network
interface, you won't be able to do anything else with it, and you
will need a new one down the road.


> Tracking dirty memory still needs more discussion, and we will
> continue working on it. Stopping the VF may help work around the
> issue and make tracking easier.
> 
> 
> >Also, assuming you just want to do ifdown/ifup for some reason, it's
> >easy enough to do using a guest agent, in a completely generic way.
> >
> 
> Just ifdown/ifup is not enough for migration. Some PCI settings
> need to be restored before doing ifup on the target machine.

I'd focus on just restoring then.
Alexander Duyck Nov. 25, 2015, 4:24 p.m. UTC | #7
On Wed, Nov 25, 2015 at 8:02 AM, Lan, Tianyu <tianyu.lan@intel.com> wrote:
> On 11/25/2015 8:28 PM, Michael S. Tsirkin wrote:
>>
>> Frankly, I don't really see what this short term hack buys us,
>> and if it goes in, we'll have to maintain it forever.
>>
>
> The framework for notifying the VF of the migration status won't
> change regardless of whether the VF is stopped before migration.
> We hope to reach agreement on this first. Tracking dirty memory
> still needs more discussion, and we will continue working on it.
> Stopping the VF may help work around the issue and make tracking
> easier.

The problem is you still have to stop the device at some point, for
the same reason you have to halt the VM.  You seem to think you can
get by without doing that, but you can't.  All you do is open the
system up to multiple races if you leave the device running.  The
goal should be to avoid stopping the device until the last possible
moment; however, it will still have to be stopped eventually.  It
isn't as if you can migrate memory while leaving the device doing
DMA and expect to get a clean state.

I agree with Michael.  The focus needs to be on first addressing
dirty page tracking.  Once you have that, you could use a variation
on the bonding solution where you postpone the hot-plug event until
near the end of the migration, just before you halt the guest,
instead of doing it before you start the migration.  After that, we
could look at optimizing things further by introducing a variation
of hot-plug that would pause the device, as I suggested, instead of
removing it.  At that point you should have almost all of the key
issues addressed, so that you could drop the bond interface
entirely.

>> Also, assuming you just want to do ifdown/ifup for some reason, it's
>> easy enough to do using a guest agent, in a completely generic way.
>>
>
> Just ifdown/ifup is not enough for migration. Some PCI settings
> need to be restored before doing ifup on the target machine.

That is why I have been suggesting making use of suspend/resume logic
that is already in place for PCI power management.  In the case of a
suspend/resume we already have to deal with the fact that the device
will go through a D0->D3->D0 reset so we have to restore all of the
existing state.  It would take a significant load off of Qemu since
the guest would be restoring its own state instead of making Qemu have
to do all of the device migration work.
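
As a rough sketch of that idea against the driver's existing entry
points (migration_starting() is a made-up stand-in for the VMM status
check, and this is untested glue, not part of the patch):

/* Drive migration through the existing PM callbacks instead of
 * open-coding down/up in the migration task.
 */
static void ixgbevf_migration_task(struct work_struct *work)
{
	struct ixgbevf_adapter *adapter = container_of(work,
			struct ixgbevf_adapter, migration_task);
	struct pci_dev *pdev = adapter->pdev;

	if (migration_starting(adapter))
		ixgbevf_suspend(pdev, PMSG_SUSPEND);	/* D0 -> D3 */
	else
		ixgbevf_resume(pdev);	/* D3 -> D0, driver restores state */
}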
Michael S. Tsirkin Nov. 25, 2015, 4:39 p.m. UTC | #8
On Wed, Nov 25, 2015 at 08:24:38AM -0800, Alexander Duyck wrote:
> >> Also, assuming you just want to do ifdown/ifup for some reason, it's
> >> easy enough to do using a guest agent, in a completely generic way.
> >>
> >
> > Just ifdown/ifup is not enough for migration. Some PCI settings
> > need to be restored before doing ifup on the target machine.
> 
> That is why I have been suggesting making use of suspend/resume logic
> that is already in place for PCI power management.  In the case of a
> suspend/resume we already have to deal with the fact that the device
> will go through a D0->D3->D0 reset so we have to restore all of the
> existing state.  It would take a significant load off of Qemu since
> the guest would be restoring its own state instead of making Qemu have
> to do all of the device migration work.

That can work, though again, the issue is you need guest
cooperation to migrate.

If you reset the device on the destination instead of restoring
state, that issue goes away, but maybe the downtime will be
increased.

Will it really? I think it's worth it to start with the
simplest solution (reset on destination) and see
what the effect is, then add optimizations.


One thing that I've been thinking about for a while is saving (some)
state speculatively.  For example, notify the guest a bit before
migration is done, so it can save device state.  If the guest
responds quickly, you have state that can be restored.  If it
doesn't, migrate anyway, and it will have to reset on the
destination.
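
A guest-side sketch of that handshake (VMM_MIGRATION_NEAR and
PCI_VF_STATE_SAVED would be hypothetical additions to the capability
in this patch, not something it defines):

/* Save state while the device is still live and ack quickly; if
 * the ack arrives too late, the VMM migrates anyway and the device
 * is simply reset on the destination.
 */
static void ixgbevf_migration_near(struct ixgbevf_adapter *adapter)
{
	struct pci_dev *pdev = adapter->pdev;

	pci_save_state(pdev);

	pci_write_config_byte(pdev,
		adapter->migration_cap + PCI_VF_MIGRATION_VF_STATUS,
		PCI_VF_STATE_SAVED);
}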
Alexander Duyck Nov. 25, 2015, 5:24 p.m. UTC | #9
On Wed, Nov 25, 2015 at 8:39 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Wed, Nov 25, 2015 at 08:24:38AM -0800, Alexander Duyck wrote:
>> >> Also, assuming you just want to do ifdown/ifup for some reason, it's
>> >> easy enough to do using a guest agent, in a completely generic way.
>> >>
>> >
>> > Just ifdown/ifup is not enough for migration. Some PCI settings
>> > need to be restored before doing ifup on the target machine.
>>
>> That is why I have been suggesting making use of suspend/resume logic
>> that is already in place for PCI power management.  In the case of a
>> suspend/resume we already have to deal with the fact that the device
>> will go through a D0->D3->D0 reset so we have to restore all of the
>> existing state.  It would take a significant load off of Qemu since
>> the guest would be restoring its own state instead of making Qemu have
>> to do all of the device migration work.
>
> That can work, though again, the issue is you need guest
> cooperation to migrate.

Right now the problem is that you need guest cooperation anyway, as
you need some way of tracking the dirty pages.  If the IOMMU on the
host were to provide some sort of dirty page tracking, then we could
exclude the guest from the equation, but until then we need the
guest to notify us of what pages it is letting the device dirty.
I'm still of the opinion that the best way to go is to modify the
DMA API used in the guest so that it supports some sort of page flag
modification, or something along those lines, so we can track all of
the pages that might be written to by the device.

> If you reset the device on the destination instead of restoring
> state, that issue goes away, but maybe the downtime will be
> increased.

Yes, the downtime will be increased, but it shouldn't be by much.
Depending on the setup a VF with a single queue can have about 3MB of
data outstanding when you move the driver over.  After that it is just
a matter of bringing the interface back up which should take only a
few hundred milliseconds assuming the PF is fairly responsive.

> Will it really? I think it's worth it to start with the
> simplest solution (reset on destination) and see
> what the effect is, then add optimizations.

Agreed.  My thought would be to start with something like
dma_mark_clean() that could be used to take care of marking the pages
for migration when they are unmapped or synced.
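
Something along these lines at the DMA API level (dma_mark_dirty()
is hypothetical, modeled on the existing dma_mark_clean() no-op, and
would be called from the unmap/sync paths):

/* One CPU read-modify-write per page of a DMA buffer, so the
 * hypervisor's dirty logging sees what the device wrote.
 */
static inline void dma_mark_dirty(void *addr, size_t size)
{
	volatile char *p = addr;
	size_t off;

	for (off = 0; off < size; off += PAGE_SIZE)
		p[off] = p[off];
}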

> One thing that I've been thinking about for a while is saving (some)
> state speculatively.  For example, notify the guest a bit before
> migration is done, so it can save device state.  If the guest
> responds quickly, you have state that can be restored.  If it
> doesn't, migrate anyway, and it will have to reset on the
> destination.

I'm not sure how much more device state we really need to save.  The
driver in the guest has to have enough state to recover in the event
of a device failure resulting in a slot reset.  To top it off, the
driver can probably reconfigure things about as quickly as we could
if we were restoring the state.
Patch

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 775d089..4b8ba2f 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -438,6 +438,11 @@  struct ixgbevf_adapter {
 	u64 bp_tx_missed;
 #endif
 
+	u8 migration_cap;
+	u8 last_migration_reg;
+	unsigned long migration_status;
+	struct work_struct migration_task;
+
 	u8 __iomem *io_addr; /* Mainly for iounmap use */
 	u32 link_speed;
 	bool link_up;
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index a16d267..95860c2 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -96,6 +96,8 @@  static int debug = -1;
 module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
 
+#define MIGRATION_IN_PROGRESS		0
+
 static void ixgbevf_service_event_schedule(struct ixgbevf_adapter *adapter)
 {
 	if (!test_bit(__IXGBEVF_DOWN, &adapter->state) &&
@@ -1262,6 +1264,22 @@  static void ixgbevf_set_itr(struct ixgbevf_q_vector *q_vector)
 	}
 }
 
+static void ixgbevf_migration_check(struct ixgbevf_adapter *adapter) 
+{
+	struct pci_dev *pdev = adapter->pdev;
+	u8 val;
+
+	pci_read_config_byte(pdev,
+		     adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
+		     &val);
+
+	if (val != adapter->last_migration_reg) {
+		schedule_work(&adapter->migration_task);
+		adapter->last_migration_reg = val;
+	}
+
+}
+
 static irqreturn_t ixgbevf_msix_other(int irq, void *data)
 {
 	struct ixgbevf_adapter *adapter = data;
@@ -1269,6 +1287,7 @@  static irqreturn_t ixgbevf_msix_other(int irq, void *data)
 
 	hw->mac.get_link_status = 1;
 
+	ixgbevf_migration_check(adapter);
 	ixgbevf_service_event_schedule(adapter);
 
 	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, adapter->eims_other);
@@ -1383,6 +1402,7 @@  out:
 static int ixgbevf_request_msix_irqs(struct ixgbevf_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
+	struct pci_dev *pdev = adapter->pdev;
 	int q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
 	int vector, err;
 	int ri = 0, ti = 0;
@@ -1423,6 +1443,12 @@  static int ixgbevf_request_msix_irqs(struct ixgbevf_adapter *adapter)
 		goto free_queue_irqs;
 	}
 
+	if (adapter->migration_cap) {
+		pci_write_config_byte(pdev,
+			adapter->migration_cap + PCI_VF_MIGRATION_IRQ,
+			vector);
+	}
+
 	return 0;
 
 free_queue_irqs:
@@ -2891,6 +2917,59 @@  static void ixgbevf_watchdog_subtask(struct ixgbevf_adapter *adapter)
 	ixgbevf_update_stats(adapter);
 }
 
+static void ixgbevf_migration_task(struct work_struct *work)
+{
+	struct ixgbevf_adapter *adapter = container_of(work,
+			struct ixgbevf_adapter,
+			migration_task);
+	struct pci_dev *pdev = adapter->pdev;
+	struct net_device *netdev = adapter->netdev;
+	u8 val;
+
+	if (!test_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status)) {
+		pci_read_config_byte(pdev,
+		     adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
+		     &val);
+		if (val != VMM_MIGRATION_START)
+			return;
+
+		pr_info("migration start\n");
+		set_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status);
+		netif_device_detach(netdev);
+
+		if (netif_running(netdev)) {
+			rtnl_lock();
+			ixgbevf_down(adapter);
+			rtnl_unlock();
+		}
+		pci_save_state(pdev);
+
+		/* Tell Qemu VF is ready for migration. */
+		pci_write_config_byte(pdev,
+			     adapter->migration_cap + PCI_VF_MIGRATION_VF_STATUS,
+			     PCI_VF_READY_FOR_MIGRATION);
+	} else {
+		pci_read_config_byte(pdev,
+		     adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
+		     &val);
+		if (val != VMM_MIGRATION_END)
+			return;
+
+		pci_restore_state(pdev);
+
+		if (netif_running(netdev)) {
+			ixgbevf_reset(adapter);
+			ixgbevf_up(adapter);
+		}
+
+		netif_device_attach(netdev);
+
+		clear_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status);
+		pr_info("migration end\n");
+	}
+
+}
+
 /**
  * ixgbevf_service_task - manages and runs subtasks
  * @work: pointer to work_struct containing our data
@@ -3122,6 +3201,7 @@  static int ixgbevf_open(struct net_device *netdev)
 {
 	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
+	struct pci_dev *pdev = adapter->pdev;
 	int err;
 
 	/* A previous failure to open the device because of a lack of
@@ -3175,6 +3255,13 @@  static int ixgbevf_open(struct net_device *netdev)
 
 	ixgbevf_up_complete(adapter);
 
+	if (adapter->migration_cap) {
+		pci_write_config_byte(pdev,
+			     adapter->migration_cap + PCI_VF_MIGRATION_CAP,
+			     PCI_VF_MIGRATION_ENABLE);
+		adapter->last_migration_reg = 0;
+	}
+
 	return 0;
 
 err_req_irq:
@@ -3204,6 +3291,13 @@  err_setup_reset:
 static int ixgbevf_close(struct net_device *netdev)
 {
 	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
+	struct pci_dev *pdev = adapter->pdev;
+	
+	if (adapter->migration_cap) {
+		pci_write_config_byte(pdev,
+			     adapter->migration_cap + PCI_VF_MIGRATION_CAP,
+			     PCI_VF_MIGRATION_DISABLE);
+	}
 
 	ixgbevf_down(adapter);
 	ixgbevf_free_irq(adapter);
@@ -3764,6 +3858,12 @@  static int ixgbevf_suspend(struct pci_dev *pdev, pm_message_t state)
 	int retval = 0;
 #endif
 
+	if (adapter->migration_cap) {
+		pci_write_config_byte(pdev,
+			     adapter->migration_cap + PCI_VF_MIGRATION_CAP,
+			     PCI_VF_MIGRATION_DISABLE);
+	}
+
 	netif_device_detach(netdev);
 
 	if (netif_running(netdev)) {
@@ -4029,6 +4129,7 @@  static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		    (unsigned long)adapter);
 
 	INIT_WORK(&adapter->service_task, ixgbevf_service_task);
+	INIT_WORK(&adapter->migration_task, ixgbevf_migration_task);
 	set_bit(__IXGBEVF_SERVICE_INITED, &adapter->state);
 	clear_bit(__IXGBEVF_SERVICE_SCHED, &adapter->state);
 
@@ -4064,6 +4165,7 @@  static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		break;
 	}
 
+	adapter->migration_cap = pci_find_capability(pdev, PCI_CAP_ID_MIGRATION);
 	return 0;
 
 err_register: