diff mbox

[V3] block/iscsi: allow caching of the allocation map

Message ID 1464079240-24362-1-git-send-email-pl@kamp.de (mailing list archive)
State New, archived
Headers show

Commit Message

Peter Lieven May 24, 2016, 8:40 a.m. UTC
until now the allocation map was used only as a hint if a cluster
is allocated or not. If a block was not allocated (or Qemu had
no info about the allocation status) a get_block_status call was
issued to check the allocation status and possibly avoid
a subsequent read of unallocated sectors. If a block was known to be
allocated the get_block_status call was omitted. In the other case
a get_block_status call was issued before every read to avoid
the necessity for a consistent allocation map. To avoid the
potential overhead of calling get_block_status for each and
every read request this only took place for bigger requests.

This patch enhances this mechanism to cache the allocation
status and avoid calling get_block_status for blocks where
the allocation status has been queried before. This allows
for bypassing the read request even for smaller requests and
additionally omits calling get_block_status for known to be
unallocated blocks.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
v2->v3: - fix wording errors [Fam]
        - reinit allocmap only if allocmap is present in
          bdrv_reopen_commit
v1->v2: - add more comments [Fam]
        - free allocmap if allocation of allocmap_valid fails [Fam]
        - fix indent and whitespace errors [Fam]
        - account for cache mode changes on reopen

 block/iscsi.c | 231 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 182 insertions(+), 49 deletions(-)

Comments

Eric Blake May 24, 2016, 11:10 p.m. UTC | #1
On 05/24/2016 02:40 AM, Peter Lieven wrote:
> until now the allocation map was used only as a hint if a cluster
> is allocated or not. If a block was not allocated (or Qemu had
> no info about the allocation status) a get_block_status call was
> issued to check the allocation status and possibly avoid
> a subsequent read of unallocated sectors. If a block known to be
> allocated the get_block_status call was omitted. In the other case
> a get_block_status call was issued before every read to avoid
> the necessity for a consistent allocation map. To avoid the
> potential overhead of calling get_block_status for each and
> every read request this took only place for the bigger requests.
> 
> This patch enhances this mechanism to cache the allocation
> status and avoid calling get_block_status for blocks where
> the allocation status has been queried before. This allows
> for bypassing the read request even for smaller requests and
> additionally omits calling get_block_status for known to be
> unallocated blocks.
> 
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---

> +static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
>  {
> -    if (iscsilun->allocationmap == NULL) {
> -        return;
> +    iscsi_allocmap_free(iscsilun);
> +
> +    iscsilun->allocmap_size =
> +        DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
> +                     iscsilun->cluster_sectors);
> +

Computes: ceiling( (num_blocks * block_size / 512) / (cluster_size /
512) ); aka number of clusters.  But we don't independently track the
cluster size, so I don't see any simpler way of writing it, even if we
could be more efficient by not having to first scale through qemu's
sector size.

> +    iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
> +    if (!iscsilun->allocmap) {
> +        return -ENOMEM;
> +    }
> +
> +    if (open_flags & BDRV_O_NOCACHE) {
> +        /* in case that cache.direct = on all allocmap entries are
> +         * treated as invalid to force a relookup of the block
> +         * status on every read request */
> +        return 0;

Can we cache that we are opened BDRV_O_NOCACHE, so that we don't even
have to bother allocating allocmap when we know we are never changing
its bits?  In other words, can you swap this to be before the
bitmap_try_new()?

> +    }
> +
> +    iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size);
> +    if (!iscsilun->allocmap_valid) {
> +        /* if we are under memory pressure free the allocmap as well */
> +        iscsi_allocmap_free(iscsilun);
> +        return -ENOMEM;
>      }
> -    bitmap_set(iscsilun->allocationmap,
> -               sector_num / iscsilun->cluster_sectors,
> -               DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
> +
> +    return 0;
>  }
>  
> -static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
> -                                      int nb_sectors)
> +static void
> +iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
> +                      int nb_sectors, bool allocated, bool valid)
>  {
>      int64_t cluster_num, nb_clusters;
> -    if (iscsilun->allocationmap == NULL) {
> +
> +    if (iscsilun->allocmap == NULL) {
>          return;
>      }

Here, you are short-circuiting when there is no allocmap, but shouldn't
you also short-circuit if you are BDRV_O_NOCACHE?

Should you assert(!(allocated && !valid)) [or by deMorgan's,
assert(!allocated || valid)], to make sure we are only tracking 3 states
rather than 4?

>      cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
>      nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
>                    - cluster_num;
> -    if (nb_clusters > 0) {
> -        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
> +    if (allocated) {
> +        bitmap_set(iscsilun->allocmap,
> +                   sector_num / iscsilun->cluster_sectors,
> +                   DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));

This says that if we have a sub-cluster request, we can round out to
cluster alignment (if our covered part of the cluster is allocated,
presumably the rest is allocated as well)...

> +    } else if (nb_clusters > 0) {
> +        bitmap_clear(iscsilun->allocmap, cluster_num, nb_clusters);

...while this says if we are marking something clear, we have to round
in (while we can trim the aligned middle, we should not mark any
unaligned head or tail as trimmed, in case they remain allocated due to
the unvisited sectors).  Still, it may be worth comments for why your
rounding between the two calls is different.

> +    }
> +
> +    if (iscsilun->allocmap_valid == NULL) {
> +        return;
> +    }

When do we ever have allocmap set but allocmap_valid clear?  Isn't it
easier to assume that both maps are present (we are tracking status) or
absent (we are BDRV_O_NOCACHE)?

> +    if (valid) {
> +        if (nb_clusters > 0) {
> +            bitmap_set(iscsilun->allocmap_valid, cluster_num, nb_clusters);
> +        }
> +    } else {
> +        bitmap_clear(iscsilun->allocmap_valid,
> +                     sector_num / iscsilun->cluster_sectors,
> +                     DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));

Here, the rounding is opposite - you round in for valid (leaving the
head and tail in an unknown state because you didn't visit the whole
cluster), and round out for clear (you don't bother checking whether
your action will change the state of the head and tail, so it's easier
to just mark that cluster as needing a refresh).  Again, comments might
help.  And if you are relying on allocmap_valid before consulting
allocmap, I don't know if it makes any sense to mark any bits of the
allocmap above if those bits will just be masked out by allocmap_valid,
so maybe the earlier code could unconditionally round in, instead of
rounding out for allocated and in for unallocated.
Peter Lieven May 30, 2016, 6:33 a.m. UTC | #2
Am 25.05.2016 um 01:10 schrieb Eric Blake:
> On 05/24/2016 02:40 AM, Peter Lieven wrote:
>> until now the allocation map was used only as a hint if a cluster
>> is allocated or not. If a block was not allocated (or Qemu had
>> no info about the allocation status) a get_block_status call was
>> issued to check the allocation status and possibly avoid
>> a subsequent read of unallocated sectors. If a block known to be
>> allocated the get_block_status call was omitted. In the other case
>> a get_block_status call was issued before every read to avoid
>> the necessity for a consistent allocation map. To avoid the
>> potential overhead of calling get_block_status for each and
>> every read request this took only place for the bigger requests.
>>
>> This patch enhances this mechanism to cache the allocation
>> status and avoid calling get_block_status for blocks where
>> the allocation status has been queried before. This allows
>> for bypassing the read request even for smaller requests and
>> additionally omits calling get_block_status for known to be
>> unallocated blocks.
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>> +static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
>>   {
>> -    if (iscsilun->allocationmap == NULL) {
>> -        return;
>> +    iscsi_allocmap_free(iscsilun);
>> +
>> +    iscsilun->allocmap_size =
>> +        DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
>> +                     iscsilun->cluster_sectors);
>> +
> Computes: ceiling( (num_blocks * block_size / 512) / (cluster_size /
> 512) ); aka number of clusters.  But we don't independently track the
> cluster size, so I don't see any simpler way of writing it, even if we
> could be more efficient by not having to first scale through qemu's
> sector size.
>
>> +    iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
>> +    if (!iscsilun->allocmap) {
>> +        return -ENOMEM;
>> +    }
>> +
>> +    if (open_flags & BDRV_O_NOCACHE) {
>> +        /* in case that cache.direct = on all allocmap entries are
>> +         * treated as invalid to force a relookup of the block
>> +         * status on every read request */
>> +        return 0;
> Can we cache that we are opened BDRV_O_NOCACHE, so that we don't even
> have to bother allocating allocmap when we know we are never changing
> its bits?  In other words, can you swap this to be before the
> bitmap_try_new()?

The idea of the allocmap in cache.direct = on mode is that we can
still speed up block jobs by skipping large unallocated areas. In this case
the allocmap serves only as a hint. If we don't know the status
we issue a get_block_status request and verify the status. If it's unallocated
we return zeroes. If we knew through an earlier get_block_status request
that the area is allocated we can skip the useless get_block_status request.
This is the old behaviour without this patch.

>
>> +    }
>> +
>> +    iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size);
>> +    if (!iscsilun->allocmap_valid) {
>> +        /* if we are under memory pressure free the allocmap as well */
>> +        iscsi_allocmap_free(iscsilun);
>> +        return -ENOMEM;
>>       }
>> -    bitmap_set(iscsilun->allocationmap,
>> -               sector_num / iscsilun->cluster_sectors,
>> -               DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
>> +
>> +    return 0;
>>   }
>>   
>> -static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
>> -                                      int nb_sectors)
>> +static void
>> +iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
>> +                      int nb_sectors, bool allocated, bool valid)
>>   {
>>       int64_t cluster_num, nb_clusters;
>> -    if (iscsilun->allocationmap == NULL) {
>> +
>> +    if (iscsilun->allocmap == NULL) {
>>           return;
>>       }
> Here, you are short-circuiting when there is no allocmap, but shouldn't
> you also short-circuit if you are BDRV_O_NOCACHE?
>
> Should you assert(!(allocated && !valid)) [or by deMorgan's,
> assert(!allocated || valid)], to make sure we are only tracking 3 states
> rather than 4?

sure, that's a good enhancement, although "allocated and invalid" doesn't
hurt.

>
>>       cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
>>       nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
>>                     - cluster_num;
>> -    if (nb_clusters > 0) {
>> -        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
>> +    if (allocated) {
>> +        bitmap_set(iscsilun->allocmap,
>> +                   sector_num / iscsilun->cluster_sectors,
>> +                   DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
> This says that if we have a sub-cluster request, we can round out to
> cluster alignment (if our covered part of the cluster is allocated,
> presumably the rest is allocated as well)...
>
>> +    } else if (nb_clusters > 0) {
>> +        bitmap_clear(iscsilun->allocmap, cluster_num, nb_clusters);
> ...while this says if we are marking something clear, we have to round
> in (while we can trim the aligned middle, we should not mark any
> unaligned head or tail as trimmed, in case they remain allocated due to
> the unvisited sectors).  Still, it may be worth comments for why your
> rounding between the two calls is different.

Okay

>
>> +    }
>> +
>> +    if (iscsilun->allocmap_valid == NULL) {
>> +        return;
>> +    }
> When do we ever have allocmap set but allocmap_valid clear?  Isn't it
> easier to assume that both maps are present (we are tracking status) or
> absent (we are BDRV_O_NOCACHE)?

That's the current behaviour. See above.

Peter
Peter Lieven June 10, 2016, 9:13 a.m. UTC | #3
Am 30.05.2016 um 08:33 schrieb Peter Lieven:
> Am 25.05.2016 um 01:10 schrieb Eric Blake:
>> On 05/24/2016 02:40 AM, Peter Lieven wrote:
>>> until now the allocation map was used only as a hint if a cluster
>>> is allocated or not. If a block was not allocated (or Qemu had
>>> no info about the allocation status) a get_block_status call was
>>> issued to check the allocation status and possibly avoid
>>> a subsequent read of unallocated sectors. If a block known to be
>>> allocated the get_block_status call was omitted. In the other case
>>> a get_block_status call was issued before every read to avoid
>>> the necessity for a consistent allocation map. To avoid the
>>> potential overhead of calling get_block_status for each and
>>> every read request this took only place for the bigger requests.
>>>
>>> This patch enhances this mechanism to cache the allocation
>>> status and avoid calling get_block_status for blocks where
>>> the allocation status has been queried before. This allows
>>> for bypassing the read request even for smaller requests and
>>> additionally omits calling get_block_status for known to be
>>> unallocated blocks.
>>>
>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>> ---
>>> +static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
>>>   {
>>> -    if (iscsilun->allocationmap == NULL) {
>>> -        return;
>>> +    iscsi_allocmap_free(iscsilun);
>>> +
>>> +    iscsilun->allocmap_size =
>>> +        DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
>>> +                     iscsilun->cluster_sectors);
>>> +
>> Computes: ceiling( (num_blocks * block_size / 512) / (cluster_size /
>> 512) ); aka number of clusters.  But we don't independently track the
>> cluster size, so I don't see any simpler way of writing it, even if we
>> could be more efficient by not having to first scale through qemu's
>> sector size.
>>
>>> +    iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
>>> +    if (!iscsilun->allocmap) {
>>> +        return -ENOMEM;
>>> +    }
>>> +
>>> +    if (open_flags & BDRV_O_NOCACHE) {
>>> +        /* in case that cache.direct = on all allocmap entries are
>>> +         * treated as invalid to force a relookup of the block
>>> +         * status on every read request */
>>> +        return 0;
>> Can we cache that we are opened BDRV_O_NOCACHE, so that we don't even
>> have to bother allocating allocmap when we know we are never changing
>> its bits?  In other words, can you swap this to be before the
>> bitmap_try_new()?
>
> The idea of the allocmap in cache.direct = on mode is that we can
> still speed up block jobs by skipping large unallocated areas. In this case
> the allocmap has only a hint character. If we don't know the status
> we issue a get_block_status request and verify the status. If its unallocated
> we return zeroes. If we new through an earlier get block status request
> that the area is allocated we can skip the useless get_block_status request.
> This is the old behaviour without this patch.
>
>>
>>> +    }
>>> +
>>> +    iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size);
>>> +    if (!iscsilun->allocmap_valid) {
>>> +        /* if we are under memory pressure free the allocmap as well */
>>> +        iscsi_allocmap_free(iscsilun);
>>> +        return -ENOMEM;
>>>       }
>>> -    bitmap_set(iscsilun->allocationmap,
>>> -               sector_num / iscsilun->cluster_sectors,
>>> -               DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
>>> +
>>> +    return 0;
>>>   }
>>>   -static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
>>> -                                      int nb_sectors)
>>> +static void
>>> +iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
>>> +                      int nb_sectors, bool allocated, bool valid)
>>>   {
>>>       int64_t cluster_num, nb_clusters;
>>> -    if (iscsilun->allocationmap == NULL) {
>>> +
>>> +    if (iscsilun->allocmap == NULL) {
>>>           return;
>>>       }
>> Here, you are short-circuiting when there is no allocmap, but shouldn't
>> you also short-circuit if you are BDRV_O_NOCACHE?
>>
>> Should you assert(!(allocated && !valid)) [or by deMorgan's,
>> assert(!allocated || valid)], to make sure we are only tracking 3 states
>> rather than 4?
>
> sure, we thats a good enhancement altough allocated and invalid doesn't
> hurt.
>
>>
>>>       cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
>>>       nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
>>>                     - cluster_num;
>>> -    if (nb_clusters > 0) {
>>> -        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
>>> +    if (allocated) {
>>> +        bitmap_set(iscsilun->allocmap,
>>> +                   sector_num / iscsilun->cluster_sectors,
>>> +                   DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
>> This says that if we have a sub-cluster request, we can round out to
>> cluster alignment (if our covered part of the cluster is allocated,
>> presumably the rest is allocated as well)...
>>
>>> +    } else if (nb_clusters > 0) {
>>> +        bitmap_clear(iscsilun->allocmap, cluster_num, nb_clusters);
>> ...while this says if we are marking something clear, we have to round
>> in (while we can trim the aligned middle, we should not mark any
>> unaligned head or tail as trimmed, in case they remain allocated due to
>> the unvisited sectors).  Still, it may be worth comments for why your
>> rounding between the two calls is different.
>
> Okay
>
>>
>>> +    }
>>> +
>>> +    if (iscsilun->allocmap_valid == NULL) {
>>> +        return;
>>> +    }
>> When do we ever have allocmap set but allocmap_valid clear?  Isn't it
>> easier to assume that both maps are present (we are tracking status) or
>> absent (we are BDRV_O_NOCACHE)?
>
> Thats the current behaviour. See above.

Ping.

Peter
Fam Zheng June 12, 2016, 5:04 a.m. UTC | #4
On Tue, 05/24 10:40, Peter Lieven wrote:
> until now the allocation map was used only as a hint if a cluster
> is allocated or not. If a block was not allocated (or Qemu had
> no info about the allocation status) a get_block_status call was
> issued to check the allocation status and possibly avoid
> a subsequent read of unallocated sectors. If a block known to be
> allocated the get_block_status call was omitted. In the other case
> a get_block_status call was issued before every read to avoid
> the necessity for a consistent allocation map. To avoid the
> potential overhead of calling get_block_status for each and
> every read request this took only place for the bigger requests.
> 
> This patch enhances this mechanism to cache the allocation
> status and avoid calling get_block_status for blocks where
> the allocation status has been queried before. This allows
> for bypassing the read request even for smaller requests and
> additionally omits calling get_block_status for known to be
> unallocated blocks.
> 
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
> v2->v3: - fix wording errors [Fam]
>         - reinit allocmap only if allocmap is present in
>           bdrv_reopen_commit

I did an incremental review and the diff-of-diff looks good to me, so:

Reviewed-by: Fam Zheng <famz@redhat.com>
Paolo Bonzini June 13, 2016, 9:45 a.m. UTC | #5
On 24/05/2016 10:40, Peter Lieven wrote:
> until now the allocation map was used only as a hint if a cluster
> is allocated or not. If a block was not allocated (or Qemu had
> no info about the allocation status) a get_block_status call was
> issued to check the allocation status and possibly avoid
> a subsequent read of unallocated sectors. If a block known to be
> allocated the get_block_status call was omitted. In the other case
> a get_block_status call was issued before every read to avoid
> the necessity for a consistent allocation map. To avoid the
> potential overhead of calling get_block_status for each and
> every read request this took only place for the bigger requests.
> 
> This patch enhances this mechanism to cache the allocation
> status and avoid calling get_block_status for blocks where
> the allocation status has been queried before. This allows
> for bypassing the read request even for smaller requests and
> additionally omits calling get_block_status for known to be
> unallocated blocks.
> 
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
> v2->v3: - fix wording errors [Fam]
>         - reinit allocmap only if allocmap is present in
>           bdrv_reopen_commit
> v1->v2: - add more comments [Fam]
>         - free allocmap if allocation of allocmap_valid fails [Fam]
>         - fix indent and whitespace errors [Fam]
>         - account for cache mode changes on reopen
> 
>  block/iscsi.c | 231 +++++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 182 insertions(+), 49 deletions(-)
> 
> diff --git a/block/iscsi.c b/block/iscsi.c
> index 2ca8e72..fb308f6 100644
> --- a/block/iscsi.c
> +++ b/block/iscsi.c
> @@ -2,7 +2,7 @@
>   * QEMU Block driver for iSCSI images
>   *
>   * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
> - * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de>
> + * Copyright (c) 2012-2016 Peter Lieven <pl@kamp.de>
>   *
>   * Permission is hereby granted, free of charge, to any person obtaining a copy
>   * of this software and associated documentation files (the "Software"), to deal
> @@ -62,7 +62,23 @@ typedef struct IscsiLun {
>      struct scsi_inquiry_logical_block_provisioning lbp;
>      struct scsi_inquiry_block_limits bl;
>      unsigned char *zeroblock;
> -    unsigned long *allocationmap;
> +    /* The allocmap tracks which clusters (pages) on the iSCSI target are
> +     * allocated and which are not. In case a target returns zeros for
> +     * unallocated pages (iscsilun->lprz) we can directly return zeros instead
> +     * of reading zeros over the wire if a read request falls within an
> +     * unallocated block. As there are 3 possible states we need 2 bitmaps to
> +     * track. allocmap_valid keeps track if QEMU's information about a page is
> +     * valid. allocmap tracks if a page is allocated or not. In case QEMU has no
> +     * valid information about a page the corresponding allocmap entry should be
> +     * switched to unallocated as well to force a new lookup of the allocation
> +     * status as lookups are generally skipped if a page is suspect to be
> +     * allocated. If a iSCSI target is opened with cache.direct = on the
> +     * allocmap_valid does not exist turning all cached information invalid so
> +     * that a fresh lookup is made for any page even if allocmap entry returns
> +     * it's unallocated. */
> +    unsigned long *allocmap;
> +    unsigned long *allocmap_valid;
> +    long allocmap_size;
>      int cluster_sectors;
>      bool use_16_for_rw;
>      bool write_protected;
> @@ -415,37 +431,132 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
>      return 1;
>  }
>  
> -static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
> +static void iscsi_allocmap_free(IscsiLun *iscsilun)
>  {
> -    return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
> -                                                       iscsilun),
> -                                       iscsilun->cluster_sectors));
> +    g_free(iscsilun->allocmap);
> +    g_free(iscsilun->allocmap_valid);
> +    iscsilun->allocmap = NULL;
> +    iscsilun->allocmap_valid = NULL;
>  }
>  
> -static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
> -                                    int nb_sectors)
> +
> +static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
>  {
> -    if (iscsilun->allocationmap == NULL) {
> -        return;
> +    iscsi_allocmap_free(iscsilun);
> +
> +    iscsilun->allocmap_size =
> +        DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
> +                     iscsilun->cluster_sectors);
> +
> +    iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
> +    if (!iscsilun->allocmap) {
> +        return -ENOMEM;
> +    }
> +
> +    if (open_flags & BDRV_O_NOCACHE) {
> +        /* in case that cache.direct = on all allocmap entries are
> +         * treated as invalid to force a relookup of the block
> +         * status on every read request */
> +        return 0;
> +    }
> +
> +    iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size);
> +    if (!iscsilun->allocmap_valid) {
> +        /* if we are under memory pressure free the allocmap as well */
> +        iscsi_allocmap_free(iscsilun);
> +        return -ENOMEM;
>      }
> -    bitmap_set(iscsilun->allocationmap,
> -               sector_num / iscsilun->cluster_sectors,
> -               DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
> +
> +    return 0;
>  }
>  
> -static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
> -                                      int nb_sectors)
> +static void
> +iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
> +                      int nb_sectors, bool allocated, bool valid)
>  {
>      int64_t cluster_num, nb_clusters;
> -    if (iscsilun->allocationmap == NULL) {
> +
> +    if (iscsilun->allocmap == NULL) {
>          return;
>      }
>      cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
>      nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
>                    - cluster_num;
> -    if (nb_clusters > 0) {
> -        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
> +    if (allocated) {
> +        bitmap_set(iscsilun->allocmap,
> +                   sector_num / iscsilun->cluster_sectors,
> +                   DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
> +    } else if (nb_clusters > 0) {
> +        bitmap_clear(iscsilun->allocmap, cluster_num, nb_clusters);
> +    }
> +
> +    if (iscsilun->allocmap_valid == NULL) {
> +        return;
> +    }
> +    if (valid) {
> +        if (nb_clusters > 0) {
> +            bitmap_set(iscsilun->allocmap_valid, cluster_num, nb_clusters);
> +        }
> +    } else {
> +        bitmap_clear(iscsilun->allocmap_valid,
> +                     sector_num / iscsilun->cluster_sectors,
> +                     DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
> +    }
> +}
> +
> +static void
> +iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t sector_num,
> +                             int nb_sectors)
> +{
> +    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, true, true);
> +}
> +
> +static void
> +iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t sector_num,
> +                               int nb_sectors)
> +{
> +    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, true);
> +}
> +
> +static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t sector_num,
> +                                       int nb_sectors)
> +{
> +    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, false);
> +}
> +
> +static void iscsi_allocmap_invalidate(IscsiLun *iscsilun)
> +{
> +    if (iscsilun->allocmap) {
> +        bitmap_zero(iscsilun->allocmap, iscsilun->allocmap_size);
>      }
> +    if (iscsilun->allocmap_valid) {
> +        bitmap_zero(iscsilun->allocmap_valid, iscsilun->allocmap_size);
> +    }
> +}
> +
> +static inline bool
> +iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num,
> +                            int nb_sectors)
> +{
> +    unsigned long size;
> +    if (iscsilun->allocmap == NULL) {
> +        return true;
> +    }
> +    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
> +    return !(find_next_bit(iscsilun->allocmap, size,
> +                           sector_num / iscsilun->cluster_sectors) == size);
> +}
> +
> +static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
> +                                           int64_t sector_num, int nb_sectors)
> +{
> +    unsigned long size;
> +    if (iscsilun->allocmap_valid == NULL) {
> +        return false;
> +    }
> +    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
> +    return (find_next_zero_bit(iscsilun->allocmap_valid, size,
> +                               sector_num / iscsilun->cluster_sectors) == size);
>  }
>  
>  static int coroutine_fn
> @@ -507,26 +618,16 @@ retry:
>      }
>  
>      if (iTask.status != SCSI_STATUS_GOOD) {
> +        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
>          return iTask.err_code;
>      }
>  
> -    iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
> +    iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors);
>  
>      return 0;
>  }
>  
>  
> -static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,
> -                                             int64_t sector_num, int nb_sectors)
> -{
> -    unsigned long size;
> -    if (iscsilun->allocationmap == NULL) {
> -        return true;
> -    }
> -    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
> -    return !(find_next_bit(iscsilun->allocationmap, size,
> -                           sector_num / iscsilun->cluster_sectors) == size);
> -}
>  
>  static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
>                                                    int64_t sector_num,
> @@ -611,9 +712,9 @@ retry:
>      }
>  
>      if (ret & BDRV_BLOCK_ZERO) {
> -        iscsi_allocationmap_clear(iscsilun, sector_num, *pnum);
> +        iscsi_allocmap_set_unallocated(iscsilun, sector_num, *pnum);
>      } else {
> -        iscsi_allocationmap_set(iscsilun, sector_num, *pnum);
> +        iscsi_allocmap_set_allocated(iscsilun, sector_num, *pnum);
>      }
>  
>      if (*pnum > nb_sectors) {
> @@ -648,16 +749,32 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
>          return -EINVAL;
>      }
>  
> -    if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES &&
> -        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
> -        int64_t ret;
> +    /* if cache.direct is off and we have a valid entry in our allocation map
> +     * we can skip checking the block status and directly return zeroes if
> +     * the request falls within an unallocated area */
> +    if (iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
> +        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
> +            qemu_iovec_memset(iov, 0, 0x00, iov->size);
> +            return 0;
> +    }
> +
> +    if (nb_sectors >= ISCSI_CHECKALLOC_THRES &&
> +        !iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
> +        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
>          int pnum;
>          BlockDriverState *file;
> -        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file);
> +        /* check the block status from the beginning of the cluster
> +         * containing the start sector */
> +        int64_t ret = iscsi_co_get_block_status(bs,
> +                          sector_num - sector_num % iscsilun->cluster_sectors,
> +                          INT_MAX, &pnum, &file);
>          if (ret < 0) {
>              return ret;
>          }
> -        if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) {
> +        /* if the whole request falls into an unallocated area we can avoid
> +         * to read and directly return zeroes instead */
> +        if (ret & BDRV_BLOCK_ZERO &&
> +            pnum >= nb_sectors + sector_num % iscsilun->cluster_sectors) {
>              qemu_iovec_memset(iov, 0, 0x00, iov->size);
>              return 0;
>          }
> @@ -964,7 +1081,7 @@ retry:
>          return iTask.err_code;
>      }
>  
> -    iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
> +    iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
>  
>      return 0;
>  }
> @@ -1054,13 +1171,14 @@ retry:
>      }
>  
>      if (iTask.status != SCSI_STATUS_GOOD) {
> +        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
>          return iTask.err_code;
>      }
>  
>      if (flags & BDRV_REQ_MAY_UNMAP) {
> -        iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
> +        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
>      } else {
> -        iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
> +        iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors);
>      }
>  
>      return 0;
> @@ -1634,10 +1752,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
>          iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
>                                       iscsilun->block_size) >> BDRV_SECTOR_BITS;
>          if (iscsilun->lbprz) {
> -            iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
> -            if (iscsilun->allocationmap == NULL) {
> -                ret = -ENOMEM;
> -            }
> +            ret = iscsi_allocmap_init(iscsilun, bs->open_flags);
>          }
>      }
>  
> @@ -1674,7 +1789,7 @@ static void iscsi_close(BlockDriverState *bs)
>      }
>      iscsi_destroy_context(iscsi);
>      g_free(iscsilun->zeroblock);
> -    g_free(iscsilun->allocationmap);
> +    iscsi_allocmap_free(iscsilun);
>      memset(iscsilun, 0, sizeof(IscsiLun));
>  }
>  
> @@ -1732,6 +1847,16 @@ static int iscsi_reopen_prepare(BDRVReopenState *state,
>      return 0;
>  }
>  
> +static void iscsi_reopen_commit(BDRVReopenState *reopen_state)
> +{
> +    IscsiLun *iscsilun = reopen_state->bs->opaque;
> +
> +    /* the cache.direct status might have changed */
> +    if (iscsilun->allocmap != NULL) {
> +        iscsi_allocmap_init(iscsilun, reopen_state->flags);
> +    }
> +}
> +
>  static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
>  {
>      IscsiLun *iscsilun = bs->opaque;
> @@ -1751,9 +1876,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
>          return -EINVAL;
>      }
>  
> -    if (iscsilun->allocationmap != NULL) {
> -        g_free(iscsilun->allocationmap);
> -        iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
> +    if (iscsilun->allocmap != NULL) {
> +        iscsi_allocmap_init(iscsilun, bs->open_flags);
>      }
>  
>      return 0;
> @@ -1813,6 +1937,13 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
>      return 0;
>  }
>  
> +static void iscsi_invalidate_cache(BlockDriverState *bs,
> +                                   Error **errp)
> +{
> +    IscsiLun *iscsilun = bs->opaque;
> +    iscsi_allocmap_invalidate(iscsilun);
> +}
> +
>  static QemuOptsList iscsi_create_opts = {
>      .name = "iscsi-create-opts",
>      .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head),
> @@ -1836,7 +1967,9 @@ static BlockDriver bdrv_iscsi = {
>      .bdrv_close      = iscsi_close,
>      .bdrv_create     = iscsi_create,
>      .create_opts     = &iscsi_create_opts,
> -    .bdrv_reopen_prepare  = iscsi_reopen_prepare,
> +    .bdrv_reopen_prepare   = iscsi_reopen_prepare,
> +    .bdrv_reopen_commit    = iscsi_reopen_commit,
> +    .bdrv_invalidate_cache = iscsi_invalidate_cache,
>  
>      .bdrv_getlength  = iscsi_getlength,
>      .bdrv_get_info   = iscsi_get_info,
> 

Queued, thanks.

Paolo
diff mbox

Patch

diff --git a/block/iscsi.c b/block/iscsi.c
index 2ca8e72..fb308f6 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@ 
  * QEMU Block driver for iSCSI images
  *
  * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
- * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2012-2016 Peter Lieven <pl@kamp.de>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -62,7 +62,23 @@  typedef struct IscsiLun {
     struct scsi_inquiry_logical_block_provisioning lbp;
     struct scsi_inquiry_block_limits bl;
     unsigned char *zeroblock;
-    unsigned long *allocationmap;
+    /* The allocmap tracks which clusters (pages) on the iSCSI target are
+     * allocated and which are not. In case a target returns zeros for
+     * unallocated pages (iscsilun->lbprz) we can directly return zeros instead
+     * of reading zeros over the wire if a read request falls within an
+     * unallocated block. As there are 3 possible states we need 2 bitmaps to
+     * track. allocmap_valid keeps track if QEMU's information about a page is
+     * valid. allocmap tracks if a page is allocated or not. In case QEMU has no
+     * valid information about a page the corresponding allocmap entry should be
+     * switched to unallocated as well to force a new lookup of the allocation
+     * status, as lookups are generally skipped if a page is suspected to be
+     * allocated. If an iSCSI target is opened with cache.direct = on, the
+     * allocmap_valid bitmap does not exist, rendering all cached information
+     * invalid so that a fresh lookup is made for any page even if the allocmap
+     * entry says it is unallocated. */
+    unsigned long *allocmap;
+    unsigned long *allocmap_valid;
+    long allocmap_size;
     int cluster_sectors;
     bool use_16_for_rw;
     bool write_protected;
@@ -415,37 +431,132 @@  static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
     return 1;
 }
 
-static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
+static void iscsi_allocmap_free(IscsiLun *iscsilun)
 {
-    return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
-                                                       iscsilun),
-                                       iscsilun->cluster_sectors));
+    g_free(iscsilun->allocmap);
+    g_free(iscsilun->allocmap_valid);
+    iscsilun->allocmap = NULL;
+    iscsilun->allocmap_valid = NULL;
 }
 
-static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
-                                    int nb_sectors)
+
+static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
 {
-    if (iscsilun->allocationmap == NULL) {
-        return;
+    iscsi_allocmap_free(iscsilun);
+
+    iscsilun->allocmap_size =
+        DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
+                     iscsilun->cluster_sectors);
+
+    iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
+    if (!iscsilun->allocmap) {
+        return -ENOMEM;
+    }
+
+    if (open_flags & BDRV_O_NOCACHE) {
+        /* in case cache.direct = on, all allocmap entries are
+         * treated as invalid to force a relookup of the block
+         * status on every read request */
+        return 0;
+    }
+
+    iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size);
+    if (!iscsilun->allocmap_valid) {
+        /* if we are under memory pressure free the allocmap as well */
+        iscsi_allocmap_free(iscsilun);
+        return -ENOMEM;
     }
-    bitmap_set(iscsilun->allocationmap,
-               sector_num / iscsilun->cluster_sectors,
-               DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
+
+    return 0;
 }
 
-static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
-                                      int nb_sectors)
+static void
+iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
+                      int nb_sectors, bool allocated, bool valid)
 {
     int64_t cluster_num, nb_clusters;
-    if (iscsilun->allocationmap == NULL) {
+
+    if (iscsilun->allocmap == NULL) {
         return;
     }
     cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
     nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
                   - cluster_num;
-    if (nb_clusters > 0) {
-        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
+    if (allocated) {
+        bitmap_set(iscsilun->allocmap,
+                   sector_num / iscsilun->cluster_sectors,
+                   DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
+    } else if (nb_clusters > 0) {
+        bitmap_clear(iscsilun->allocmap, cluster_num, nb_clusters);
+    }
+
+    if (iscsilun->allocmap_valid == NULL) {
+        return;
+    }
+    if (valid) {
+        if (nb_clusters > 0) {
+            bitmap_set(iscsilun->allocmap_valid, cluster_num, nb_clusters);
+        }
+    } else {
+        bitmap_clear(iscsilun->allocmap_valid,
+                     sector_num / iscsilun->cluster_sectors,
+                     DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
+    }
+}
+
+static void
+iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t sector_num,
+                             int nb_sectors)
+{
+    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, true, true);
+}
+
+static void
+iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t sector_num,
+                               int nb_sectors)
+{
+    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, true);
+}
+
+static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t sector_num,
+                                       int nb_sectors)
+{
+    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, false);
+}
+
+static void iscsi_allocmap_invalidate(IscsiLun *iscsilun)
+{
+    if (iscsilun->allocmap) {
+        bitmap_zero(iscsilun->allocmap, iscsilun->allocmap_size);
     }
+    if (iscsilun->allocmap_valid) {
+        bitmap_zero(iscsilun->allocmap_valid, iscsilun->allocmap_size);
+    }
+}
+
+static inline bool
+iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num,
+                            int nb_sectors)
+{
+    unsigned long size;
+    if (iscsilun->allocmap == NULL) {
+        return true;
+    }
+    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
+    return !(find_next_bit(iscsilun->allocmap, size,
+                           sector_num / iscsilun->cluster_sectors) == size);
+}
+
+static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
+                                           int64_t sector_num, int nb_sectors)
+{
+    unsigned long size;
+    if (iscsilun->allocmap_valid == NULL) {
+        return false;
+    }
+    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
+    return (find_next_zero_bit(iscsilun->allocmap_valid, size,
+                               sector_num / iscsilun->cluster_sectors) == size);
 }
 
 static int coroutine_fn
@@ -507,26 +618,16 @@  retry:
     }
 
     if (iTask.status != SCSI_STATUS_GOOD) {
+        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
         return iTask.err_code;
     }
 
-    iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
+    iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors);
 
     return 0;
 }
 
 
-static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,
-                                             int64_t sector_num, int nb_sectors)
-{
-    unsigned long size;
-    if (iscsilun->allocationmap == NULL) {
-        return true;
-    }
-    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
-    return !(find_next_bit(iscsilun->allocationmap, size,
-                           sector_num / iscsilun->cluster_sectors) == size);
-}
 
 static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
                                                   int64_t sector_num,
@@ -611,9 +712,9 @@  retry:
     }
 
     if (ret & BDRV_BLOCK_ZERO) {
-        iscsi_allocationmap_clear(iscsilun, sector_num, *pnum);
+        iscsi_allocmap_set_unallocated(iscsilun, sector_num, *pnum);
     } else {
-        iscsi_allocationmap_set(iscsilun, sector_num, *pnum);
+        iscsi_allocmap_set_allocated(iscsilun, sector_num, *pnum);
     }
 
     if (*pnum > nb_sectors) {
@@ -648,16 +749,32 @@  static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
         return -EINVAL;
     }
 
-    if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES &&
-        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
-        int64_t ret;
+    /* if cache.direct is off and we have a valid entry in our allocation map
+     * we can skip checking the block status and directly return zeroes if
+     * the request falls within an unallocated area */
+    if (iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
+        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
+            qemu_iovec_memset(iov, 0, 0x00, iov->size);
+            return 0;
+    }
+
+    if (nb_sectors >= ISCSI_CHECKALLOC_THRES &&
+        !iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
+        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
         int pnum;
         BlockDriverState *file;
-        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file);
+        /* check the block status from the beginning of the cluster
+         * containing the start sector */
+        int64_t ret = iscsi_co_get_block_status(bs,
+                          sector_num - sector_num % iscsilun->cluster_sectors,
+                          INT_MAX, &pnum, &file);
         if (ret < 0) {
             return ret;
         }
-        if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) {
+        /* if the whole request falls into an unallocated area we can avoid
+         * reading and directly return zeroes instead */
+        if (ret & BDRV_BLOCK_ZERO &&
+            pnum >= nb_sectors + sector_num % iscsilun->cluster_sectors) {
             qemu_iovec_memset(iov, 0, 0x00, iov->size);
             return 0;
         }
@@ -964,7 +1081,7 @@  retry:
         return iTask.err_code;
     }
 
-    iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
+    iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
 
     return 0;
 }
@@ -1054,13 +1171,14 @@  retry:
     }
 
     if (iTask.status != SCSI_STATUS_GOOD) {
+        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
         return iTask.err_code;
     }
 
     if (flags & BDRV_REQ_MAY_UNMAP) {
-        iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
+        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
     } else {
-        iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
+        iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors);
     }
 
     return 0;
@@ -1634,10 +1752,7 @@  static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
         iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
                                      iscsilun->block_size) >> BDRV_SECTOR_BITS;
         if (iscsilun->lbprz) {
-            iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
-            if (iscsilun->allocationmap == NULL) {
-                ret = -ENOMEM;
-            }
+            ret = iscsi_allocmap_init(iscsilun, bs->open_flags);
         }
     }
 
@@ -1674,7 +1789,7 @@  static void iscsi_close(BlockDriverState *bs)
     }
     iscsi_destroy_context(iscsi);
     g_free(iscsilun->zeroblock);
-    g_free(iscsilun->allocationmap);
+    iscsi_allocmap_free(iscsilun);
     memset(iscsilun, 0, sizeof(IscsiLun));
 }
 
@@ -1732,6 +1847,16 @@  static int iscsi_reopen_prepare(BDRVReopenState *state,
     return 0;
 }
 
+static void iscsi_reopen_commit(BDRVReopenState *reopen_state)
+{
+    IscsiLun *iscsilun = reopen_state->bs->opaque;
+
+    /* the cache.direct status might have changed */
+    if (iscsilun->allocmap != NULL) {
+        iscsi_allocmap_init(iscsilun, reopen_state->flags);
+    }
+}
+
 static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
 {
     IscsiLun *iscsilun = bs->opaque;
@@ -1751,9 +1876,8 @@  static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
         return -EINVAL;
     }
 
-    if (iscsilun->allocationmap != NULL) {
-        g_free(iscsilun->allocationmap);
-        iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
+    if (iscsilun->allocmap != NULL) {
+        iscsi_allocmap_init(iscsilun, bs->open_flags);
     }
 
     return 0;
@@ -1813,6 +1937,13 @@  static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
     return 0;
 }
 
+static void iscsi_invalidate_cache(BlockDriverState *bs,
+                                   Error **errp)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    iscsi_allocmap_invalidate(iscsilun);
+}
+
 static QemuOptsList iscsi_create_opts = {
     .name = "iscsi-create-opts",
     .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head),
@@ -1836,7 +1967,9 @@  static BlockDriver bdrv_iscsi = {
     .bdrv_close      = iscsi_close,
     .bdrv_create     = iscsi_create,
     .create_opts     = &iscsi_create_opts,
-    .bdrv_reopen_prepare  = iscsi_reopen_prepare,
+    .bdrv_reopen_prepare   = iscsi_reopen_prepare,
+    .bdrv_reopen_commit    = iscsi_reopen_commit,
+    .bdrv_invalidate_cache = iscsi_invalidate_cache,
 
     .bdrv_getlength  = iscsi_getlength,
     .bdrv_get_info   = iscsi_get_info,