From patchwork Wed Feb 11 11:41:26 2009 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jamie Lokier X-Patchwork-Id: 6659 Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id n1BBfWrw026711 for ; Wed, 11 Feb 2009 11:41:32 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752624AbZBKLla (ORCPT ); Wed, 11 Feb 2009 06:41:30 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1752831AbZBKLla (ORCPT ); Wed, 11 Feb 2009 06:41:30 -0500 Received: from mail2.shareable.org ([80.68.89.115]:40987 "EHLO mail2.shareable.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752624AbZBKLl3 (ORCPT ); Wed, 11 Feb 2009 06:41:29 -0500 Received: from jamie by mail2.shareable.org with local (Exim 4.63) (envelope-from ) id 1LXDT4-0000In-L9; Wed, 11 Feb 2009 11:41:26 +0000 Date: Wed, 11 Feb 2009 11:41:26 +0000 From: Jamie Lokier To: Kevin Wolf Cc: qemu-devel@nongnu.org, kvm-devel Subject: Re: [Qemu-devel] qcow2 corruption observed, fixed by reverting old change Message-ID: <20090211114126.GC31997@shareable.org> References: <20090211070049.GA27821@shareable.org> <4992A108.8070304@suse.de> MIME-Version: 1.0 Content-Disposition: inline In-Reply-To: <4992A108.8070304@suse.de> User-Agent: Mutt/1.5.13 (2006-08-11) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org Kevin Wolf wrote: > Jamie Lokier schrieb: > > Although there are many ways to make Windows blue screen in KVM, in > > this case I've narrowed it down to the difference in > > qemu/block-qcow2.c between kvm-72 and kvm-73 (not -83). > > This must be one of SVN revisions 5003 to 5008 in upstream qemu. Can you > narrow it down to one of these? I certainly don't feel like reviewing > all of them once again. It's QEMU SVN delta 5005-5006, copied below. I've tested by applying the diffs up to QEMU SVN revs 5003 to 500, onto kvm-72, and 5005-5006 is the diff which triggers the failed guest boot, consistently. Aside from logic, the code mixes signed 32-bit with unsigned 64-bit with unclear naming which would make me nervous. My host is 64-bit, by the way. -- Jamie --- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html --- trunk/block-qcow2.c 2008/08/14 18:09:32 5005 +++ trunk/block-qcow2.c 2008/08/14 18:10:28 5006 @@ -52,6 +52,8 @@ #define QCOW_CRYPT_NONE 0 #define QCOW_CRYPT_AES 1 +#define QCOW_MAX_CRYPT_CLUSTERS 32 + /* indicate that the refcount of the referenced cluster is exactly one. */ #define QCOW_OFLAG_COPIED (1LL << 63) /* indicate that the cluster is compressed (they never have the copied flag) */ @@ -263,7 +265,8 @@ if (!s->cluster_cache) goto fail; /* one more sector for decompressed data alignment */ - s->cluster_data = qemu_malloc(s->cluster_size + 512); + s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + + 512); if (!s->cluster_data) goto fail; s->cluster_cache_offset = -1; @@ -612,43 +615,98 @@ * For a given offset of the disk image, return cluster offset in * qcow2 file. * + * on entry, *num is the number of contiguous clusters we'd like to + * access following offset. + * + * on exit, *num is the number of contiguous clusters we can read. + * * Return 1, if the offset is found * Return 0, otherwise. * */ -static uint64_t get_cluster_offset(BlockDriverState *bs, uint64_t offset) +static uint64_t get_cluster_offset(BlockDriverState *bs, + uint64_t offset, int *num) { BDRVQcowState *s = bs->opaque; int l1_index, l2_index; - uint64_t l2_offset, *l2_table, cluster_offset; + uint64_t l2_offset, *l2_table, cluster_offset, next; + int l1_bits; + int index_in_cluster, nb_available, nb_needed; + + index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); + nb_needed = *num + index_in_cluster; + + l1_bits = s->l2_bits + s->cluster_bits; + + /* compute how many bytes there are between the offset and + * and the end of the l1 entry + */ + + nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1)); + + /* compute the number of available sectors */ + + nb_available = (nb_available >> 9) + index_in_cluster; + + cluster_offset = 0; /* seek the the l2 offset in the l1 table */ - l1_index = offset >> (s->l2_bits + s->cluster_bits); + l1_index = offset >> l1_bits; if (l1_index >= s->l1_size) - return 0; + goto out; l2_offset = s->l1_table[l1_index]; /* seek the l2 table of the given l2 offset */ if (!l2_offset) - return 0; + goto out; /* load the l2 table in memory */ l2_offset &= ~QCOW_OFLAG_COPIED; l2_table = l2_load(bs, l2_offset); if (l2_table == NULL) - return 0; + goto out; /* find the cluster offset for the given disk offset */ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); cluster_offset = be64_to_cpu(l2_table[l2_index]); + nb_available = s->cluster_sectors; + l2_index++; + + if (!cluster_offset) { - return cluster_offset & ~QCOW_OFLAG_COPIED; + /* how many empty clusters ? */ + + while (nb_available < nb_needed && !l2_table[l2_index]) { + l2_index++; + nb_available += s->cluster_sectors; + } + } else { + + /* how many allocated clusters ? */ + + cluster_offset &= ~QCOW_OFLAG_COPIED; + while (nb_available < nb_needed) { + next = be64_to_cpu(l2_table[l2_index]) & ~QCOW_OFLAG_COPIED; + if (next != cluster_offset + (nb_available << 9)) + break; + l2_index++; + nb_available += s->cluster_sectors; + } + } + +out: + if (nb_available > nb_needed) + nb_available = nb_needed; + + *num = nb_available - index_in_cluster; + + return cluster_offset; } /* @@ -659,13 +717,10 @@ */ static void free_any_clusters(BlockDriverState *bs, - uint64_t cluster_offset) + uint64_t cluster_offset, int nb_clusters) { BDRVQcowState *s = bs->opaque; - if (cluster_offset == 0) - return; - /* free the cluster */ if (cluster_offset & QCOW_OFLAG_COMPRESSED) { @@ -677,7 +732,9 @@ return; } - free_clusters(bs, cluster_offset, s->cluster_size); + free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits); + + return; } /* @@ -768,7 +825,8 @@ if (cluster_offset & QCOW_OFLAG_COPIED) return cluster_offset & ~QCOW_OFLAG_COPIED; - free_any_clusters(bs, cluster_offset); + if (cluster_offset) + free_any_clusters(bs, cluster_offset, 1); cluster_offset = alloc_bytes(bs, compressed_size); nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - @@ -806,68 +864,136 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end) + int n_start, int n_end, + int *num) { BDRVQcowState *s = bs->opaque; int l2_index, ret; uint64_t l2_offset, *l2_table, cluster_offset; + int nb_available, nb_clusters, i; + uint64_t start_sect, current; ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index); if (ret == 0) return 0; + nb_clusters = ((n_end << 9) + s->cluster_size - 1) >> + s->cluster_bits; + if (nb_clusters > s->l2_size - l2_index) + nb_clusters = s->l2_size - l2_index; + cluster_offset = be64_to_cpu(l2_table[l2_index]); - if (cluster_offset & QCOW_OFLAG_COPIED) - return cluster_offset & ~QCOW_OFLAG_COPIED; - free_any_clusters(bs, cluster_offset); + /* We keep all QCOW_OFLAG_COPIED clusters */ + + if (cluster_offset & QCOW_OFLAG_COPIED) { + + for (i = 1; i < nb_clusters; i++) { + current = be64_to_cpu(l2_table[l2_index + i]); + if (cluster_offset + (i << s->cluster_bits) != current) + break; + } + nb_clusters = i; + + nb_available = nb_clusters << (s->cluster_bits - 9); + if (nb_available > n_end) + nb_available = n_end; + + cluster_offset &= ~QCOW_OFLAG_COPIED; + + goto out; + } + + /* for the moment, multiple compressed clusters are not managed */ + + if (cluster_offset & QCOW_OFLAG_COMPRESSED) + nb_clusters = 1; + + /* how many empty or how many to free ? */ + + if (!cluster_offset) { + + /* how many free clusters ? */ + + i = 1; + while (i < nb_clusters && + l2_table[l2_index + i] == 0) { + i++; + } + nb_clusters = i; + + } else { + + /* how many contiguous clusters ? */ + + for (i = 1; i < nb_clusters; i++) { + current = be64_to_cpu(l2_table[l2_index + i]); + if (cluster_offset + (i << s->cluster_bits) != current) + break; + } + nb_clusters = i; + + free_any_clusters(bs, cluster_offset, i); + } /* allocate a new cluster */ - cluster_offset = alloc_clusters(bs, s->cluster_size); + cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size); /* we must initialize the cluster content which won't be written */ - if ((n_end - n_start) < s->cluster_sectors) { - uint64_t start_sect; - - start_sect = (offset & ~(s->cluster_size - 1)) >> 9; - ret = copy_sectors(bs, start_sect, - cluster_offset, 0, n_start); + nb_available = nb_clusters << (s->cluster_bits - 9); + if (nb_available > n_end) + nb_available = n_end; + + /* copy content of unmodified sectors */ + + start_sect = (offset & ~(s->cluster_size - 1)) >> 9; + if (n_start) { + ret = copy_sectors(bs, start_sect, cluster_offset, 0, n_start); if (ret < 0) return 0; - ret = copy_sectors(bs, start_sect, - cluster_offset, n_end, s->cluster_sectors); + } + + if (nb_available & (s->cluster_sectors - 1)) { + uint64_t end = nb_available & ~(uint64_t)(s->cluster_sectors - 1); + ret = copy_sectors(bs, start_sect + end, + cluster_offset + (end << 9), + nb_available - end, + s->cluster_sectors); if (ret < 0) return 0; } /* update L2 table */ - l2_table[l2_index] = cpu_to_be64(cluster_offset | QCOW_OFLAG_COPIED); + for (i = 0; i < nb_clusters; i++) + l2_table[l2_index + i] = cpu_to_be64((cluster_offset + + (i << s->cluster_bits)) | + QCOW_OFLAG_COPIED); + if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t), l2_table + l2_index, - sizeof(uint64_t)) != sizeof(uint64_t)) + nb_clusters * sizeof(uint64_t)) != + nb_clusters * sizeof(uint64_t)) return 0; +out: + *num = nb_available - n_start; + return cluster_offset; } static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - BDRVQcowState *s = bs->opaque; - int index_in_cluster, n; uint64_t cluster_offset; - cluster_offset = get_cluster_offset(bs, sector_num << 9); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) - n = nb_sectors; - *pnum = n; + *pnum = nb_sectors; + cluster_offset = get_cluster_offset(bs, sector_num << 9, pnum); + return (cluster_offset != 0); } @@ -944,11 +1070,9 @@ uint64_t cluster_offset; while (nb_sectors > 0) { - cluster_offset = get_cluster_offset(bs, sector_num << 9); + n = nb_sectors; + cluster_offset = get_cluster_offset(bs, sector_num << 9, &n); index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) - n = nb_sectors; if (!cluster_offset) { if (bs->backing_hd) { /* read from the base image */ @@ -987,15 +1111,17 @@ BDRVQcowState *s = bs->opaque; int ret, index_in_cluster, n; uint64_t cluster_offset; + int n_end; while (nb_sectors > 0) { index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) - n = nb_sectors; + n_end = index_in_cluster + nb_sectors; + if (s->crypt_method && + n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) + n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; cluster_offset = alloc_cluster_offset(bs, sector_num << 9, index_in_cluster, - index_in_cluster + n); + n_end, &n); if (!cluster_offset) return -1; if (s->crypt_method) { @@ -1068,11 +1194,9 @@ } /* prepare next AIO request */ - acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9); + acb->n = acb->nb_sectors; + acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n); index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); - acb->n = s->cluster_sectors - index_in_cluster; - if (acb->n > acb->nb_sectors) - acb->n = acb->nb_sectors; if (!acb->cluster_offset) { if (bs->backing_hd) { @@ -1152,6 +1276,7 @@ int index_in_cluster; uint64_t cluster_offset; const uint8_t *src_buf; + int n_end; acb->hd_aiocb = NULL; @@ -1174,19 +1299,22 @@ } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); - acb->n = s->cluster_sectors - index_in_cluster; - if (acb->n > acb->nb_sectors) - acb->n = acb->nb_sectors; + n_end = index_in_cluster + acb->nb_sectors; + if (s->crypt_method && + n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) + n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9, index_in_cluster, - index_in_cluster + acb->n); + n_end, &acb->n); if (!cluster_offset || (cluster_offset & 511) != 0) { ret = -EIO; goto fail; } if (s->crypt_method) { if (!acb->cluster_data) { - acb->cluster_data = qemu_mallocz(s->cluster_size); + acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * + s->cluster_size); if (!acb->cluster_data) { ret = -ENOMEM; goto fail;