From patchwork Sun Oct 11 03:53:03 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xiao Guangrong X-Patchwork-Id: 7368221 Return-Path: X-Original-To: patchwork-kvm@patchwork.kernel.org Delivered-To: patchwork-parsemail@patchwork1.web.kernel.org Received: from mail.kernel.org (mail.kernel.org [198.145.29.136]) by patchwork1.web.kernel.org (Postfix) with ESMTP id C99019F40A for ; Sat, 10 Oct 2015 19:59:49 +0000 (UTC) Received: from mail.kernel.org (localhost [127.0.0.1]) by mail.kernel.org (Postfix) with ESMTP id 4F41E20929 for ; Sat, 10 Oct 2015 19:59:48 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 90ABE2091D for ; Sat, 10 Oct 2015 19:59:46 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752733AbbJJT7o (ORCPT ); Sat, 10 Oct 2015 15:59:44 -0400 Received: from mga03.intel.com ([134.134.136.65]:57371 "EHLO mga03.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751944AbbJJT7e (ORCPT ); Sat, 10 Oct 2015 15:59:34 -0400 Received: from orsmga003.jf.intel.com ([10.7.209.27]) by orsmga103.jf.intel.com with ESMTP; 10 Oct 2015 12:59:32 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.17,664,1437462000"; d="scan'208";a="661783249" Received: from xiaoreal1.sh.intel.com (HELO xiaoreal1.sh.intel.com.sh.intel.com) ([10.239.48.79]) by orsmga003.jf.intel.com with ESMTP; 10 Oct 2015 12:59:28 -0700 From: Xiao Guangrong To: pbonzini@redhat.com, imammedo@redhat.com Cc: gleb@kernel.org, mtosatti@redhat.com, stefanha@redhat.com, mst@redhat.com, rth@twiddle.net, ehabkost@redhat.com, dan.j.williams@intel.com, kvm@vger.kernel.org, qemu-devel@nongnu.org, Xiao Guangrong Subject: [PATCH v3 31/32] nvdimm: allow using whole backend memory as pmem Date: Sun, 11 Oct 2015 11:53:03 +0800 Message-Id: <1444535584-18220-32-git-send-email-guangrong.xiao@linux.intel.com> X-Mailer: git-send-email 
1.8.3.1 In-Reply-To: <1444535584-18220-1-git-send-email-guangrong.xiao@linux.intel.com> References: <1444535584-18220-1-git-send-email-guangrong.xiao@linux.intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Spam-Status: No, score=-5.0 required=5.0 tests=BAYES_00, DATE_IN_FUTURE_06_12, RCVD_IN_DNSWL_HI, T_RP_MATCHES_RCVD, UNPARSEABLE_RELAY autolearn=unavailable version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on mail.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP Introduce a parameter named "reserve-label-data". When it is false, QEMU does not reserve any region of the backend memory for label data; instead, it builds read-only label data in memory that has an active namespace containing the whole backend memory. This is useful for users who want to pass through a whole nvdimm device and make its data completely visible to the guest. The parameter is false by default. Signed-off-by: Xiao Guangrong --- hw/mem/Makefile.objs | 3 +- hw/mem/nvdimm/acpi.c | 20 +++ hw/mem/nvdimm/internal.h | 3 + hw/mem/nvdimm/namespace.c | 309 ++++++++++++++++++++++++++++++++++++++++++++++ hw/mem/nvdimm/nvdimm.c | 36 +++++- include/hw/mem/nvdimm.h | 4 + 6 files changed, 369 insertions(+), 6 deletions(-) create mode 100644 hw/mem/nvdimm/namespace.c diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs index 7310bac..fc76ca5 100644 --- a/hw/mem/Makefile.objs +++ b/hw/mem/Makefile.objs @@ -1,3 +1,4 @@ common-obj-$(CONFIG_DIMM) += dimm.o common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o -common-obj-$(CONFIG_NVDIMM) += nvdimm/nvdimm.o nvdimm/acpi.o +common-obj-$(CONFIG_NVDIMM) += nvdimm/nvdimm.o nvdimm/acpi.o \ + nvdimm/namespace.o diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c index 6f05b37..e6694bc 100644 --- a/hw/mem/nvdimm/acpi.c +++ b/hw/mem/nvdimm/acpi.c @@ -305,6 +305,8 @@ static void build_device_structure(GSList *device_list, char *buf) { for (; device_list; device_list = device_list->next) { 
NVDIMMDevice *nvdimm = device_list->data; + nfit_memdev *memdev; + nfit_dcr *dcr; /* build System Physical Address Range Description Table. */ buf += build_structure_spa(buf, nvdimm); @@ -313,10 +315,15 @@ static void build_device_structure(GSList *device_list, char *buf) * build Memory Device to System Physical Address Range Mapping * Table. */ + memdev = (nfit_memdev *)buf; buf += build_structure_memdev(buf, nvdimm); /* build Control Region Descriptor Table. */ + dcr = (struct nfit_dcr *)buf; buf += build_structure_dcr(buf, nvdimm); + + calculate_nvdimm_isetcookie(nvdimm, memdev->region_offset, + dcr->serial_number); } } @@ -560,6 +567,12 @@ dsm_cmd_set_label_data(NVDIMMDevice *nvdimm, dsm_in *in, dsm_out *out) goto exit; } + if (!nvdimm->reserve_label_data) { + out->len = sizeof(out->status); + status = DSM_STATUS_NOT_SUPPORTED; + goto exit; + } + status = DSM_STATUS_SUCCESS; memcpy(nvdimm->label_data + offset, cmd_in->in_buf, length); out->len = sizeof(status); @@ -583,6 +596,10 @@ static void dsm_write_nvdimm(MemoryRegion *dsm_ram_mr, uint32_t handle, switch (function) { case DSM_CMD_IMPLEMENTED: cmd_list = DIMM_SUPPORT_CMD; + if (!nvdimm->reserve_label_data) { + cmd_list &= ~(1 << DSM_CMD_SET_NAMESPACE_LABEL_DATA); + } + out->len = sizeof(out->cmd_implemented); out->cmd_implemented.cmd_list = cpu_to_le64(cmd_list); goto free; @@ -936,6 +953,9 @@ void nvdimm_build_acpi_table(NVDIMMState *state, GArray *table_offsets, nvdimm_build_ssdt(state, device_list, table_offsets, table_data, linker); + + build_nvdimm_label_data(device_list); + g_slist_free(device_list); } } diff --git a/hw/mem/nvdimm/internal.h b/hw/mem/nvdimm/internal.h index 1e95363..f523175 100644 --- a/hw/mem/nvdimm/internal.h +++ b/hw/mem/nvdimm/internal.h @@ -35,4 +35,7 @@ typedef struct uuid_le uuid_le; (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) } }) GSList *nvdimm_get_built_list(void); +void calculate_nvdimm_isetcookie(NVDIMMDevice *nvdimm, uint64_t spa_offset, + uint32_t sn); +void 
build_nvdimm_label_data(GSList *device_list); #endif diff --git a/hw/mem/nvdimm/namespace.c b/hw/mem/nvdimm/namespace.c new file mode 100644 index 0000000..fe58f9a --- /dev/null +++ b/hw/mem/nvdimm/namespace.c @@ -0,0 +1,309 @@ +/* + * NVDIMM Namespace Support + * + * Copyright(C) 2015 Intel Corporation. + * + * Author: + * Xiao Guangrong + * + * NVDIMM namespace specification can be found at: + * http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#include "hw/mem/nvdimm.h" +#include "internal.h" + +static uint64_t fletcher64(void *addr, size_t len) +{ + uint32_t *buf = addr; + uint32_t lo32 = 0; + uint64_t hi32 = 0; + int i; + + for (i = 0; i < len / sizeof(uint32_t); i++) { + lo32 += cpu_to_le32(buf[i]); + hi32 += lo32; + } + + return hi32 << 32 | lo32; +} + +struct interleave_set_info { + struct interleave_set_info_map { + uint64_t region_spa_offset; + uint32_t serial_number; + uint32_t zero; + } mapping[1]; +}; +typedef struct interleave_set_info interleave_set_info; + +void calculate_nvdimm_isetcookie(NVDIMMDevice *nvdimm, uint64_t spa_offset, + uint32_t sn) +{ + interleave_set_info info; + + info.mapping[0].region_spa_offset = spa_offset; + info.mapping[0].serial_number = sn; + info.mapping[0].zero = 0; + + nvdimm->isetcookie = fletcher64(&info, sizeof(info)); +} + +#define NSINDEX_SIGNATURE "NAMESPACE_INDEX\0" + 
+enum { + NSINDEX_SIG_LEN = 16, + NSINDEX_ALIGN = 256, + NSINDEX_SEQ_MASK = 0x3, + NSINDEX_MAJOR = 0x1, + NSINDEX_MINOR = 0x1, + + NSLABEL_UUID_LEN = 16, + NSLABEL_NAME_LEN = 64, + NSLABEL_FLAG_ROLABEL = 0x1, /* read-only label */ + NSLABEL_FLAG_LOCAL = 0x2, /* DIMM-local namespace */ + NSLABEL_FLAG_BTT = 0x4, /* namespace contains a BTT */ + NSLABEL_FLAG_UPDATING = 0x8, /* label being updated */ +}; + +/* + * struct namespace_label_index_block - label set superblock + * @sig: NAMESPACE_INDEX\0 + * @flags: placeholder + * @seq: sequence number for this index + * @myoff: offset of this index in label area + * @mysize: size of this index struct + * @otheroff: offset of other index + * @labeloff: offset of first label slot + * @nlabel: total number of label slots + * @major: label area major version + * @minor: label area minor version + * @checksum: fletcher64 of all fields + * @free[0]: bitmap, nlabel bits + * + * The size of free[] is rounded up so the total struct size is a + * multiple of NSINDEX_ALIGN bytes. Any bits this allocates beyond + * nlabel bits must be zero. 
+ */ +struct namespace_label_index_block { + uint8_t sig[NSINDEX_SIG_LEN]; + uint32_t flags; + uint32_t seq; + uint64_t myoff; + uint64_t mysize; + uint64_t otheroff; + uint64_t labeloff; + uint32_t nlabel; + uint16_t major; + uint16_t minor; + uint64_t checksum; + uint8_t free[0]; +} QEMU_PACKED; +typedef struct namespace_label_index_block namespace_label_index_block; + +/* + * struct namespace_label - namespace superblock + * @uuid: UUID per RFC 4122 + * @name: optional name (NUL-terminated) + * @flags: see NSLABEL_FLAG_* + * @nlabel: num labels to describe this ns + * @position: label's position in set + * @isetcookie: interleave set cookie + * @lbasize: LBA size in bytes or 0 for pmem + * @dpa: DPA of NVM range on this DIMM + * @rawsize: size of namespace + * @slot: slot of this label in label area + * @unused: must be zero + */ +struct namespace_label { + uint8_t uuid[NSLABEL_UUID_LEN]; + uint8_t name[NSLABEL_NAME_LEN]; + uint32_t flags; + uint16_t nlabel; + uint16_t position; + uint64_t isetcookie; + uint64_t lbasize; + uint64_t dpa; + uint64_t rawsize; + uint32_t slot; + uint32_t unused; +} QEMU_PACKED; +typedef struct namespace_label namespace_label; + +/* calculate the number of labels the whole label data can contain. */ +static int label_data_max_label_nr(NVDIMMDevice *nvdimm, size_t block_size) +{ + /* there are 2 namespace label index blocks in total. */ + if (block_size * 2 >= nvdimm->label_size) { + return 0; + } + + return (nvdimm->label_size - block_size * 2) / sizeof(namespace_label); +} + +/* calculate the number of labels one index block can track. 
*/ +static int label_index_block_max_label_nr(size_t block_size) +{ + int free_size; + + free_size = block_size - sizeof(namespace_label_index_block); + + return free_size * BITS_PER_BYTE; +} + +static int calculate_max_label_nr(NVDIMMDevice *nvdimm, size_t block_size) +{ + return MIN(label_index_block_max_label_nr(block_size), + label_data_max_label_nr(nvdimm, block_size)); +} + +/* + * check if we can increase the size of namespace_label_index_block to + * contain more labels. + */ +static bool can_increase_index_block(NVDIMMDevice *nvdimm, + size_t block_size, int label_nr) +{ + size_t remaining; + + remaining = nvdimm->label_size - block_size * 2 - + label_nr * sizeof(namespace_label); + + assert((int64_t)remaining >= 0); + + /* can contain 1 label at least. */ + return remaining >= NSINDEX_ALIGN * 2 + sizeof(namespace_label); +} + +static void count_label_nr(NVDIMMDevice *nvdimm, size_t *label_block_size, + int *label_nr) +{ + *label_block_size = 0; + + do { + /* + * The minimum size of an index block is 256 bytes and the size must + * be a multiple of 256 bytes. + */ + *label_block_size += NSINDEX_ALIGN; + + *label_nr = calculate_max_label_nr(nvdimm, *label_block_size); + } while (can_increase_index_block(nvdimm, *label_block_size, *label_nr)); +} + +static void namespace_label_uuid(NVDIMMDevice *nvdimm, void *uuid) +{ + /* magic UUID. 
*/ + uuid_le label_uuid_init = UUID_LE(0x137e67a9, 0x7dcb, 0x4c66, 0xb2, + 0xe6, 0x05, 0x06, 0x5b, 0xeb, + 0x6a, 0x00); + int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP, NULL); + + label_uuid_init.b[0] += slot; + memcpy(uuid, &label_uuid_init, sizeof(label_uuid_init)); +} + +static void nvdimm_device_init_namespace(NVDIMMDevice *nvdimm) +{ + namespace_label_index_block *index1, *index2; + namespace_label *label; + uint64_t addr = object_property_get_int(OBJECT(nvdimm), DIMM_ADDR_PROP, + NULL); + uint64_t size = object_property_get_int(OBJECT(nvdimm), DIMM_SIZE_PROP, + NULL); + int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP, NULL); + int i, label_nr; + size_t label_block_size; + + nvdimm->label_data = g_malloc(nvdimm->label_size); + + count_label_nr(nvdimm, &label_block_size, &label_nr); + nvdebug("nvdimm%d: label_block_size 0x%lx label_nr %d.\n", + slot, label_block_size, label_nr); + + index1 = nvdimm->label_data; + + /* + * init the first namespace label index block, except @otheroff + * and @checksum. they are filled in later. + */ + memcpy(index1->sig, NSINDEX_SIGNATURE, sizeof(NSINDEX_SIGNATURE)); + index1->flags = cpu_to_le32(0); + index1->seq = cpu_to_le32(0x1); + index1->myoff = cpu_to_le64(0); + index1->mysize = cpu_to_le64(label_block_size); + index1->labeloff = cpu_to_le64(label_block_size * 2); + index1->nlabel = cpu_to_le32(label_nr); + index1->major = cpu_to_le16(NSINDEX_MAJOR); + index1->minor = cpu_to_le16(NSINDEX_MINOR); + index1->checksum = cpu_to_le64(0); + memset(index1->free, 0, + label_block_size - sizeof(namespace_label_index_block)); + + /* + * the label slot with the lowest offset in the label storage area is + * tracked by the least significant bit of the first byte of the free + * array. + * + * the first label is used. + */ + for (i = 1; i < index1->nlabel; i++) { + set_bit(i, (unsigned long *)index1->free); + } + + /* init the second namespace label index block. 
*/ + index2 = (void *)index1 + label_block_size; + memcpy(index2, index1, label_block_size); + index2->seq = cpu_to_le32(0x2); + index2->myoff = cpu_to_le64(label_block_size); + + /* init @otheroff and @checksum. */ + index1->otheroff = cpu_to_le64(index2->myoff); + index2->otheroff = cpu_to_le64(index1->myoff); + index1->checksum = cpu_to_le64(fletcher64(index1, label_block_size)); + index2->checksum = cpu_to_le64(fletcher64(index2, label_block_size)); + + /* only the first label is used, and it is read-only. */ + label = nvdimm->label_data + label_block_size * 2; + namespace_label_uuid(nvdimm, label->uuid); + sprintf((char *)label->name, "QEMU NS%d", slot); + label->flags = cpu_to_le32(NSLABEL_FLAG_ROLABEL); + label->nlabel = cpu_to_le16(1); + label->position = cpu_to_le16(0); + label->isetcookie = cpu_to_le64(nvdimm->isetcookie); + label->lbasize = cpu_to_le64(0); + label->dpa = cpu_to_le64(addr); + label->rawsize = cpu_to_le64(size); + label->slot = cpu_to_le32(0); + label->unused = cpu_to_le32(0); + + nvdebug("nvdimm%d, checksum1 0x%lx checksum2 0x%lx isetcookie 0x%lx.\n", + slot, index1->checksum, index2->checksum, + label->isetcookie); +} + +void build_nvdimm_label_data(GSList *device_list) +{ + for (; device_list; device_list = device_list->next) { + NVDIMMDevice *nvdimm = device_list->data; + + if (nvdimm->label_data) { + continue; + } + + nvdimm_device_init_namespace(nvdimm); + } +} diff --git a/hw/mem/nvdimm/nvdimm.c b/hw/mem/nvdimm/nvdimm.c index bc8c577..9688533 100644 --- a/hw/mem/nvdimm/nvdimm.c +++ b/hw/mem/nvdimm/nvdimm.c @@ -62,14 +62,15 @@ static void nvdimm_realize(DIMMDevice *dimm, Error **errp) { MemoryRegion *mr; NVDIMMDevice *nvdimm = NVDIMM(dimm); - uint64_t size; + uint64_t reserved_label_size, size; nvdimm->label_size = MIN_NAMESPACE_LABEL_SIZE; + reserved_label_size = nvdimm->reserve_label_data ? 
nvdimm->label_size : 0; mr = host_memory_backend_get_memory(dimm->hostmem, errp); size = memory_region_size(mr); - if (size <= nvdimm->label_size) { + if (size <= reserved_label_size) { char *path = object_get_canonical_path_component(OBJECT(dimm->hostmem)); error_setg(errp, "the size of memdev %s (0x%" PRIx64 ") is too small" " to contain nvdimm namespace label (0x%" PRIx64 ")", path, @@ -78,9 +79,12 @@ static void nvdimm_realize(DIMMDevice *dimm, Error **errp) } memory_region_init_alias(&nvdimm->nvdimm_mr, OBJECT(dimm), "nvdimm-memory", - mr, 0, size - nvdimm->label_size); - nvdimm->label_data = memory_region_get_ram_ptr(mr) + - memory_region_size(&nvdimm->nvdimm_mr); + mr, 0, size - reserved_label_size); + + if (reserved_label_size) { + nvdimm->label_data = memory_region_get_ram_ptr(mr) + + memory_region_size(&nvdimm->nvdimm_mr); + } } static void nvdimm_class_init(ObjectClass *oc, void *data) @@ -95,10 +99,32 @@ static void nvdimm_class_init(ObjectClass *oc, void *data) ddc->get_memory_region = nvdimm_get_memory_region; } +static bool nvdimm_get_reserve_label_data(Object *obj, Error **errp) +{ + NVDIMMDevice *nvdimm = NVDIMM(obj); + + return nvdimm->reserve_label_data; +} + +static void nvdimm_set_reserve_label_data(Object *obj, bool value, Error **errp) +{ + NVDIMMDevice *nvdimm = NVDIMM(obj); + + nvdimm->reserve_label_data = value; +} + +static void nvdimm_init(Object *obj) +{ + object_property_add_bool(obj, "reserve-label-data", + nvdimm_get_reserve_label_data, + nvdimm_set_reserve_label_data, NULL); +} + static TypeInfo nvdimm_info = { .name = TYPE_NVDIMM, .parent = TYPE_DIMM, .instance_size = sizeof(NVDIMMDevice), + .instance_init = nvdimm_init, .class_init = nvdimm_class_init, }; diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 0a6bda4..a8eef65 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -28,8 +28,12 @@ struct NVDIMMDevice { DIMMDevice parent_obj; /* public */ + bool reserve_label_data; uint64_t label_size; 
void *label_data; + + uint64_t isetcookie; + MemoryRegion nvdimm_mr; }; typedef struct NVDIMMDevice NVDIMMDevice;