Message ID | CAD6h2NQZGBa5U0-tmj=wtsFpcqnV2tHx+_vESrx8+3r7dGpHjQ@mail.gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, May 20, 2014 at 10:16:22PM +0800, Haojian Zhuang wrote: > On 20 May 2014 22:01, Christoffer Dall <christoffer.dall@linaro.org> wrote: > > On Tue, May 20, 2014 at 09:52:53PM +0800, Haojian Zhuang wrote: > >> On 20 May 2014 21:44, Christoffer Dall <christoffer.dall@linaro.org> wrote: > >> > On Tue, May 20, 2014 at 09:10:27PM +0800, Haojian Zhuang wrote: > > > > [...] > > > >> >> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c > >> >> index 47b2983..4c0c1e9 100644 > >> >> --- a/virt/kvm/arm/vgic.c > >> >> +++ b/virt/kvm/arm/vgic.c > >> >> @@ -76,6 +76,8 @@ > >> >> #define IMPLEMENTER_ARM 0x43b > >> >> #define GICC_ARCH_VERSION_V2 0x2 > >> >> > >> >> +#define vgic_nr_lr(vcpu) (vcpu->hw_cfg & HWCFG_NR_LR_MASK) > >> >> + > >> >> /* Physical address of vgic virtual cpu interface */ > >> >> static phys_addr_t vgic_vcpu_base; > >> >> > >> >> @@ -97,7 +99,7 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); > >> >> static void vgic_update_state(struct kvm *kvm); > >> >> static void vgic_kick_vcpus(struct kvm *kvm); > >> >> static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); > >> >> -static u32 vgic_nr_lr; > >> >> +static u32 vgic_hw_cfg; > >> >> > >> >> static unsigned int vgic_maint_irq; > >> >> > >> >> @@ -624,9 +626,9 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) > >> >> struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; > >> >> int vcpu_id = vcpu->vcpu_id; > >> >> int i, irq, source_cpu; > >> >> - u32 *lr; > >> >> + u32 *lr, nr_lr = vgic_nr_lr(vgic_cpu); > >> > > >> > This is static for any system post-boot, right? Can't we set this > >> > global variable once like we did before instead of having to define > >> > these extra variables and do the bit manipulation all over the place? > >> > > >> > -Christoffer > >> > >> I tried to define a global gich_apr variable before. But Marc didn't agree on > >> that. He suggested to use vgic_cpu_nr_lr to save both GICH_APR offset > >> and nr_lr. 
> >> > >> Adding gich_apr variable should be the simpler implementation. > >> > > You're talking about storing this information on the vgic_cpu struct, > > which is accessed on every world-switch patch. There, you don't want > > two memory accesses. > > > diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S > index 76af9302..b27e43f 100644 > --- a/arch/arm/kvm/interrupts_head.S > +++ b/arch/arm/kvm/interrupts_head.S > @@ -419,7 +419,9 @@ vcpu .req r0 @ vcpu pointer > always in r0 > ldr r7, [r2, #GICH_EISR1] > ldr r8, [r2, #GICH_ELRSR0] > ldr r9, [r2, #GICH_ELRSR1] > - ldr r10, [r2, #GICH_APR] > + ldr r10, =gich_apr > + ldr r10, [r10] > + ldr r10, [r2, r10] > > str r3, [r11, #VGIC_CPU_HCR] > str r4, [r11, #VGIC_CPU_VMCR] > @@ -435,7 +437,11 @@ vcpu .req r0 @ vcpu pointer > always in r0 > str r5, [r2, #GICH_HCR] > > /* Save list registers */ > - add r2, r2, #GICH_LR0 > + ldr r10, =gich_apr > + ldr r10, [r10] > + /* the offset between GICH_APR & GICH_LR0 is 0x10 */ > + add r10, r10, #0x10 > + add r2, r2, r10 > add r3, r11, #VGIC_CPU_LR > ldr r4, [r11, #VGIC_CPU_NR_LR] > 1: ldr r6, [r2], #4 > @@ -469,10 +475,16 @@ vcpu .req r0 @ vcpu pointer > always in r0 > > str r3, [r2, #GICH_HCR] > str r4, [r2, #GICH_VMCR] > - str r8, [r2, #GICH_APR] > + ldr r6, =gich_apr > + ldr r6, [r6] > + str r8, [r2, r6] > > /* Restore list registers */ > - add r2, r2, #GICH_LR0 > + ldr r6, =gich_apr > + ldr r6, [r6] > + /* the offset between GICH_APR & GICH_LR0 is 0x10 */ > + add r6, r6, #0x10 > + add r2, r2, r6 > add r3, r11, #VGIC_CPU_LR > ldr r4, [r11, #VGIC_CPU_NR_LR] > 1: ldr r6, [r3], #4 > @@ -618,3 +630,7 @@ vcpu .req r0 @ vcpu pointer > always in r0 > .macro load_vcpu > mrc p15, 4, vcpu, c13, c0, 2 @ HTPIDR > .endm > + > + .global gich_apr > +gich_apr: > + .long GICH_APR > > diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c > index 47b2983..6bf31db 100644 > --- a/virt/kvm/arm/vgic.c > +++ b/virt/kvm/arm/vgic.c > @@ -1470,17 +1470,30 @@ static struct 
notifier_block vgic_cpu_nb = { > .notifier_call = vgic_cpu_notify, > }; > > +static const struct of_device_id of_vgic_ids[] = { > + { > + .compatible = "arm,cortex-a15-gic", > + .data = (void *)GICH_APR, > + }, { > + .compatible = "hisilicon,hip04-gic", > + .data = (void *)HIP04_GICH_APR, > + }, { > + }, > +}; > + > int kvm_vgic_hyp_init(void) > { > int ret; > struct resource vctrl_res; > struct resource vcpu_res; > + const struct of_device_id *match; > > - vgic_node = of_find_compatible_node(NULL, NULL, "arm,cortex-a15-gic"); > + vgic_node = of_find_matching_node_and_match(NULL, of_vgic_ids, &match); > if (!vgic_node) { > kvm_err("error: no compatible vgic node in DT\n"); > return -ENODEV; > } > + gich_apr = (unsigned int)match->data; > > vgic_maint_irq = irq_of_parse_and_map(vgic_node, 0); > if (!vgic_maint_irq) { > > It's the implementation of gich_apr in arm32. > > We needn't add or change anything in struct vgic_cpu. And both the > assembly code and the code could be much easier. > But we do end up with an extra memory access from EL2 in the critical path, and I believe Marc's concern here is that if we cross a cache line, this might really hurt performance. > > Here, on the other hand, you're in host kernel land, and you can do your > > bit-shuffling once, and always access a single static variable like we > > did before, which will simplify the C-code. > > > > No bit-shuffling in gich_apr implementation. Is it right? > I would like to see us avoid allocating that extra nr_lr variable in every function mucking with list registers in the C-file. I would need to look at the data structure size and profile the world-switch code to properly evaluate if it's worth packing the values in a single field, so I'll let Marc comment on this one. -Christoffer
On 20 May 2014 23:05, Christoffer Dall <christoffer.dall@linaro.org> wrote: > On Tue, May 20, 2014 at 10:16:22PM +0800, Haojian Zhuang wrote: >> On 20 May 2014 22:01, Christoffer Dall <christoffer.dall@linaro.org> wrote: >> It's the implementation of gich_apr in arm32. >> >> We needn't add or change anything in struct vgic_cpu. And both the >> assembly code and the code could be much easier. >> > > But we do end up with an extra memory access from EL2 in the critical > path, and I believe Marc's concern here is that if we cross a cache > line, this might really hurt performance. > Sorry. Might we cross a cache line or a TLB entry? I think that you're concerned about crossing TLB entries. My reasoning is below. 1. If the problem is crossing a cache line, it's caused by too many instructions. Either packing nr_lr or adding gich_apr adds some instructions. Packing nr_lr needs a few more instructions. 2. The ldr instruction is a pseudo-instruction, so it's translated into an operation on the PC register. Since I put gich_apr in interrupts_head.S, the gich_apr variable ends up before __kvm_hyp_code_start. It may cross TLB entries. How about declaring gich_apr after __kvm_cpu_return in interrupts.S? Since save_vgic_state & restore_vgic_state are only used once, declaring gich_apr just after the code could avoid crossing a TLB entry. Regards Haojian
On Tue, May 20, 2014 at 11:39:12PM +0800, Haojian Zhuang wrote: > On 20 May 2014 23:05, Christoffer Dall <christoffer.dall@linaro.org> wrote: > > On Tue, May 20, 2014 at 10:16:22PM +0800, Haojian Zhuang wrote: > >> On 20 May 2014 22:01, Christoffer Dall <christoffer.dall@linaro.org> wrote: > >> It's the implementation of gich_apr in arm32. > >> > >> We needn't add or change anything in struct vgic_cpu. And both the > >> assembly code and the code could be much easier. > >> > > > > But we do end up with an extra memory access from EL2 in the critical > > path, and I believe Marc's concern here is that if we cross a cache > > line, this might really hurt performance. > > > > Sorry. Do we may cross a cache line or a TLB entry? > > I think that you're concerning to cross TLB entries. The reason is in > below. > > 1. If the problem is on crossing cache line, it's caused by too much > instructions. Either the packing nr_lr or the gich_apr adds some > instructions. The packing nr_lr needs a little more instructions. I don't see why this argument is valid. If you have a separate instruction and data cache, you may be loading from a different cache line when placing the static value close to your instructions. If you add a variable to the vcpu struct, all of the fields may no longer fit in a single data cache line and you may cause the memory subsystem to have to fetch another cache line. I believe the latter is Marc's concern, and I suspect he would be equally concerned about the former. I'm not too concerned about a TLB entry here, that works at a 4K granularity and with the proper alignment of the struct and hyp code, that shouldn't be a concern. Without it, of course, there's a risk of requiring another TLB entry as well. > > 2. ldr instruction is a pseudo instruction. So it's parsed into operation > on PC register. 
Eh, it just means that it does a load relative from the PC address, and if the offset is too far to be encoded in the immediate field, then it does an indirect load through a literal pool, if I understand what you are referring to. In any case, there will be at least one actual ldr instruction issued on the PE. > Now I put gich_apr in interrupts_head.S, it results > in gich_apr variable before __kvm_hyp_code_start. It may cross the > TLB entries. > How about to declare gich_apr after __kvm_cpu_return in interrupts.S? > Since save_vgic_state & restore_vgic_state is only used once, declaring > gich_apr just after the code could avoid crossing TLB entry. > Again, all the fields in the vcpu struct are quite likely to be aligned within a single data cache line; I don't believe that's the case if you stick some data in between the hyp code. -Christoffer
On 21 May 2014 17:02, Christoffer Dall <christoffer.dall@linaro.org> wrote: > On Tue, May 20, 2014 at 11:39:12PM +0800, Haojian Zhuang wrote: >> On 20 May 2014 23:05, Christoffer Dall <christoffer.dall@linaro.org> wrote: >> > On Tue, May 20, 2014 at 10:16:22PM +0800, Haojian Zhuang wrote: >> >> On 20 May 2014 22:01, Christoffer Dall <christoffer.dall@linaro.org> wrote: >> >> It's the implementation of gich_apr in arm32. >> >> >> >> We needn't add or change anything in struct vgic_cpu. And both the >> >> assembly code and the code could be much easier. >> >> >> > >> > But we do end up with an extra memory access from EL2 in the critical >> > path, and I believe Marc's concern here is that if we cross a cache >> > line, this might really hurt performance. >> > >> >> Sorry. Do we may cross a cache line or a TLB entry? >> >> I think that you're concerning to cross TLB entries. The reason is in >> below. >> >> 1. If the problem is on crossing cache line, it's caused by too much >> instructions. Either the packing nr_lr or the gich_apr adds some >> instructions. The packing nr_lr needs a little more instructions. > > I don't see why this argument is valid. If you have a separate I want to make it clear what I'm missing. > instruction and data cache, you may be loading from a different cache > line when placing the static value close to your instructions. If you > add a variable to the vcpu struct, all of the fields may no longer fit > in a single data cache line and you may cause the memory subsystem to > have to fetch another cache line. I believe the latter is Marc's Yes, I forgot that the new gich_apr is the only variable in the assembly code. So gich_apr will be loaded from a different cache line. Then let's come back to packing hw_cfg. Now the high word is used to store the offset of GICH_APR. The unpacking operation is too complex to calculate the register offset, especially in the arm64 implementation. How about changing the packing mechanism? 1.
Add the definition of the encoding in arm-gic.h. #define HIP04_GIC (1 << 16) #define HIP04_GICH_APR 0x70 #define HIP04_GICH_LR0 0x80 2. The code in save_vgic_state could be changed as below. ldr r9, [r2, #GICH_ELRSR1] +ldr r10, [r3, #VGIC_CPU_HW_CFG] +tst r10, #HIP04_GIC +ldreq r10, [r2, #GICH_APR] +ldrne r10, [r2, #HIP04_GICH_APR] Although I used condition checking here, the code could be simpler. I think that the execution time of "ldr" and "ldreq" should be the same, because the CPSR should be ready. Then the calculation is avoided. Only three instructions are appended for both GICH_APR & GICH_LR0. The implementation in arm64 should be the same & simple. What do you think? Regards Haojian
On Wed, May 21, 2014 at 05:47:00PM +0800, Haojian Zhuang wrote: > On 21 May 2014 17:02, Christoffer Dall <christoffer.dall@linaro.org> wrote: > > On Tue, May 20, 2014 at 11:39:12PM +0800, Haojian Zhuang wrote: > >> On 20 May 2014 23:05, Christoffer Dall <christoffer.dall@linaro.org> wrote: > >> > On Tue, May 20, 2014 at 10:16:22PM +0800, Haojian Zhuang wrote: > >> >> On 20 May 2014 22:01, Christoffer Dall <christoffer.dall@linaro.org> wrote: > >> >> It's the implementation of gich_apr in arm32. > >> >> > >> >> We needn't add or change anything in struct vgic_cpu. And both the > >> >> assembly code and the code could be much easier. > >> >> > >> > > >> > But we do end up with an extra memory access from EL2 in the critical > >> > path, and I believe Marc's concern here is that if we cross a cache > >> > line, this might really hurt performance. > >> > > >> > >> Sorry. Do we may cross a cache line or a TLB entry? > >> > >> I think that you're concerning to cross TLB entries. The reason is in > >> below. > >> > >> 1. If the problem is on crossing cache line, it's caused by too much > >> instructions. Either the packing nr_lr or the gich_apr adds some > >> instructions. The packing nr_lr needs a little more instructions. > > > > I don't see why this argument is valid. If you have a separate > > I want to make it clear what I missing. > > > instruction and data cache, you may be loading from a different cache > > line when placing the static value close to your instructions. If you > > add a variable to the vcpu struct, all of the fields may no longer fit > > in a single data cache line and you may cause the memory subsystem to > > have to fetch another cache line. I believe the latter is Marc's > > Yes, I forgot new gich_apr is the only variable in the assembly code. > So the gich_apr will be load from a different cache line. > > Then let's come back to packing hw_cfg. > > Now the high word is used to store the offset of GICH_APR. 
The > unpacking operation is too complex to calculate the register offset, > especially in arm64 implementation. > > How about changing the packing mechanism? > > 1. Add the definition of enconding in arm-gic.h. > > #define HIP04_GIC (1 << 16) > #define HIP04_GICH_APR 0x70 > #define HIP04_GICH_LR0 0x80 > > 2. The code in save_vgic_state could be changed in below. > > ldr r9, [r2, #GICH_ELRSR1] > +ldr r10, [r3, #VGIC_CPU_HW_CFG] > +tst r10, #HIP04_GIC > +ldreq r10, [r2, #GICH_APR] > +ldrne r10, [r2, #HIP04_GICH_APR] > > Although I used the condition checking at here, the code could > be easier. > > I think that the executing time on "ldr" and "ldreq" should be same, > because CPCS should be ready > > Then calculation is avoid. Only three instructions are appended > for both GICH_APR & GICH_LR0. The implementation in arm64 > should be same & simple. > I think you misunderstood my point. Keep the assembly code as is, store the APR and the NR_LR in the HW_CFG always, on all systems, and don't use any conditionals in the assembly code (code is difficult to read, instruction prefetching and speculative execution becomes difficult, etc.). Only change something in the C-code. Set a static variable there during vgic_hyp_init and get rid of all the local variable declarations that dereference the vgic_vcpu struct. -Christoffer
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S index 76af9302..b27e43f 100644 --- a/arch/arm/kvm/interrupts_head.S +++ b/arch/arm/kvm/interrupts_head.S @@ -419,7 +419,9 @@ vcpu .req r0 @ vcpu pointer always in r0 ldr r7, [r2, #GICH_EISR1] ldr r8, [r2, #GICH_ELRSR0] ldr r9, [r2, #GICH_ELRSR1] - ldr r10, [r2, #GICH_APR] + ldr r10, =gich_apr + ldr r10, [r10] + ldr r10, [r2, r10] str r3, [r11, #VGIC_CPU_HCR] str r4, [r11, #VGIC_CPU_VMCR] @@ -435,7 +437,11 @@ vcpu .req r0 @ vcpu pointer always in r0 str r5, [r2, #GICH_HCR] /* Save list registers */ - add r2, r2, #GICH_LR0 + ldr r10, =gich_apr + ldr r10, [r10] + /* the offset between GICH_APR & GICH_LR0 is 0x10 */ + add r10, r10, #0x10 + add r2, r2, r10 add r3, r11, #VGIC_CPU_LR ldr r4, [r11, #VGIC_CPU_NR_LR] 1: ldr r6, [r2], #4 @@ -469,10 +475,16 @@ vcpu .req r0 @ vcpu pointer always in r0 str r3, [r2, #GICH_HCR] str r4, [r2, #GICH_VMCR] - str r8, [r2, #GICH_APR] + ldr r6, =gich_apr + ldr r6, [r6] + str r8, [r2, r6] /* Restore list registers */ - add r2, r2, #GICH_LR0 + ldr r6, =gich_apr + ldr r6, [r6] + /* the offset between GICH_APR & GICH_LR0 is 0x10 */ + add r6, r6, #0x10 + add r2, r2, r6 add r3, r11, #VGIC_CPU_LR ldr r4, [r11, #VGIC_CPU_NR_LR] 1: ldr r6, [r3], #4 @@ -618,3 +630,7 @@ vcpu .req r0 @ vcpu pointer always in r0 .macro load_vcpu mrc p15, 4, vcpu, c13, c0, 2 @ HTPIDR .endm + + .global gich_apr +gich_apr: + .long GICH_APR diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 47b2983..6bf31db 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1470,17 +1470,30 @@ static struct notifier_block vgic_cpu_nb = { .notifier_call = vgic_cpu_notify, }; +static const struct of_device_id of_vgic_ids[] = { + { + .compatible = "arm,cortex-a15-gic", + .data = (void *)GICH_APR, + }, { + .compatible = "hisilicon,hip04-gic", + .data = (void *)HIP04_GICH_APR, + }, { + }, +}; + int kvm_vgic_hyp_init(void) { int ret; struct resource vctrl_res; struct resource vcpu_res; + const 
struct of_device_id *match; - vgic_node = of_find_compatible_node(NULL, NULL, "arm,cortex-a15-gic"); + vgic_node = of_find_matching_node_and_match(NULL, of_vgic_ids, &match); if (!vgic_node) { kvm_err("error: no compatible vgic node in DT\n"); return -ENODEV; } + gich_apr = (unsigned int)match->data; vgic_maint_irq = irq_of_parse_and_map(vgic_node, 0); if (!vgic_maint_irq) {