| Message ID | 1547054628-12703-5-git-send-email-longman@redhat.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | /proc/stat: Reduce irqs counting performance overhead |
```
On 01/09/2019 12:23 PM, Waiman Long wrote:
> After skipping the percpu summation of non-active IRQs on a 4-socket
> Broadwell system with about 3k IRQs, about half of the CPU cycles were
> spent in the kstat_irqs() call, the majority of which were used to look
> up the IRQ descriptors for the corresponding IRQ numbers.
>
> We can recoup a lot of those lost cycles by calling kstat_irqs_usr()
> only for those IRQs that are active. A bitmap is now used to keep track
> of the active IRQs. Changes in the nr_active_irqs count will cause the
> code to rescan all the IRQs and repopulate the bitmap.
>
> On the same 4-socket server, the introduction of this patch further
> reduces the system time of reading /proc/stat 5k times from 8.048s
> to 5.817s. This is another time reduction of 28%.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  fs/proc/stat.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 84 insertions(+)
>
> diff --git a/fs/proc/stat.c b/fs/proc/stat.c
> index 4b06f1b..5e2a398 100644
> --- a/fs/proc/stat.c
> +++ b/fs/proc/stat.c
> @@ -93,6 +93,25 @@ static u64 compute_stat_irqs_sum(void)
>  }
>
>  /*
> + * Write the given number of space separated '0' into the sequence file.
> + */
> +static void write_zeros(struct seq_file *p, int cnt)
> +{
> +        /* String of 16 '0's */
> +        static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
> +
> +        while (cnt > 0) {
> +                if (cnt >= 16) {
> +                        seq_write(p, zeros, 32);
> +                        cnt -= 16;
> +                } else {
> +                        seq_write(p, zeros, 2 * cnt);
> +                        cnt = 0;
> +                }
> +        }
> +}
> +
> +/*
>   * Print out the "intr" line of /proc/stat.
>   */
>  static void show_stat_irqs(struct seq_file *p)
> @@ -100,9 +119,74 @@ static void show_stat_irqs(struct seq_file *p)
>          int i;
>
>          seq_put_decimal_ull(p, "intr ", compute_stat_irqs_sum());
> +
> +        if (IS_ENABLED(CONFIG_SMP) && (nr_cpu_ids >= 10) && (nr_irqs >= 256)) {
> +                /*
> +                 * On systems with 10 or more CPUs and 256 or more IRQs,
> +                 * we used a bitmap to keep track of the number of active
> +                 * IRQs and call kstat_irqs_usr() only for those IRQs.
> +                 * The bitmap will be refreshed whenever nr_active_irqs
> +                 * changes.
> +                 */
> +                extern atomic_t nr_active_irqs;
> +                static DEFINE_MUTEX(irqs_mutex);
> +                static int last_irq = -1;
> +                static int bitmap_size, active_irqs;
> +                static unsigned long *bitmap;
> +                int current_irqs = atomic_read(&nr_active_irqs);
> +
> +                mutex_lock(&irqs_mutex);
> +                if (current_irqs != active_irqs) {
> +                        /*
> +                         * Rescan all the IRQs for active ones.
> +                         */
> +                        if (nr_irqs > bitmap_size) {
> +                                static unsigned long *new_bitmap;
> +                                static int new_size;
> +
> +                                new_size = BITS_TO_LONGS(nr_irqs)*sizeof(long);
> +                                new_bitmap = (unsigned long *)krealloc(bitmap,
> +                                                new_size, GFP_KERNEL);
> +                                if (!new_bitmap)
> +                                        goto fallback;
> +                                bitmap = new_bitmap;
> +                                bitmap_size = new_size;
> +                        }
> +                        memset(bitmap, 0, bitmap_size/BITS_PER_BYTE);
> +                        last_irq = 0;

Sorry, last_irq should be initialized to -1 here.

Cheers,
Longman
```
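Read against the quoted rescan path, the correction Longman is pointing at would presumably amount to the following (a sketch of the implied change, not a hunk actually posted in this thread):

```c
			/*
			 * Rescan all the IRQs for active ones. Resetting
			 * last_irq to -1 (matching its static initializer)
			 * means a rescan that finds no active IRQ leaves the
			 * fast-path loop "for (i = 0; i <= last_irq; i++)"
			 * with nothing to visit, whereas 0 would make it
			 * probe IRQ 0 unconditionally.
			 */
			memset(bitmap, 0, bitmap_size/BITS_PER_BYTE);
			last_irq = -1;
```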
```diff
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 4b06f1b..5e2a398 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -93,6 +93,25 @@ static u64 compute_stat_irqs_sum(void)
 }
 
 /*
+ * Write the given number of space separated '0' into the sequence file.
+ */
+static void write_zeros(struct seq_file *p, int cnt)
+{
+        /* String of 16 '0's */
+        static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
+
+        while (cnt > 0) {
+                if (cnt >= 16) {
+                        seq_write(p, zeros, 32);
+                        cnt -= 16;
+                } else {
+                        seq_write(p, zeros, 2 * cnt);
+                        cnt = 0;
+                }
+        }
+}
+
+/*
  * Print out the "intr" line of /proc/stat.
  */
 static void show_stat_irqs(struct seq_file *p)
@@ -100,9 +119,74 @@ static void show_stat_irqs(struct seq_file *p)
         int i;
 
         seq_put_decimal_ull(p, "intr ", compute_stat_irqs_sum());
+
+        if (IS_ENABLED(CONFIG_SMP) && (nr_cpu_ids >= 10) && (nr_irqs >= 256)) {
+                /*
+                 * On systems with 10 or more CPUs and 256 or more IRQs,
+                 * we used a bitmap to keep track of the number of active
+                 * IRQs and call kstat_irqs_usr() only for those IRQs.
+                 * The bitmap will be refreshed whenever nr_active_irqs
+                 * changes.
+                 */
+                extern atomic_t nr_active_irqs;
+                static DEFINE_MUTEX(irqs_mutex);
+                static int last_irq = -1;
+                static int bitmap_size, active_irqs;
+                static unsigned long *bitmap;
+                int current_irqs = atomic_read(&nr_active_irqs);
+
+                mutex_lock(&irqs_mutex);
+                if (current_irqs != active_irqs) {
+                        /*
+                         * Rescan all the IRQs for active ones.
+                         */
+                        if (nr_irqs > bitmap_size) {
+                                static unsigned long *new_bitmap;
+                                static int new_size;
+
+                                new_size = BITS_TO_LONGS(nr_irqs)*sizeof(long);
+                                new_bitmap = (unsigned long *)krealloc(bitmap,
+                                                new_size, GFP_KERNEL);
+                                if (!new_bitmap)
+                                        goto fallback;
+                                bitmap = new_bitmap;
+                                bitmap_size = new_size;
+                        }
+                        memset(bitmap, 0, bitmap_size/BITS_PER_BYTE);
+                        last_irq = 0;
+                        for_each_irq_nr(i) {
+                                int cnt = kstat_irqs_usr(i);
+
+                                if (cnt) {
+                                        bitmap_set(bitmap, 0, i);
+                                        last_irq = i;
+                                }
+                                seq_put_decimal_ull(p, " ", cnt);
+                        }
+                        active_irqs = current_irqs;
+                        mutex_unlock(&irqs_mutex);
+                        goto out;
+                }
+                /*
+                 * Retrieve counts from active IRQs only.
+                 */
+                for (i = 0; i <= last_irq; i++) {
+                        int next = find_next_bit(bitmap, last_irq + 1, i);
+
+                        if (next > i)
+                                write_zeros(p, next - i);
+                        i = next;
+                        seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+                }
+                mutex_unlock(&irqs_mutex);
+                write_zeros(p, nr_irqs - i);
+                goto out;
+        }
+fallback:
         for_each_irq_nr(i)
                 seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+out:
         seq_putc(p, '\n');
 }
```
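For context on what write_zeros() is optimizing: the "intr" line is the summed interrupt count followed by one per-IRQ count for every possible IRQ number, so on a sparse system with ~3k IRQs most of the line is long runs of " 0". Each zero costs exactly two bytes, which is why the helper can emit up to 16 zeros (32 bytes) per seq_write() call. An illustrative excerpt (the counts here are made up):

```
intr 128640123 22 0 0 0 0 0 0 0 1 4 0 0 ...
```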
```
After skipping the percpu summation of non-active IRQs on a 4-socket
Broadwell system with about 3k IRQs, about half of the CPU cycles were
spent in the kstat_irqs() call, the majority of which were used to look
up the IRQ descriptors for the corresponding IRQ numbers.

We can recoup a lot of those lost cycles by calling kstat_irqs_usr()
only for those IRQs that are active. A bitmap is now used to keep track
of the active IRQs. Changes in the nr_active_irqs count will cause the
code to rescan all the IRQs and repopulate the bitmap.

On the same 4-socket server, the introduction of this patch further
reduces the system time of reading /proc/stat 5k times from 8.048s
to 5.817s. This is another time reduction of 28%.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 fs/proc/stat.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
```
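The posting does not include the benchmark itself; a minimal user-space harness matching the description (read /proc/stat 5,000 times, then look at the system-time component via time(1)) might look like the sketch below. The file name and iteration count come from the commit message; everything else is an assumption, and the actual harness behind the 8.048s -> 5.817s numbers is not shown in this thread.

```c
/*
 * Hypothetical harness for the "read /proc/stat 5k times" measurement
 * quoted above. Build with `cc -O2 bench.c` and run as `time ./a.out`;
 * the "sys" figure is the number the commit message is comparing.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[1 << 16];	/* one read usually grabs the whole file */

	for (int i = 0; i < 5000; i++) {
		int fd = open("/proc/stat", O_RDONLY);

		if (fd < 0) {
			perror("open /proc/stat");
			return 1;
		}
		while (read(fd, buf, sizeof(buf)) > 0)
			;	/* drain the file to force the full print */
		close(fd);
	}
	return 0;
}
```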