@@ -228,21 +228,29 @@ ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!p || IS_ERR(p)) // EOF or an error
break;
err = m->op->show(m, p);
- if (err < 0) // hard error
- break;
- if (unlikely(err)) // ->show() says "skip it"
- m->count = 0;
- if (unlikely(!m->count)) { // empty record
- p = m->op->next(m, p, &m->index);
- continue;
+ if (err != -EOVERFLOW) {
+ if (err < 0) // hard error
+ break;
+ if (unlikely(err)) // ->show() says "skip it"
+ m->count = 0;
+ if (unlikely(!m->count)) { // empty record
+ p = m->op->next(m, p, &m->index);
+ continue;
+ }
+ if (!seq_has_overflowed(m)) // got it
+ goto Fill;
}
- if (!seq_has_overflowed(m)) // got it
- goto Fill;
// need a bigger buffer
m->op->stop(m, p);
kvfree(m->buf);
m->count = 0;
- m->buf = seq_buf_alloc(m->size <<= 1);
+ do {
+ m->size <<= 1;
+ /* set 16MB as limit in case max_showsize goes wide */
+ if (m->size > (1 << 24))
+ goto Enomem;
+ } while (m->size <= m->max_showsize);
+ m->buf = seq_buf_alloc(m->size);
if (!m->buf)
goto Enomem;
p = m->op->start(m, &m->index);
@@ -26,6 +26,7 @@ struct seq_file {
int poll_event;
const struct file *file;
void *private;
+ size_t max_showsize;
};
struct seq_operations {
@@ -52,6 +53,24 @@ static inline bool seq_has_overflowed(struct seq_file *m)
return m->count == m->size;
}
+/**
+ * seq_check_showsize_overflow - check if the buffer *would* be overflowed
+ * @m: the seq_file handle
+ * @showsize: the number of bytes to be written to buffer in next show()
+ *
+ * If next show() would not fit in current buffer, no need to fetch
+ * and print it. The showsize should be estimated without actually
+ * fetching the data.
+ *
+ * Returns true if print the data of size showsize would cause buffer
+ * overflowed.
+ */
+static inline bool seq_check_showsize_overflow(struct seq_file *m, size_t showsize)
+{
+ m->max_showsize = max(m->max_showsize, showsize);
+ return m->count + showsize >= m->size;
+}
+
/**
* seq_get_buf - get buffer to write arbitrary data to
* @m: the seq_file handle
Overflow happens when op->show() reaches buffer capacity, and when this happens, the data fetched and printed is wasted for current show() and needs to be fetched and printed again in next show(). Assuming each show() yields data with size m, then for every `floor(seq_file->size/m)` show() calls there will be one show() duplicated due to overflow. When m is larger than half of seq_file->size, every show() will be duplicated except the first one, this happens to /proc/interrupts when there are hundreds of CPUs. More than one duplicated/wasted show() for a single record happens when the record size for show() is way too large, because seq_read() only doubles buffer size and make a retry until no overflow happened. For example, /proc/vmallocinfo yields huge size of data with a *single* show() call, on one instance, the size is 424810 bytes, the buffer doubling process would make 8 rounds of retries to make buffer large enough: (PAGE_SIZE=4096)*2*2*2*2*2*2*2=524288 > 424810, meaning one read of /proc/vmallocinfo actually iterates the whole vm data 8 times. Huge data yielded by one single show() also happens to arch_show_interrupts() for /proc/interrupts when there are hundreds of CPUs: on arch x86 with 256 CPUs, arch_show_interrupts() would yield about 48436 bytes, which needs 5 round of data fetching: 4096*2*2*2*2=65536 > 48436. The lack of information is the problem here, add an interface for caller to register an *estimated* showsize before actually fetching/printing the real data. Buffer size doubling calculation stop until reach the maximum registered showsize, and when an overflow is expected, break out the show() loop and copy the data to user buffer. For example, show_interrupts() can use following code to check overflow before fetching/printing interrupt counters /* nr_cpu+2 columns, each 10bytes wide */ if (seq_check_showsize_overflow(p, (nr_cpu+2)*10)) return -EOVERFLOW; On a 1CPU vm, duplicate cpu interrupt data 256 times in show_interrupts() and arch_show_interrupts() for /proc/interrupts, timing for reads without seq_check_showsize_overflow(): $ strace -T -e read cat /proc/interrupts > /dev/null ... read(3, " CPU0 CPU0 "..., 131072) = 2828 <0.000078> read(3, " 1: 9 9 "..., 131072) = 2850 <0.000054> read(3, " 8: 0 0 "..., 131072) = 2849 <0.000072> read(3, " 9: 0 0 "..., 131072) = 2849 <0.000080> read(3, " 12: 144 144 "..., 131072) = 2850 <0.000075> read(3, " 16: 0 0 "..., 131072) = 2855 <0.000076> read(3, " 21: 14 14 "..., 131072) = 2848 <0.000077> read(3, " 22: 10 10 "..., 131072) = 2861 <0.000076> read(3, " 24: 1 1 "..., 131072) = 2882 <0.000073> read(3, " 25: 1 1 "..., 131072) = 2882 <0.000074> read(3, " 26: 1 1 "..., 131072) = 2882 <0.000074> read(3, " 27: 1 1 "..., 131072) = 2882 <0.000101> read(3, " 28: 1 1 "..., 131072) = 2882 <0.000064> read(3, " 29: 1 1 "..., 131072) = 2882 <0.000066> read(3, " 30: 1 1 "..., 131072) = 2882 <0.000065> read(3, " 31: 0 0 "..., 131072) = 2872 <0.000052> read(3, " 32: 3787 3787 "..., 131072) = 2871 <0.000049> read(3, " 33: 0 0 "..., 131072) = 2872 <0.000079> read(3, " 34: 127 127 "..., 131072) = 2873 <0.000073> read(3, " 35: 103 103 "..., 131072) = 2874 <0.000055> read(3, " 36: 32 32 "..., 131072) = 2866 <0.000051> read(3, " 38: 139 139 "..., 131072) = 2875 <0.000069> read(3, " 39: 0 0 "..., 131072) = 2872 <0.000067> read(3, " 40: 17 17 "..., 131072) = 2876 <0.000055> read(3, " 41: 105 105 "..., 131072) = 2876 <0.000194> read(3, "NMI: 0 0 "..., 131072) = 48436 <0.000874> read(3, "", 131072) = 0 <0.000013> Adding up, totally 0.002736s. And with seq_check_showsize_overflow() added in show_interrupts(): $ strace -T -e read cat /proc/interrupts > /dev/null ... read(3, " CPU0 CPU0 "..., 131072) = 2828 <0.000070> read(3, " 1: 9 9 "..., 131072) = 2850 <0.000040> read(3, " 8: 0 0 "..., 131072) = 2849 <0.000041> read(3, " 9: 0 0 "..., 131072) = 2849 <0.000040> read(3, " 12: 144 144 "..., 131072) = 2850 <0.000060> read(3, " 16: 0 0 "..., 131072) = 2855 <0.000042> read(3, " 21: 14 14 "..., 131072) = 2848 <0.000045> read(3, " 22: 10 10 "..., 131072) = 2861 <0.000044> read(3, " 24: 1 1 "..., 131072) = 2882 <0.000042> read(3, " 25: 1 1 "..., 131072) = 2882 <0.000042> read(3, " 26: 1 1 "..., 131072) = 2882 <0.000042> read(3, " 27: 1 1 "..., 131072) = 2882 <0.000042> read(3, " 28: 1 1 "..., 131072) = 2882 <0.000041> read(3, " 29: 1 1 "..., 131072) = 2882 <0.000036> read(3, " 30: 1 1 "..., 131072) = 2882 <0.000042> read(3, " 31: 0 0 "..., 131072) = 2872 <0.000041> read(3, " 32: 285 285 "..., 131072) = 2873 <0.000044> read(3, " 33: 238 238 "..., 131072) = 2874 <0.000044> read(3, " 34: 138 138 "..., 131072) = 2875 <0.000044> read(3, " 35: 0 0 "..., 131072) = 2872 <0.000042> read(3, " 36: 3856 3856 "..., 131072) = 2871 <0.000044> read(3, " 37: 32 32 "..., 131072) = 2866 <0.000066> read(3, " 39: 0 0 "..., 131072) = 2872 <0.000064> read(3, " 40: 22 22 "..., 131072) = 2876 <0.000066> read(3, " 41: 105 105 "..., 131072) = 2876 <0.000066> read(3, "NMI: 0 0 "..., 131072) = 48436 <0.000296> read(3, "", 131072) = 0 <0.000037> Adding up, totally 0.001523s, a significant improvement comparing with 0.002736s. Signed-off-by: David Wang <00107082@163.com> --- fs/seq_file.c | 28 ++++++++++++++++++---------- include/linux/seq_file.h | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+), 10 deletions(-)