From fb2a329828e64ad0e224a8cb97dbc17147149629 Mon Sep 17 00:00:00 2001
From: Timofey Titovets <nefelim4ag@gmail.com>
Date: Mon, 23 Oct 2017 21:24:29 +0300
Subject: [PATCH] Btrfs: heuristic try avoid bucket sorting on edge data cases
Heap sort as used in the kernel is too slow and costly,
so let's make some statistical assumptions about edge input data cases,
based on observation of the difference between min/max values in a bucket.
Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
---
fs/btrfs/compression.c | 38 ++++++++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
@@ -1310,8 +1310,46 @@ static int byte_core_set_size(struct heuristic_ws *ws)
u32 i;
u32 coreset_sum = 0;
const u32 core_set_threshold = ws->sample_size * 90 / 100;
+ struct bucket_item *max, *min;
+ struct bucket_item tmp;
struct bucket_item *bucket = ws->bucket;
+
+ /* Pre-scan to find the min/max values */
+ max = &bucket[0];
+ min = &bucket[BUCKET_SIZE - 1];
+ for (i = 1; i < BUCKET_SIZE - 1; i++) {
+ if (bucket[i].count > max->count) {
+ tmp = *max;
+ *max = bucket[i];
+ bucket[i] = tmp;
+ }
+ if (bucket[i].count < min->count) {
+ tmp = *min;
+ *min = bucket[i];
+ bucket[i] = tmp;
+ }
+ }
+
+ /*
+ * Hack to avoid sorting on edge data cases (sorting is too costly),
+ * i.e. this quickly filters out easily compressible
+ * and poorly compressible data.
+ * Based on observation of number distribution on different data sets.
+ *
+ * Assumption 1: for poorly compressible data the difference between
+ * min/max will be less than 0.6% of the sample size
+ *
+ * Assumption 2: for well compressible data the difference between
+ * min/max will be far bigger than 4% of the sample size
+ */
+
+ if (max->count - min->count < ws->sample_size * 6 / 1000)
+ return BYTE_CORE_SET_HIGH + 1;
+
+ if (max->count - min->count > ws->sample_size * 4 / 100)
+ return BYTE_CORE_SET_LOW - 1;
+
/* Sort in reverse order */
sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL);
--
2.14.2