======================================================================
Trats: fixed 100 MHz | 62.15 MB/s | 17.44 MB/s | 119.50 mA
Trats: mmc devfreq | 58.45 MB/s | 17.47 MB/s | 116.40 mA
======================================================================
Rinat: fixed 100 MHz | 78.70 MB/s | 6.2 MB/s | 27.16 mA
Rinat: mmc devfreq | 62.60 MB/s | 6.0 MB/s | 26.89 mA
1. 'fixed' means fixed slot clock 100 MHz (bus clock 200 MHz)
2. 'mmc devfreq' means scaled slot clock from 25 to 100 MHz (bus clock
50 MHz and 100 MHz)
Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
---
Documentation/devicetree/bindings/mmc/mmc.txt | 2 +
drivers/mmc/card/block.c | 247 ++++++++++++++++++++++++++
drivers/mmc/core/Kconfig | 16 ++
drivers/mmc/core/core.h | 1 -
drivers/mmc/core/host.c | 2 +
include/linux/mmc/card.h | 8 +
include/linux/mmc/host.h | 3 +
7 files changed, 278 insertions(+), 1 deletion(-)
@@ -42,6 +42,8 @@ Optional properties:
- mmc-hs400-1_2v: eMMC HS400 mode(1.2V I/O) is supported
- dsr: Value the card's (optional) Driver Stage Register (DSR) should be
programmed with. Valid range: [0 .. 0xffff].
+- frequency-scaling: when present, dynamic frequency scaling for this
+ host is supported and will be enabled (if kernel supports it)
*NOTE* on CD and WP polarity. To use common for all SD/MMC host controllers line
polarity properties, we have to fix the meaning of the "normal" and "inverted"
@@ -35,6 +35,7 @@
#include <linux/capability.h>
#include <linux/compat.h>
#include <linux/pm_runtime.h>
+#include <linux/devfreq.h>
#include <linux/mmc/ioctl.h>
#include <linux/mmc/card.h>
@@ -319,6 +320,243 @@ static void mmc_blk_release(struct gendisk *disk, fmode_t mode)
mutex_unlock(&block_mutex);
}
+#ifdef CONFIG_MMC_DEVFREQ
+
+/*
+ * TODO:
+ * - work with clkgate
+ * - sometimes devfreq_work is not executed on intensive IO...
+ * [ 33.883417] mmcblk mmc1:0001: clk 100000000, busy 4840 ms, time 4840 ms, ceil 17997 kB/s
+ * [ 41.513621] mmcblk mmc1:0001: clk 100000000, busy 7630 ms, time 7630 ms, ceil 18226 kB/s
+ * [ 43.363617] mmcblk mmc1:0001: clk 100000000, busy 1806 ms, time 1849 ms, ceil 18226 kB/s
+ * Looks like an issue with deferrable work. The polling times here are way too long.
+ *
+ * - devm_devfreq_register_opp_notifier
+ *
+ * - Setting the MMC frequency when the card is very busy may take a very
+ *   long time because mmc_claim_host() must wait for an idle gap on the bus.
+ *   This should probably be done in a workqueue.
+ */
+
+#ifdef CONFIG_MMC_CLKGATE
+#error "Currently MMC devfreq conflicts with clkgate. Choose one."
+#endif
+
+#define MMC_DEVFREQ_GOVERNOR "simple_ondemand"
+/*
+ * Device utilization level is measured as ratio of current throughput
+ * to maximum throughput (reached in device lifecycle) for given frequency
+ * level:
+ * total_time = polling time,
+ * busy_time = (bytes / max_throughput) * (max_freq / cur_freq)
+ *
+ * This is still inaccurate, especially since some loads may achieve
+ * higher throughput than others for the same expected busyness of the device.
+ * For example reading from MMC to memory has higher throughput than
+ * copying data on the same MMC.
+ *
+ * To boost frequency early and reduce this inaccurate utilization metrics
+ * assume 30% of throughput as saturated.
+ */
+#define MMC_DEVFREQ_SATURATION 30
+
+/*
+ * Candidate slot clock frequencies, in Hz, ascending. Entries above
+ * host->f_max are skipped at registration time in mmc_devfreq_init().
+ * TODO: take these from DT as OPP
+ */
+static const unsigned int mmc_devfreq_frequencies[] = {
+	25000000,
+	50000000,
+	100000000,
+	200000000,
+	400000000,
+};
+
+/*
+ * get_clk_ratio_throughput() - ratio of the highest registered OPP
+ * frequency to @freq.
+ *
+ * Used to scale measured busy time by the current frequency level so the
+ * utilization reflects how busy the device would be at top speed.
+ * Returns 0 when no OPPs were registered (df_num_freq == 0).
+ * Callers guarantee @freq is non-zero (they bail out when the host clock
+ * is not running).
+ */
+static unsigned int get_clk_ratio_throughput(struct mmc_card *card,
+					     unsigned int freq)
+{
+	unsigned int max_opp;
+
+	/* Check first: df_num_freq - 1 would wrap for an empty table */
+	if (!card->df_num_freq)
+		return 0;
+
+	max_opp = card->df_num_freq - 1;
+
+	return (mmc_devfreq_frequencies[max_opp] / freq);
+}
+
+/*
+ * mmc_devfreq_target() - devfreq callback to change the host clock rate
+ *
+ * Picks the OPP recommended for the requested *freq and, when it differs
+ * from the current rate, reprograms the host clock under the host claim.
+ * NOTE(review): mmc_claim_host() may block for a long time on a busy
+ * card (see TODO at the top of this section) -- confirm this is
+ * acceptable in the devfreq governor context.
+ */
+static int mmc_devfreq_target(struct device *dev, unsigned long *freq, u32 flags)
+{
+	struct dev_pm_opp *opp;
+	unsigned long new_freq;
+	struct mmc_card *card = mmc_dev_to_card(dev);
+	unsigned int cur_freq = mmc_host_clk_rate(card->host);
+
+	/* Host clock not running: nothing to scale */
+	if (!cur_freq)
+		return 0;
+
+	/* OPP lookups must be done under the RCU read lock */
+	rcu_read_lock();
+	opp = devfreq_recommended_opp(dev, freq, flags);
+	if (IS_ERR(opp)) {
+		rcu_read_unlock();
+		return PTR_ERR(opp);
+	}
+	new_freq = dev_pm_opp_get_freq(opp);
+	rcu_read_unlock();
+
+	if (cur_freq == new_freq)
+		return 0;
+
+	dev_dbg(dev, "%u Hz -> %lu Hz\n", cur_freq, new_freq);
+
+	mmc_claim_host(card->host);
+	mmc_set_clock(card->host, new_freq);
+	mmc_release_host(card->host);
+
+	return 0;
+}
+
+/*
+ * mmc_devfreq_get_dev_status() - devfreq callback reporting utilization
+ *
+ * Converts the byte count accumulated since the previous poll into a
+ * busy_time/total_time pair: busy_time is the transfer time the bytes
+ * would have taken at the best throughput ever observed, boosted by the
+ * current-to-max frequency ratio (see get_clk_ratio_throughput()).
+ */
+static int mmc_devfreq_get_dev_status(struct device *dev,
+				      struct devfreq_dev_status *stat)
+{
+	struct mmc_card *card = mmc_dev_to_card(dev);
+	ktime_t old_time, cur_time;
+	/*
+	 * TODO: All calculations done on 64-bit numbers but is it really
+	 * necessary? The "bytes" are "int" and they won't overflow
+	 * (2 GB between polling intervals). Polling time (in ms) also
+	 * shouldn't overflow.
+	 */
+	s64 diff_time;
+	u64 bps, bytes;
+	unsigned int cur_freq = mmc_host_clk_rate(card->host);
+
+	if (!cur_freq)
+		return -ENODEV;
+
+	/*
+	 * After getting current measurements, reset byte counter and time
+	 * of polling early to reduce possible misses of processed requests.
+	 */
+	bytes = (u64)atomic_xchg(&card->df_bytes, 0);
+
+	old_time = card->df_poll_time;
+	cur_time = card->df_poll_time = ktime_get();
+	diff_time = ktime_to_ms(ktime_sub(cur_time, old_time));
+	/*
+	 * Reject a zero-length interval (it would divide by zero below)
+	 * and anything outside unsigned range, for a safe promotion to
+	 * unsigned in do_div().
+	 */
+	if (diff_time <= 0 || diff_time > UINT_MAX)
+		return -EINVAL;
+
+	/*
+	 * Calculate bytes per second and update card throughput. Diff time
+	 * is in ms so multiply by 1000.
+	 */
+	bytes *= 1000;
+	bps = bytes;
+	do_div(bps, diff_time);
+	card->df_ceil_bps = max(card->df_ceil_bps, bps);
+
+	/* Calculate busy time: bytes/max_throughput */
+	do_div(bytes, card->df_ceil_bps);
+
+	/*
+	 * Boost by current frequency level. This boost may result in
+	 * exceeding total_time.
+	 */
+	bytes *= get_clk_ratio_throughput(card, cur_freq);
+
+	stat->busy_time = bytes;
+	stat->total_time = diff_time;
+	stat->current_frequency = cur_freq;
+
+	/* Only debug */
+	{
+		u64 ceil = card->df_ceil_bps;
+		do_div(ceil, 1024);
+		/* diff_time is s64: use %lld, not %llu */
+		dev_dbg(&card->dev, "clk: %u, busy: %llu ms, time: %lld ms, ceil: %llu kB/s\n",
+			cur_freq, bytes, diff_time, ceil);
+	}
+	return 0;
+}
+
+/* Polled devfreq profile; 100 ms polling interval (see TODO on misses) */
+static struct devfreq_dev_profile mmc_devfreq_profile = {
+	.initial_freq = 50000000, /* overwritten with current freq in init */
+	.polling_ms = 100,
+	.target = mmc_devfreq_target,
+	.get_dev_status = mmc_devfreq_get_dev_status,
+};
+
+/* Tunables for the simple_ondemand governor (see MMC_DEVFREQ_SATURATION) */
+static struct devfreq_simple_ondemand_data mmc_governor_data = {
+	.upthreshold = MMC_DEVFREQ_SATURATION,
+	.downdifferential = 5,
+};
+
+/*
+ * mmc_devfreq_init() - register OPPs and a devfreq device for @card
+ *
+ * Registers every table frequency up to host->f_max as an OPP and starts
+ * frequency scaling with the simple_ondemand governor. Bails out early
+ * (leaving card->devfreq NULL, i.e. scaling disabled) when fewer than
+ * two usable frequencies exist or registration fails; probing continues
+ * at a fixed frequency in that case.
+ */
+static void mmc_devfreq_init(struct mmc_card *card)
+{
+	int ret;
+	unsigned int i;
+	unsigned int max_freq = card->host->f_max;
+
+	WARN_ON(!max_freq);
+	for (i = 0; i < ARRAY_SIZE(mmc_devfreq_frequencies); i++) {
+		if (mmc_devfreq_frequencies[i] > max_freq)
+			break;
+
+		ret = dev_pm_opp_add(&card->dev, mmc_devfreq_frequencies[i], 0);
+		if (ret) {
+			dev_err(&card->dev,
+				"Cannot add opp entries: %d\n", ret);
+			return;
+		}
+	}
+	card->df_num_freq = i;
+
+	if (card->df_num_freq < 2) {
+		dev_info(&card->dev,
+			 "Not enough frequencies for devfreq (device supports %u frequencies)\n",
+			 card->df_num_freq);
+		return;
+	}
+
+	/* Start scaling from wherever the host clock currently is */
+	mmc_devfreq_profile.initial_freq = mmc_host_clk_rate(card->host);
+
+	card->devfreq = devm_devfreq_add_device(&card->dev,
+			&mmc_devfreq_profile, MMC_DEVFREQ_GOVERNOR,
+			&mmc_governor_data);
+	if (IS_ERR(card->devfreq)) {
+		/* Don't fail the probe, but don't fail silently either */
+		dev_err(&card->dev, "Cannot add devfreq device: %ld\n",
+			PTR_ERR(card->devfreq));
+		card->devfreq = NULL;
+	} else {
+		dev_info(&card->dev,
+			 "Starting frequency scaling with %u frequencies\n",
+			 card->df_num_freq);
+		card->df_poll_time = ktime_get();
+		card->df_ceil_bps = 1; /* Non-zero to avoid first do_div by 0 */
+	}
+	/* TODO: devm_devfreq_register_opp_notifier */
+}
+
+/*
+ * mmc_devfreq_exit() - remove the OPPs added by mmc_devfreq_init()
+ *
+ * Iterates over the whole candidate table. NOTE(review): entries above
+ * host->f_max were never added -- confirm dev_pm_opp_remove() tolerates
+ * being asked to remove a frequency that was not registered.
+ */
+static void mmc_devfreq_exit(struct mmc_card *card)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mmc_devfreq_frequencies); i++)
+		dev_pm_opp_remove(&card->dev, mmc_devfreq_frequencies[i]);
+}
+
+/*
+ * mmc_devfreq_account_req() - account a block request's size for devfreq
+ *
+ * Called on the request issue path. The counter is read-and-reset by
+ * mmc_devfreq_get_dev_status() from devfreq polling, hence the atomic
+ * accumulator. No-op until devfreq registration has succeeded.
+ */
+static void mmc_devfreq_account_req(struct mmc_card *card, struct request *req)
+{
+	if (!card->devfreq)
+		return;
+
+	atomic_add(blk_rq_bytes(req), &card->df_bytes);
+}
+
+#else /* !CONFIG_MMC_DEVFREQ */
+
+/* No-op stubs so callers need no #ifdefs when devfreq support is off */
+static inline void mmc_devfreq_init(struct mmc_card *card) { }
+static inline void mmc_devfreq_exit(struct mmc_card *card) { }
+static inline void mmc_devfreq_account_req(struct mmc_card *card,
+					   struct request *req) { }
+
+#endif /* CONFIG_MMC_DEVFREQ */
+
static int
mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
@@ -2038,6 +2276,10 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
mmc_blk_issue_rw_rq(mq, NULL);
ret = mmc_blk_issue_flush(mq, req);
} else {
+ if (req) {
+ mmc_devfreq_account_req(mq->card, req);
+			/* TODO: should special (non-data) requests be accounted too? */
+ }
if (!req && host->areq) {
spin_lock_irqsave(&host->context_info.lock, flags);
host->context_info.is_waiting_last_req = true;
@@ -2469,6 +2711,9 @@ static int mmc_blk_probe(struct device *dev)
pm_runtime_enable(&card->dev);
}
+ if (card->host->caps2 & MMC_CAP2_FREQ_SCALING)
+ mmc_devfreq_init(card);
+
return 0;
out:
@@ -2482,6 +2727,8 @@ static int mmc_blk_remove(struct device *dev)
struct mmc_card *card = mmc_dev_to_card(dev);
struct mmc_blk_data *md = dev_get_drvdata(dev);
+ /* FIXME: exit devfreq in mmc_detect_card_removed? */
+ mmc_devfreq_exit(card);
mmc_blk_remove_parts(card, md);
pm_runtime_get_sync(&card->dev);
mmc_claim_host(card->host);
@@ -11,3 +11,19 @@ config MMC_CLKGATE
support handling this in order for it to be of any use.
If unsure, say N.
+
+config MMC_DEVFREQ
+ bool "MMC host clock frequency scaling for block devices"
+ depends on !MMC_CLKGATE
+ depends on PM_DEVFREQ
+ select DEVFREQ_GOV_SIMPLE_ONDEMAND
+ select PM_OPP
+ help
+ This will add dynamic frequency scaling of MMC host clock
+ depending on current utilization of MMC. The utilization is
+ calculated as number of bytes queued for transfer (both from
+ and to MMC card). This should reduce energy consumption when
+ MMC is not heavily used. On high loads this shouldn't decrease
+ performance. Only block devices are supported.
+
+ If unsure, say N.
@@ -38,7 +38,6 @@ struct device_node *mmc_of_find_child_device(struct mmc_host *host,
void mmc_init_erase(struct mmc_card *card);
void mmc_set_chip_select(struct mmc_host *host, int mode);
-void mmc_set_clock(struct mmc_host *host, unsigned int hz);
void mmc_gate_clock(struct mmc_host *host);
void mmc_ungate_clock(struct mmc_host *host);
void mmc_set_ungated(struct mmc_host *host);
@@ -439,6 +439,8 @@ int mmc_of_parse(struct mmc_host *host)
host->caps2 |= MMC_CAP2_HS400_1_8V | MMC_CAP2_HS200_1_8V_SDR;
if (of_find_property(np, "mmc-hs400-1_2v", &len))
host->caps2 |= MMC_CAP2_HS400_1_2V | MMC_CAP2_HS200_1_2V_SDR;
+ if (of_find_property(np, "frequency-scaling", &len))
+ host->caps2 |= MMC_CAP2_FREQ_SCALING;
host->dsr_req = !of_property_read_u32(np, "dsr", &host->dsr);
if (host->dsr_req && (host->dsr & ~0xffff)) {
@@ -309,6 +309,14 @@ struct mmc_card {
struct dentry *debugfs_root;
struct mmc_part part[MMC_NUM_PHY_PARTITION]; /* physical partitions */
unsigned int nr_parts;
+
+#ifdef CONFIG_MMC_DEVFREQ
+ struct devfreq *devfreq;
+ atomic_t df_bytes;
+ ktime_t df_poll_time;
+ u64 df_ceil_bps;
+ unsigned int df_num_freq;
+#endif
};
/*
@@ -289,6 +289,7 @@ struct mmc_host {
MMC_CAP2_HS400_1_2V)
#define MMC_CAP2_HSX00_1_2V (MMC_CAP2_HS200_1_2V_SDR | MMC_CAP2_HS400_1_2V)
#define MMC_CAP2_SDIO_IRQ_NOTHREAD (1 << 17)
+#define MMC_CAP2_FREQ_SCALING (1 << 18) /* Supports MMC_DEVFREQ */
mmc_pm_flag_t pm_caps; /* supported pm features */
@@ -489,6 +490,8 @@ static inline unsigned int mmc_host_clk_rate(struct mmc_host *host)
return host->ios.clock;
}
#endif
+/* FIXME: should it be exported? */
+void mmc_set_clock(struct mmc_host *host, unsigned int hz);
static inline int mmc_card_hs(struct mmc_card *card)
{