diff mbox

[4/7] Protectable Memory

Message ID 20180223144807.1180-5-igor.stoppa@huawei.com (mailing list archive)
State New, archived
Headers show

Commit Message

Igor Stoppa Feb. 23, 2018, 2:48 p.m. UTC
The MMU available in many systems running Linux can often provide R/O
protection to the memory pages it handles.

However, the MMU-based protection works efficiently only when said pages
contain exclusively data that will not need further modifications.

Statically allocated variables can be segregated into a dedicated
section, but this does not sit very well with dynamically allocated
ones.

Dynamic allocation does not provide, currently, any means for grouping
variables in memory pages that would contain exclusively data suitable
for conversion to read only access mode.

The allocator here provided (pmalloc - protectable memory allocator)
introduces the concept of pools of protectable memory.

A module can request a pool and then refer any allocation request to the
pool handler it has received.

Once all the chunks of memory associated to a specific pool are
initialized, the pool can be protected.

After this point, the pool can only be destroyed (it is up to the module
to avoid any further references to the memory from the pool, after
the destruction is invoked).

The latter case is mainly meant for releasing memory, when a module is
unloaded.

A module can have as many pools as needed, for example to support the
protection of data that is initialized in sufficiently distinct phases.

Since pmalloc memory is obtained from vmalloc, an attacker that has
gained access to the physical mapping, still has to identify where the
target of the attack is actually located.

At the same time, being also based on genalloc, pmalloc does not
generate as much trashing of the TLB as it would be caused by using
directly only vmalloc.

Signed-off-by: Igor Stoppa <igor.stoppa@huawei.com>
---
 include/linux/genalloc.h |   3 +
 include/linux/pmalloc.h  | 242 +++++++++++++++++++++++
 include/linux/vmalloc.h  |   1 +
 lib/genalloc.c           |  27 +++
 mm/Kconfig               |   6 +
 mm/Makefile              |   1 +
 mm/pmalloc.c             | 499 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/usercopy.c            |  33 ++++
 8 files changed, 812 insertions(+)
 create mode 100644 include/linux/pmalloc.h
 create mode 100644 mm/pmalloc.c

Comments

Jay Freyensee Feb. 24, 2018, 12:10 a.m. UTC | #1
On 2/23/18 6:48 AM, Igor Stoppa wrote:

> The MMU available in many systems running Linux can often provide R/O
> protection to the memory pages it handles.
>
> However, the MMU-based protection works efficiently only when said pages
> contain exclusively data that will not need further modifications.
>
> Statically allocated variables can be segregated into a dedicated
> section, but this does not sit very well with dynamically allocated
> ones.
>
> Dynamic allocation does not provide, currently, any means for grouping
> variables in memory pages that would contain exclusively data suitable
> for conversion to read only access mode.
>
> The allocator here provided (pmalloc - protectable memory allocator)
> introduces the concept of pools of protectable memory.
>
> A module can request a pool and then refer any allocation request to the
> pool handler it has received.
>
> Once all the chunks of memory associated to a specific pool are
> initialized, the pool can be protected.
>
> After this point, the pool can only be destroyed (it is up to the module
> to avoid any further references to the memory from the pool, after
> the destruction is invoked).
>
> The latter case is mainly meant for releasing memory, when a module is
> unloaded.
>
> A module can have as many pools as needed, for example to support the
> protection of data that is initialized in sufficiently distinct phases.
>
> Since pmalloc memory is obtained from vmalloc, an attacker that has
> gained access to the physical mapping, still has to identify where the
> target of the attack is actually located.
>
> At the same time, being also based on genalloc, pmalloc does not
> generate as much trashing of the TLB as it would be caused by using
> directly only vmalloc.
>
> Signed-off-by: Igor Stoppa <igor.stoppa@huawei.com>
> ---
>   include/linux/genalloc.h |   3 +
>   include/linux/pmalloc.h  | 242 +++++++++++++++++++++++
>   include/linux/vmalloc.h  |   1 +
>   lib/genalloc.c           |  27 +++
>   mm/Kconfig               |   6 +
>   mm/Makefile              |   1 +
>   mm/pmalloc.c             | 499 +++++++++++++++++++++++++++++++++++++++++++++++
>   mm/usercopy.c            |  33 ++++
>   8 files changed, 812 insertions(+)
>   create mode 100644 include/linux/pmalloc.h
>   create mode 100644 mm/pmalloc.c
>
> diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
> index dcaa33e74b1c..b6c4cea9fbd8 100644
> --- a/include/linux/genalloc.h
> +++ b/include/linux/genalloc.h
> @@ -121,6 +121,9 @@ extern unsigned long gen_pool_alloc_algo(struct gen_pool *, size_t,
>   extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size,
>   		dma_addr_t *dma);
>   extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
> +
> +extern void gen_pool_flush_chunk(struct gen_pool *pool,
> +				 struct gen_pool_chunk *chunk);
>   extern void gen_pool_for_each_chunk(struct gen_pool *,
>   	void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *);
>   extern size_t gen_pool_avail(struct gen_pool *);
> diff --git a/include/linux/pmalloc.h b/include/linux/pmalloc.h
> new file mode 100644
> index 000000000000..afc2068d5545
> --- /dev/null
> +++ b/include/linux/pmalloc.h
> @@ -0,0 +1,242 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * pmalloc.h: Header for Protectable Memory Allocator
> + *
> + * (C) Copyright 2017 Huawei Technologies Co. Ltd.
> + * Author: Igor Stoppa <igor.stoppa@huawei.com>
> + */
> +
> +#ifndef _LINUX_PMALLOC_H
> +#define _LINUX_PMALLOC_H
> +
> +
> +#include <linux/genalloc.h>
> +#include <linux/string.h>
> +
> +#define PMALLOC_DEFAULT_ALLOC_ORDER (-1)
> +
> +/*
> + * Library for dynamic allocation of pools of memory that can be,
> + * after initialization, marked as read-only.
> + *
> + * This is intended to complement __read_only_after_init, for those cases
> + * where either it is not possible to know the initialization value before
> + * init is completed, or the amount of data is variable and can be
> + * determined only at run-time.
> + *
> + * ***WARNING***
> + * The user of the API is expected to synchronize:
> + * 1) allocation,
> + * 2) writes to the allocated memory,
> + * 3) write protection of the pool,
> + * 4) freeing of the allocated memory, and
> + * 5) destruction of the pool.
> + *
> + * For a non-threaded scenario, this type of locking is not even required.
> + *
> + * Even if the library were to provide support for locking, point 2)
> + * would still depend on the user taking the lock.
> + */
> +
> +
> +/**
> + * pmalloc_create_pool() - create a new protectable memory pool
> + * @name: the name of the pool, enforced to be unique
> + * @min_alloc_order: log2 of the minimum allocation size obtainable
> + *                   from the pool
> + *
> + * Creates a new (empty) memory pool for allocation of protectable
> + * memory. Memory will be allocated upon request (through pmalloc).
> + *
> + * Return:
> + * * pointer to the new pool	- success
> + * * NULL			- error
> + */
> +struct gen_pool *pmalloc_create_pool(const char *name,
> +					 int min_alloc_order);

Same comments as earlier.  If this is new API with new code being 
introduced into the kernel, can the variables be declared to avoid weird 
problems?  Like min_alloc_order being a negative value makes little 
sense (based on the description here), so can it be declared as size_t 
or unsigned int?
> +
> +/**
> + * is_pmalloc_object() - validates the existence of an alleged object
> + * @ptr: address of the object
> + * @n: size of the object, in bytes
> + *
> + * Return:
> + * * 0		- the object does not belong to pmalloc
> + * * 1		- the object belongs to pmalloc
> + * * \-1	- the object overlaps pmalloc memory incorrectly
> + */
> +int is_pmalloc_object(const void *ptr, const unsigned long n);
> +
> +/**
> + * pmalloc_prealloc() - tries to allocate a memory chunk of the requested size
> + * @pool: handle to the pool to be used for memory allocation
> + * @size: amount of memory (in bytes) requested
> + *
> + * Prepares a chunk of the requested size.
> + * This is intended to both minimize latency in later memory requests and
> + * avoid sleeping during allocation.
> + * Memory allocated with prealloc is stored in one single chunk, as
> + * opposed to what is allocated on-demand when pmalloc runs out of free
> + * space already existing in the pool and has to invoke vmalloc.
> + * One additional advantage of pre-allocating larger chunks of memory is
> + * that the total slack tends to be smaller.
> + *
> + * Return:
> + * * true	- the vmalloc call was successful
> + * * false	- error
> + */
> +bool pmalloc_prealloc(struct gen_pool *pool, size_t size);
> +
> +/**
> + * pmalloc() - allocate protectable memory from a pool
> + * @pool: handle to the pool to be used for memory allocation
> + * @size: amount of memory (in bytes) requested
> + * @gfp: flags for page allocation
> + *
> + * Allocates memory from an unprotected pool. If the pool doesn't have
> + * enough memory, and the request did not include GFP_ATOMIC, an attempt
> + * is made to add a new chunk of memory to the pool
> + * (a multiple of PAGE_SIZE), in order to fit the new request.
> + * Otherwise, NULL is returned.
> + *
> + * Return:
> + * * pointer to the memory requested	- success
> + * * NULL				- either no memory available or
> + *					  pool already read-only
> + */

I don't know if an errno value is being set, but setting a variable 
somewhere using EROFS or ENOMEM would more accurate diagnose those two 
NULL conditions.
> +void *pmalloc(struct gen_pool *pool, size_t size, gfp_t gfp);
> +
> +
> +/**
> + * pzalloc() - zero-initialized version of pmalloc
> + * @pool: handle to the pool to be used for memory allocation
> + * @size: amount of memory (in bytes) requested
> + * @gfp: flags for page allocation
> + *
> + * Executes pmalloc, initializing the memory requested to 0,
> + * before returning the pointer to it.
> + *
> + * Return:
> + * * pointer to the memory requested	- success
> + * * NULL				- either no memory available or
> + *					  pool already read-only
> + */
Same comment here, though that inline function below looks highly 
optimized...
> +static inline void *pzalloc(struct gen_pool *pool, size_t size, gfp_t gfp)
> +{
> +	return pmalloc(pool, size, gfp | __GFP_ZERO);
> +}
> +
> +/**
> + * pmalloc_array() - allocates an array according to the parameters
> + * @pool: handle to the pool to be used for memory allocation
> + * @n: number of elements in the array
> + * @size: amount of memory (in bytes) requested for each element
> + * @flags: flags for page allocation
> + *
> + * Executes pmalloc, if it has a chance to succeed.
> + *
> + * Return:
> + * * the pmalloc result	- success
> + * * NULL		- error
> + */
> +static inline void *pmalloc_array(struct gen_pool *pool, size_t n,
> +				  size_t size, gfp_t flags)
> +{
> +	if (unlikely(!(pool && n && size)))
Has this code been run through sparse?  I know one thing sparse looks at 
is if NULL is being treated like a 0, and sparse does check cases when 0 
is being used in place for NULL for pointer checks, and I'm wondering if 
that line of code would pass.

> +		return NULL;
> +	return pmalloc(pool, n * size, flags);
> +}
> +
> +/**
> + * pcalloc() - allocates a 0-initialized array according to the parameters
> + * @pool: handle to the pool to be used for memory allocation
> + * @n: number of elements in the array
> + * @size: amount of memory (in bytes) requested
> + * @flags: flags for page allocation
> + *
> + * Executes pmalloc_array, if it has a chance to succeed.
> + *
> + * Return:
> + * * the pmalloc result	- success
> + * * NULL		- error
> + */
> +static inline void *pcalloc(struct gen_pool *pool, size_t n,
> +			    size_t size, gfp_t flags)
> +{
> +	return pmalloc_array(pool, n, size, flags | __GFP_ZERO);
> +}
> +
> +/**
> + * pstrdup() - duplicate a string, using pmalloc as allocator
> + * @pool: handle to the pool to be used for memory allocation
> + * @s: string to duplicate
> + * @gfp: flags for page allocation
> + *
> + * Generates a copy of the given string, allocating sufficient memory
> + * from the given pmalloc pool.
> + *
> + * Return:
> + * * pointer to the replica	- success
> + * * NULL			- error
> + */
> +static inline char *pstrdup(struct gen_pool *pool, const char *s, gfp_t gfp)
> +{
> +	size_t len;
> +	char *buf;
> +
> +	if (unlikely(pool == NULL || s == NULL))
Here, the right check is being done, so at the very least, I would make 
the last line I commented on the same as this one for code continuity.
> +		return NULL;
> +
> +	len = strlen(s) + 1;
> +	buf = pmalloc(pool, len, gfp);
> +	if (likely(buf))
> +		strncpy(buf, s, len);
> +	return buf;
> +}
> +
> +/**
> + * pmalloc_protect_pool() - turn a read/write pool read-only
> + * @pool: the pool to protect
> + *
> + * Write-protects all the memory chunks assigned to the pool.
> + * This prevents any further allocation.
> + *
> + * Return:
> + * * 0		- success
> + * * -EINVAL	- error
> + */
> +int pmalloc_protect_pool(struct gen_pool *pool);
> +
> +/**
> + * pfree() - mark as unused memory that was previously in use
> + * @pool: handle to the pool to be used for memory allocation
> + * @addr: the beginning of the memory area to be freed
> + *
> + * The behavior of pfree is different, depending on the state of the
> + * protection.
> + * If the pool is not yet protected, the memory is marked as unused and
> + * will be available for further allocations.
> + * If the pool is already protected, the memory is marked as unused, but
> + * it will still be impossible to perform further allocation, because of
> + * the existing protection.
> + * The freed memory, in this case, will be truly released only when the
> + * pool is destroyed.
> + */
> +static inline void pfree(struct gen_pool *pool, const void *addr)
> +{
> +	gen_pool_free(pool, (unsigned long)addr, 0);
> +}
> +
> +/**
> + * pmalloc_destroy_pool() - destroys a pool and all the associated memory
> + * @pool: the pool to destroy
> + *
> + * All the memory that was allocated through pmalloc in the pool will be freed.
> + *
> + * Return:
> + * * 0		- success
> + * * -EINVAL	- error
> + */
> +int pmalloc_destroy_pool(struct gen_pool *pool);
> +
> +#endif
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 1e5d8c392f15..116d280cca53 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -20,6 +20,7 @@ struct notifier_block;		/* in notifier.h */
>   #define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
>   #define VM_NO_GUARD		0x00000040      /* don't add guard page */
>   #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
> +#define VM_PMALLOC		0x00000100	/* pmalloc area - see docs */
>   /* bits [20..32] reserved for arch specific ioremap internals */
>   
>   /*
> diff --git a/lib/genalloc.c b/lib/genalloc.c
> index 87f62f31b52f..24ed35035095 100644
> --- a/lib/genalloc.c
> +++ b/lib/genalloc.c
> @@ -625,6 +625,33 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
>   }
>   EXPORT_SYMBOL(gen_pool_free);
>   
> +
> +/**
> + * gen_pool_flush_chunk() - drops all the allocations from a specific chunk
> + * @pool:	the generic memory pool
> + * @chunk:	The chunk to wipe clear.
> + *
> + * This is meant to be called only while destroying a pool. It's up to the
> + * caller to avoid races, but really, at this point the pool should have
> + * already been retired and it should have become unavailable for any other
> + * sort of operation.
> + */
> +void gen_pool_flush_chunk(struct gen_pool *pool,
> +			  struct gen_pool_chunk *chunk)
> +{
> +	size_t size;
> +
> +	if (unlikely(!(pool && chunk)))

Please make this check the same as the last line I commented on, 
especially since it's the same struct being checked.

> +		return;
> +
> +	size = chunk->end_addr + 1 - chunk->start_addr;
> +	memset(chunk->entries, 0,
> +	       DIV_ROUND_UP(size >> pool->min_alloc_order * BITS_PER_ENTRY,
> +			    BITS_PER_BYTE));
> +	atomic_long_set(&chunk->avail, size);
> +}
> +
> +
>   /**
>    * gen_pool_for_each_chunk() - call func for every chunk of generic memory pool
>    * @pool:	the generic memory pool
> diff --git a/mm/Kconfig b/mm/Kconfig
> index c782e8fb7235..be578fbdce6d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -760,3 +760,9 @@ config GUP_BENCHMARK
>   	  performance of get_user_pages_fast().
>   
>   	  See tools/testing/selftests/vm/gup_benchmark.c
> +
> +config PROTECTABLE_MEMORY
> +    bool
> +    depends on ARCH_HAS_SET_MEMORY
> +    select GENERIC_ALLOCATOR
> +    default y
> diff --git a/mm/Makefile b/mm/Makefile
> index e669f02c5a54..959fdbdac118 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -65,6 +65,7 @@ obj-$(CONFIG_SPARSEMEM)	+= sparse.o
>   obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
>   obj-$(CONFIG_SLOB) += slob.o
>   obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> +obj-$(CONFIG_PROTECTABLE_MEMORY) += pmalloc.o
>   obj-$(CONFIG_KSM) += ksm.o
>   obj-$(CONFIG_PAGE_POISONING) += page_poison.o
>   obj-$(CONFIG_SLAB) += slab.o
> diff --git a/mm/pmalloc.c b/mm/pmalloc.c
> new file mode 100644
> index 000000000000..abddba90a9f6
> --- /dev/null
> +++ b/mm/pmalloc.c
> @@ -0,0 +1,499 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * pmalloc.c: Protectable Memory Allocator
> + *
> + * (C) Copyright 2017 Huawei Technologies Co. Ltd.
> + * Author: Igor Stoppa <igor.stoppa@huawei.com>
> + */
> +
> +#include <linux/printk.h>
> +#include <linux/init.h>
> +#include <linux/mm.h>
> +#include <linux/vmalloc.h>
> +#include <linux/genalloc.h>
> +#include <linux/kernel.h>
> +#include <linux/log2.h>
> +#include <linux/slab.h>
> +#include <linux/device.h>
> +#include <linux/atomic.h>
> +#include <linux/rculist.h>
> +#include <linux/set_memory.h>
> +#include <asm/cacheflush.h>
> +#include <asm/page.h>
> +
> +#include <linux/pmalloc.h>
> +/*
> + * pmalloc_data contains the data specific to a pmalloc pool,
> + * in a format compatible with the design of gen_alloc.
> + * Some of the fields are used for exposing the corresponding parameter
> + * to userspace, through sysfs.
> + */
> +struct pmalloc_data {
> +	struct gen_pool *pool;  /* Link back to the associated pool. */
> +	bool protected;     /* Status of the pool: RO or RW. */
> +	struct kobj_attribute attr_protected; /* Sysfs attribute. */
> +	struct kobj_attribute attr_avail;     /* Sysfs attribute. */
> +	struct kobj_attribute attr_size;      /* Sysfs attribute. */
> +	struct kobj_attribute attr_chunks;    /* Sysfs attribute. */
> +	struct kobject *pool_kobject;
> +	struct list_head node; /* list of pools */
> +};
> +
> +static LIST_HEAD(pmalloc_final_list);
> +static LIST_HEAD(pmalloc_tmp_list);
> +static struct list_head *pmalloc_list = &pmalloc_tmp_list;
> +static DEFINE_MUTEX(pmalloc_mutex);
> +static struct kobject *pmalloc_kobject;
> +
> +static ssize_t pmalloc_pool_show_protected(struct kobject *dev,
> +					   struct kobj_attribute *attr,
> +					   char *buf)
> +{
> +	struct pmalloc_data *data;
> +
> +	data = container_of(attr, struct pmalloc_data, attr_protected);
> +	if (data->protected)
> +		return sprintf(buf, "protected\n");
> +	else
> +		return sprintf(buf, "unprotected\n");
> +}
> +
> +static ssize_t pmalloc_pool_show_avail(struct kobject *dev,
> +				       struct kobj_attribute *attr,
> +				       char *buf)
> +{
> +	struct pmalloc_data *data;
> +
> +	data = container_of(attr, struct pmalloc_data, attr_avail);
> +	return sprintf(buf, "%lu\n",
> +		       (unsigned long)gen_pool_avail(data->pool));
> +}
> +
> +static ssize_t pmalloc_pool_show_size(struct kobject *dev,
> +				      struct kobj_attribute *attr,
> +				      char *buf)
> +{
> +	struct pmalloc_data *data;
> +
> +	data = container_of(attr, struct pmalloc_data, attr_size);
> +	return sprintf(buf, "%lu\n",
> +		       (unsigned long)gen_pool_size(data->pool));
> +}
> +
> +static void pool_chunk_number(struct gen_pool *pool,
> +			      struct gen_pool_chunk *chunk, void *data)
> +{
> +	unsigned long *counter = data;
> +
> +	(*counter)++;
> +}
> +
> +static ssize_t pmalloc_pool_show_chunks(struct kobject *dev,
> +					struct kobj_attribute *attr,
> +					char *buf)
> +{
> +	struct pmalloc_data *data;
> +	unsigned long chunks_num = 0;
> +
> +	data = container_of(attr, struct pmalloc_data, attr_chunks);
> +	gen_pool_for_each_chunk(data->pool, pool_chunk_number, &chunks_num);
> +	return sprintf(buf, "%lu\n", chunks_num);
> +}
> +
> +/* Exposes the pool and its attributes through sysfs. */
> +static struct kobject *pmalloc_connect(struct pmalloc_data *data)
> +{
> +	const struct attribute *attrs[] = {
> +		&data->attr_protected.attr,
> +		&data->attr_avail.attr,
> +		&data->attr_size.attr,
> +		&data->attr_chunks.attr,
> +		NULL
> +	};
> +	struct kobject *kobj;
> +
> +	kobj = kobject_create_and_add(data->pool->name, pmalloc_kobject);
> +	if (unlikely(!kobj))
> +		return NULL;
> +
> +	if (unlikely(sysfs_create_files(kobj, attrs) < 0)) {
> +		kobject_put(kobj);
> +		kobj = NULL;
> +	}
> +	return kobj;
> +}
> +
> +/* Removes the pool and its attributes from sysfs. */
> +static void pmalloc_disconnect(struct pmalloc_data *data,
> +			       struct kobject *kobj)
> +{
> +	const struct attribute *attrs[] = {
> +		&data->attr_protected.attr,
> +		&data->attr_avail.attr,
> +		&data->attr_size.attr,
> +		&data->attr_chunks.attr,
> +		NULL
> +	};
> +
> +	sysfs_remove_files(kobj, attrs);
> +	kobject_put(kobj);
> +}
> +
> +/* Declares an attribute of the pool. */
> +#define pmalloc_attr_init(data, attr_name) \
> +do { \
> +	sysfs_attr_init(&data->attr_##attr_name.attr); \
> +	data->attr_##attr_name.attr.name = #attr_name; \
> +	data->attr_##attr_name.attr.mode = VERIFY_OCTAL_PERMISSIONS(0400); \
> +	data->attr_##attr_name.show = pmalloc_pool_show_##attr_name; \
> +} while (0)
> +
> +struct gen_pool *pmalloc_create_pool(const char *name, int min_alloc_order)
> +{
> +	struct gen_pool *pool;
> +	const char *pool_name;
> +	struct pmalloc_data *data;
> +
> +	if (!name) {
> +		WARN_ON(1);
??  Maybe the name check should be in WARN_ON()?
> +		return NULL;
> +	}
> +
> +	if (min_alloc_order < 0)
> +		min_alloc_order = ilog2(sizeof(unsigned long));
> +
> +	pool = gen_pool_create(min_alloc_order, NUMA_NO_NODE);
> +	if (unlikely(!pool))
> +		return NULL;
> +
> +	mutex_lock(&pmalloc_mutex);
> +	list_for_each_entry(data, pmalloc_list, node)
> +		if (!strcmp(name, data->pool->name))
> +			goto same_name_err;
> +
> +	pool_name = kstrdup(name, GFP_KERNEL);
> +	if (unlikely(!pool_name))
> +		goto name_alloc_err;
> +
> +	data = kzalloc(sizeof(struct pmalloc_data), GFP_KERNEL);
> +	if (unlikely(!data))
> +		goto data_alloc_err;
> +
> +	data->protected = false;
> +	data->pool = pool;
> +	pmalloc_attr_init(data, protected);
> +	pmalloc_attr_init(data, avail);
> +	pmalloc_attr_init(data, size);
> +	pmalloc_attr_init(data, chunks);
> +	pool->data = data;
> +	pool->name = pool_name;
> +
> +	list_add(&data->node, pmalloc_list);
> +	if (pmalloc_list == &pmalloc_final_list)
> +		data->pool_kobject = pmalloc_connect(data);
> +	mutex_unlock(&pmalloc_mutex);
> +	return pool;
> +
> +data_alloc_err:
> +	kfree(pool_name);
> +name_alloc_err:
> +same_name_err:
> +	mutex_unlock(&pmalloc_mutex);
> +	gen_pool_destroy(pool);
> +	return NULL;
> +}
> +
> +static inline int check_alloc_params(struct gen_pool *pool, size_t req_size)
> +{
> +	struct pmalloc_data *data;
> +
> +	if (unlikely(!req_size || !pool))
same unlikely() check problem mentioned before.
> +		return -1;
Can we use an errno value instead for better diagnosibility?
> +
> +	data = pool->data;
> +
> +	if (data == NULL)
> +		return -1;
Same here (ENOMEM or ENXIO comes to mind).
> +
> +	if (unlikely(data->protected)) {
> +		WARN_ON(1);
Maybe re-write this with the check inside WARN_ON()?
> +		return -1;

Same here, how about a different errno value for this case?
> +	}
> +	return 0;
> +}
> +
> +
> +static inline bool chunk_tagging(void *chunk, bool tag)
> +{
> +	struct vm_struct *area;
> +	struct page *page;
> +
> +	if (!is_vmalloc_addr(chunk))
> +		return false;
> +
> +	page = vmalloc_to_page(chunk);
> +	if (unlikely(!page))
> +		return false;
> +
> +	area = page->area;
> +	if (tag)
> +		area->flags |= VM_PMALLOC;
> +	else
> +		area->flags &= ~VM_PMALLOC;
> +	return true;
> +}
> +
> +
> +static inline bool tag_chunk(void *chunk)
> +{
> +	return chunk_tagging(chunk, true);
> +}
> +
> +
> +static inline bool untag_chunk(void *chunk)
> +{
> +	return chunk_tagging(chunk, false);
> +}
> +
> +enum {
> +	INVALID_PMALLOC_OBJECT = -1,
> +	NOT_PMALLOC_OBJECT = 0,
> +	VALID_PMALLOC_OBJECT = 1,
> +};
> +
> +int is_pmalloc_object(const void *ptr, const unsigned long n)
> +{
> +	struct vm_struct *area;
> +	struct page *page;
> +	unsigned long area_start;
> +	unsigned long area_end;
> +	unsigned long object_start;
> +	unsigned long object_end;
> +
> +
> +	/*
> +	 * is_pmalloc_object gets called pretty late, so chances are high
> +	 * that the object is indeed of vmalloc type
> +	 */
> +	if (unlikely(!is_vmalloc_addr(ptr)))
> +		return NOT_PMALLOC_OBJECT;
> +
> +	page = vmalloc_to_page(ptr);
> +	if (unlikely(!page))
> +		return NOT_PMALLOC_OBJECT;
> +
> +	area = page->area;
> +
> +	if (likely(!(area->flags & VM_PMALLOC)))
> +		return NOT_PMALLOC_OBJECT;
> +
> +	area_start = (unsigned long)area->addr;
> +	area_end = area_start + area->nr_pages * PAGE_SIZE - 1;
> +	object_start = (unsigned long)ptr;
> +	object_end = object_start + n - 1;
> +
> +	if (likely((area_start <= object_start) &&
> +		   (object_end <= area_end)))
> +		return VALID_PMALLOC_OBJECT;
> +	else
> +		return INVALID_PMALLOC_OBJECT;
> +}
> +
> +
> +bool pmalloc_prealloc(struct gen_pool *pool, size_t size)
> +{
> +	void *chunk;
> +	size_t chunk_size;
> +	bool add_error;
> +
> +	if (check_alloc_params(pool, size))
> +		return false;
> +
> +	/* Expand pool */
> +	chunk_size = roundup(size, PAGE_SIZE);
> +	chunk = vmalloc(chunk_size);
> +	if (unlikely(chunk == NULL))
> +		return false;
> +
> +	/* Locking is already done inside gen_pool_add */
> +	add_error = gen_pool_add(pool, (unsigned long)chunk, chunk_size,
> +				 NUMA_NO_NODE);
> +	if (unlikely(add_error != 0))
> +		goto abort;
> +
> +	return true;
> +abort:
> +	vfree_atomic(chunk);
> +	return false;
> +
> +}
> +
> +void *pmalloc(struct gen_pool *pool, size_t size, gfp_t gfp)
> +{
> +	void *chunk;
> +	size_t chunk_size;
> +	bool add_error;
> +	unsigned long retval;
> +
> +	if (check_alloc_params(pool, size))
> +		return NULL;
> +
> +retry_alloc_from_pool:
> +	retval = gen_pool_alloc(pool, size);
> +	if (retval)
> +		goto return_allocation;
> +
> +	if (unlikely((gfp & __GFP_ATOMIC))) {
> +		if (unlikely((gfp & __GFP_NOFAIL)))
> +			goto retry_alloc_from_pool;
> +		else
> +			return NULL;
> +	}
> +
> +	/* Expand pool */
> +	chunk_size = roundup(size, PAGE_SIZE);
> +	chunk = vmalloc(chunk_size);
> +	if (unlikely(!chunk)) {
> +		if (unlikely((gfp & __GFP_NOFAIL)))
> +			goto retry_alloc_from_pool;
> +		else
> +			return NULL;
> +	}
> +	if (unlikely(!tag_chunk(chunk)))
> +		goto free;
> +
> +	/* Locking is already done inside gen_pool_add */
> +	add_error = gen_pool_add(pool, (unsigned long)chunk, chunk_size,
> +				 NUMA_NO_NODE);
> +	if (unlikely(add_error))
> +		goto abort;
> +
> +	retval = gen_pool_alloc(pool, size);
> +	if (retval) {
> +return_allocation:
> +		*(size_t *)retval = size;
> +		if (gfp & __GFP_ZERO)
> +			memset((void *)retval, 0, size);
> +		return (void *)retval;
> +	}
> +	/*
> +	 * Here there is no test for __GFP_NO_FAIL because, in case of
> +	 * concurrent allocation, one thread might add a chunk to the
> +	 * pool and this memory could be allocated by another thread,
> +	 * before the first thread gets a chance to use it.
> +	 * As long as vmalloc succeeds, it's ok to retry.
> +	 */
> +	goto retry_alloc_from_pool;
> +abort:
> +	untag_chunk(chunk);
> +free:
> +	vfree_atomic(chunk);
> +	return NULL;
> +}
> +
> +static void pmalloc_chunk_set_protection(struct gen_pool *pool,
> +
> +					 struct gen_pool_chunk *chunk,
> +					 void *data)
> +{
> +	const bool *flag = data;
> +	size_t chunk_size = chunk->end_addr + 1 - chunk->start_addr;
> +	unsigned long pages = chunk_size / PAGE_SIZE;
> +
> +	BUG_ON(chunk_size & (PAGE_SIZE - 1));
Re-think WARN_ON() for BUG_ON()?  And also check chunk as well, as it's 
being used below?
> +
> +	if (*flag)
> +		set_memory_ro(chunk->start_addr, pages);
> +	else
> +		set_memory_rw(chunk->start_addr, pages);
> +}
> +
> +static int pmalloc_pool_set_protection(struct gen_pool *pool, bool protection)
> +{
> +	struct pmalloc_data *data;
> +	struct gen_pool_chunk *chunk;
> +
> +	if (unlikely(!pool))
> +		return -EINVAL;
This is example of what I'd perfer seeing in check_alloc_params().
> +
> +	data = pool->data;
> +
> +	if (unlikely(!data))
> +		return -EINVAL;
ENXIO or EIO or ENOMEM sound better?
> +
> +	if (unlikely(data->protected == protection)) {
> +		WARN_ON(1);
Better to put the check inside WARN_ON, me thinks...
> +		return 0;
> +	}
> +
> +	data->protected = protection;
> +	list_for_each_entry(chunk, &(pool)->chunks, next_chunk)
> +		pmalloc_chunk_set_protection(pool, chunk, &protection);
> +	return 0;
> +}
> +
> +int pmalloc_protect_pool(struct gen_pool *pool)
> +{
> +	return pmalloc_pool_set_protection(pool, true);
Is pool == NULL being checked somewhere, similar to previous functions 
in this patch?
> +}
> +
> +
> +static void pmalloc_chunk_free(struct gen_pool *pool,
> +			       struct gen_pool_chunk *chunk, void *data)
> +{
Wat is 'data' being used for? Looks unused.  Should parameters be 
checked, like other ones?
> +	untag_chunk(chunk);
> +	gen_pool_flush_chunk(pool, chunk);
> +	vfree_atomic((void *)chunk->start_addr);
> +}
> +
> +
> +int pmalloc_destroy_pool(struct gen_pool *pool)
> +{
> +	struct pmalloc_data *data;
> +
> +	if (unlikely(pool == NULL))
> +		return -EINVAL;
> +
> +	data = pool->data;
> +
> +	if (unlikely(data == NULL))
> +		return -EINVAL;

I'd use a different errno value since you already used it for pool.
> +
> +	mutex_lock(&pmalloc_mutex);
> +	list_del(&data->node);
> +	mutex_unlock(&pmalloc_mutex);
> +
> +	if (likely(data->pool_kobject))
> +		pmalloc_disconnect(data, data->pool_kobject);
> +
> +	pmalloc_pool_set_protection(pool, false);
> +	gen_pool_for_each_chunk(pool, pmalloc_chunk_free, NULL);
> +	gen_pool_destroy(pool);
> +	kfree(data);
Does data need to be set to NULL in this case, as data is a member of 
pool (pool->data)?  I'm worried about dangling pointer scenarios which 
probably isn't good for security modules?
> +	return 0;
> +}
> +
> +/*
> + * When the sysfs is ready to receive registrations, connect all the
> + * pools previously created. Also enable further pools to be connected
> + * right away.
> + */
> +static int __init pmalloc_late_init(void)
> +{
> +	struct pmalloc_data *data, *n;
> +
> +	pmalloc_kobject = kobject_create_and_add("pmalloc", kernel_kobj);
> +
> +	mutex_lock(&pmalloc_mutex);
> +	pmalloc_list = &pmalloc_final_list;
> +
> +	if (likely(pmalloc_kobject != NULL)) {
> +		list_for_each_entry_safe(data, n, &pmalloc_tmp_list, node) {
> +			list_move(&data->node, &pmalloc_final_list);
> +			pmalloc_connect(data);
> +		}
> +	}
It would be nice to have the init() return an error value in case of 
failure.

Thanks,
Jay
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
kernel test robot Feb. 25, 2018, 2:33 a.m. UTC | #2
Hi Igor,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc2]
[cannot apply to next-20180223]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Igor-Stoppa/genalloc-track-beginning-of-allocations/20180225-081601
config: arm-allnoconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=arm 

All errors (new ones prefixed by >>):

   mm/pmalloc.o: In function `pmalloc_prealloc':
>> pmalloc.c:(.text+0x280): undefined reference to `vfree_atomic'
   mm/pmalloc.o: In function `pmalloc':
   pmalloc.c:(.text+0x2c4): undefined reference to `vfree_atomic'
   mm/pmalloc.o: In function `pmalloc_chunk_free':
   pmalloc.c:(.text+0x9e): undefined reference to `vfree_atomic'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Igor Stoppa Feb. 26, 2018, 2:28 p.m. UTC | #3
On 24/02/18 02:10, J Freyensee wrote:
> On 2/23/18 6:48 AM, Igor Stoppa wrote:

[...]

>> +struct gen_pool *pmalloc_create_pool(const char *name,
>> +					 int min_alloc_order);
> 
> Same comments as earlier.  If this is new API with new code being 
> introduced into the kernel, can the variables be declared to avoid weird 
> problems?  Like min_alloc_order being a negative value makes little 
> sense (based on the description here), so can it be declared as size_t 
> or unsigned int?

in this case, yes, but I see it as different case

[...]

>> + * * NULL				- either no memory available or
>> + *					  pool already read-only
>> + */
> 
> I don't know if an errno value is being set, but setting a variable 
> somewhere using EROFS or ENOMEM would more accurate diagnose those two 
> NULL conditions.

I expect that the latter is highly unlikely to happen, because the user
of the API controls if the pool is locked or not.

I think it shouldn't come as a surprise to the one who locked the pool,
if the pool is locked.

If the pool is used with concurrent users, attention should be paid to
not lock it before ever user is happy (this is where the user of the API
has to provide own locking.)

Since the information if the pool is already protected is actually
present in the pmalloc_data structure associated with the pool, I was
tempted to make it available through the API, but that seemed wrong.

The user of the API should be very aware of the state of the pool, since
the user is the one who sets it. Why would it have to be read back?

>> +void *pmalloc(struct gen_pool *pool, size_t size, gfp_t gfp);
>> +
>> +
>> +/**
>> + * pzalloc() - zero-initialized version of pmalloc
>> + * @pool: handle to the pool to be used for memory allocation
>> + * @size: amount of memory (in bytes) requested
>> + * @gfp: flags for page allocation
>> + *
>> + * Executes pmalloc, initializing the memory requested to 0,
>> + * before returning the pointer to it.
>> + *
>> + * Return:
>> + * * pointer to the memory requested	- success
>> + * * NULL				- either no memory available or
>> + *					  pool already read-only
>> + */
> Same comment here, though that inline function below looks highly 
> optimized...

The same applies here.
I'm not very fond of the idea of returning the status elsewhere and in a
way that is not intrinsically connected with the action that has
determined the change of state.

AFAIK *alloc functions return either the memory requested or NULL.
I wonder how realistic this case is.

[...]

>> +	if (unlikely(!(pool && n && size)))
> Has this code been run through sparse?

I use "make C=1 W=1"

> I know one thing sparse looks at 
> is if NULL is being treated like a 0, and sparse does check cases when 0 
> is being used in place for NULL for pointer checks, and I'm wondering if 
> that line of code would pass.

It's a logical AND: wouldn't NULL translate to false, rather 0?
I can add an explicit check against NULL, it's probably more readable
too, but I don't think that the current construct treats NULL as 0.

[...]

>> +	if (unlikely(pool == NULL || s == NULL))
> Here, the right check is being done, so at the very least, I would make 
> the last line I commented on the same as this one for code continuity.

ok

[...]

>> +	if (unlikely(!(pool && chunk)))
> 
> Please make this check the same as the last line I commented on, 
> especially since it's the same struct being checked.

yes

[...]

>> +	if (!name) {
>> +		WARN_ON(1);
> ??  Maybe the name check should be in WARN_ON()?

true :-(

[...]

>> +	if (unlikely(!req_size || !pool))
> same unlikely() check problem mentioned before.
>> +		return -1;
> Can we use an errno value instead for better diagnosibility?
>> +
>> +	data = pool->data;
>> +
>> +	if (data == NULL)
>> +		return -1;
> Same here (ENOMEM or ENXIO comes to mind).
>> +
>> +	if (unlikely(data->protected)) {
>> +		WARN_ON(1);
> Maybe re-write this with the check inside WARN_ON()?
>> +		return -1;
> 
> Same here, how about a different errno value for this case?

yes, to all of the above

[...]

>> +static void pmalloc_chunk_set_protection(struct gen_pool *pool,
>> +
>> +					 struct gen_pool_chunk *chunk,
>> +					 void *data)
>> +{
>> +	const bool *flag = data;
>> +	size_t chunk_size = chunk->end_addr + 1 - chunk->start_addr;
>> +	unsigned long pages = chunk_size / PAGE_SIZE;
>> +
>> +	BUG_ON(chunk_size & (PAGE_SIZE - 1));
> Re-think WARN_ON() for BUG_ON()?  And also check chunk as well, as it's 
> being used below?

ok

>> +
>> +	if (*flag)
>> +		set_memory_ro(chunk->start_addr, pages);
>> +	else
>> +		set_memory_rw(chunk->start_addr, pages);
>> +}
>> +
>> +static int pmalloc_pool_set_protection(struct gen_pool *pool, bool protection)
>> +{
>> +	struct pmalloc_data *data;
>> +	struct gen_pool_chunk *chunk;
>> +
>> +	if (unlikely(!pool))
>> +		return -EINVAL;
> This is example of what I'd perfer seeing in check_alloc_params().

yes

>> +
>> +	data = pool->data;
>> +
>> +	if (unlikely(!data))
>> +		return -EINVAL;
> ENXIO or EIO or ENOMEM sound better?

Why? At least based on he description from errno-base.h, EINVAL seemed
the most appropriate:

#define	EIO		 5	/* I/O error */
#define	ENXIO		 6	/* No such device or address */
#define	ENOMEM		12	/* Out of memory */

#define	EINVAL		22	/* Invalid argument */

If I was really pressed to change it, I'd rather pick:

#define	EFAULT		14	/* Bad address */


>> +
>> +	if (unlikely(data->protected == protection)) {
>> +		WARN_ON(1);
> Better to put the check inside WARN_ON, me thinks...

yes, I have no idea why I wrote it like that  :-(

>> +		return 0;
>> +	}
>> +
>> +	data->protected = protection;
>> +	list_for_each_entry(chunk, &(pool)->chunks, next_chunk)
>> +		pmalloc_chunk_set_protection(pool, chunk, &protection);
>> +	return 0;
>> +}
>> +
>> +int pmalloc_protect_pool(struct gen_pool *pool)
>> +{
>> +	return pmalloc_pool_set_protection(pool, true);
> Is pool == NULL being checked somewhere, similar to previous functions 
> in this patch?

right.

>> +}
>> +
>> +
>> +static void pmalloc_chunk_free(struct gen_pool *pool,
>> +			       struct gen_pool_chunk *chunk, void *data)
>> +{
> Wat is 'data' being used for? Looks unused.  Should parameters be 
> checked, like other ones?


This is the iterator that is passed to genalloc
genalloc defines the format for the iterator, because it *will* want to
pass the pointer to the opaque data structure.

>> +	untag_chunk(chunk);
>> +	gen_pool_flush_chunk(pool, chunk);
>> +	vfree_atomic((void *)chunk->start_addr);
>> +}
>> +
>> +
>> +int pmalloc_destroy_pool(struct gen_pool *pool)
>> +{
>> +	struct pmalloc_data *data;
>> +
>> +	if (unlikely(pool == NULL))
>> +		return -EINVAL;
>> +
>> +	data = pool->data;
>> +
>> +	if (unlikely(data == NULL))
>> +		return -EINVAL;
> 
> I'd use a different errno value since you already used it for pool.

Thinking more about this, how about collapsing them?
I do need to check for the value of data, before I dereference it,
however the causes are very different:

* wrong pool pointer - definitely possible
* corrupted data structure (data member overwritten) - highly unlikely

>> +
>> +	mutex_lock(&pmalloc_mutex);
>> +	list_del(&data->node);
>> +	mutex_unlock(&pmalloc_mutex);
>> +
>> +	if (likely(data->pool_kobject))
>> +		pmalloc_disconnect(data, data->pool_kobject);
>> +
>> +	pmalloc_pool_set_protection(pool, false);
>> +	gen_pool_for_each_chunk(pool, pmalloc_chunk_free, NULL);
>> +	gen_pool_destroy(pool);
>> +	kfree(data);
> Does data need to be set to NULL in this case, as data is a member of 
> pool (pool->data)?  I'm worried about dangling pointer scenarios which 
> probably isn't good for security modules?

The pool was destroyed in the previous step.
There is nothing left that can be set to NULL.
Unless we are expecting an use-after-free of the pool structure.
But everything it was referring to is gone as well.

If we really want to go after this case, then basically everything that
has been allocated should be poisoned before being freed.

Isn't it a bit too much?

>> +	return 0;
>> +}
>> +
>> +/*
>> + * When the sysfs is ready to receive registrations, connect all the
>> + * pools previously created. Also enable further pools to be connected
>> + * right away.
>> + */
>> +static int __init pmalloc_late_init(void)
>> +{
>> +	struct pmalloc_data *data, *n;
>> +
>> +	pmalloc_kobject = kobject_create_and_add("pmalloc", kernel_kobj);
>> +
>> +	mutex_lock(&pmalloc_mutex);
>> +	pmalloc_list = &pmalloc_final_list;
>> +
>> +	if (likely(pmalloc_kobject != NULL)) {
>> +		list_for_each_entry_safe(data, n, &pmalloc_tmp_list, node) {
>> +			list_move(&data->node, &pmalloc_final_list);
>> +			pmalloc_connect(data);
>> +		}
>> +	}
> It would be nice to have the init() return an error value in case of 
> failure.

ok

--
igor
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jay Freyensee Feb. 26, 2018, 6:25 p.m. UTC | #4
inlined responses.


On 2/26/18 6:28 AM, Igor Stoppa wrote:
>
> On 24/02/18 02:10, J Freyensee wrote:
>> On 2/23/18 6:48 AM, Igor Stoppa wrote:
> [...]
>
>>> +struct gen_pool *pmalloc_create_pool(const char *name,
>>> +					 int min_alloc_order);
>> Same comments as earlier.  If this is new API with new code being
>> introduced into the kernel, can the variables be declared to avoid weird
>> problems?  Like min_alloc_order being a negative value makes little
>> sense (based on the description here), so can it be declared as size_t
>> or unsigned int?
> in this case, yes, but I see it as different case

OK.

>
> [...]
>
>>> + * * NULL				- either no memory available or
>>> + *					  pool already read-only
>>> + */
>> I don't know if an errno value is being set, but setting a variable
>> somewhere using EROFS or ENOMEM would more accurate diagnose those two
>> NULL conditions.
> I expect that the latter is highly unlikely to happen, because the user
> of the API controls if the pool is locked or not.
>
> I think it shouldn't come as a surprise to the one who locked the pool,
> if the pool is locked.
>
> If the pool is used with concurrent users, attention should be paid to
> not lock it before ever user is happy (this is where the user of the API
> has to provide own locking.)
>
> Since the information if the pool is already protected is actually
> present in the pmalloc_data structure associated with the pool, I was
> tempted to make it available through the API, but that seemed wrong.
>
> The user of the API should be very aware of the state of the pool, since
> the user is the one who sets it. Why would it have to be read back?

OK, sounds good :-).

>
>>> +void *pmalloc(struct gen_pool *pool, size_t size, gfp_t gfp);
>>> +
>>> +
>>> +/**
>>> + * pzalloc() - zero-initialized version of pmalloc
>>> + * @pool: handle to the pool to be used for memory allocation
>>> + * @size: amount of memory (in bytes) requested
>>> + * @gfp: flags for page allocation
>>> + *
>>> + * Executes pmalloc, initializing the memory requested to 0,
>>> + * before returning the pointer to it.
>>> + *
>>> + * Return:
>>> + * * pointer to the memory requested	- success
>>> + * * NULL				- either no memory available or
>>> + *					  pool already read-only
>>> + */
>> Same comment here, though that inline function below looks highly
>> optimized...
> The same applies here.
> I'm not very fond of the idea of returning the status elsewhere and in a
> way that is not intrinsically connected with the action that has
> determined the change of state.
>
> AFAIK *alloc functions return either the memory requested or NULL.
> I wonder how realistic this case is.

OK.

>
> [...]
>
>>> +	if (unlikely(!(pool && n && size)))
>> Has this code been run through sparse?
> I use "make C=1 W=1"

OK.

>
>> I know one thing sparse looks at
>> is if NULL is being treated like a 0, and sparse does check cases when 0
>> is being used in place for NULL for pointer checks, and I'm wondering if
>> that line of code would pass.
> It's a logical AND: wouldn't NULL translate to false, rather 0?
> I can add an explicit check against NULL, it's probably more readable
> too, but I don't think that the current construct treats NULL as 0.

If it passes sparse, I think it's fine.

>
> [...]
>
>>> +	if (unlikely(pool == NULL || s == NULL))
>> Here, the right check is being done, so at the very least, I would make
>> the last line I commented on the same as this one for code continuity.
> ok
>
> [...]
>
>>> +	if (unlikely(!(pool && chunk)))
>> Please make this check the same as the last line I commented on,
>> especially since it's the same struct being checked.
> yes

OK :-)


>
> [...]
>
>>> +	if (!name) {
>>> +		WARN_ON(1);
>> ??  Maybe the name check should be in WARN_ON()?
> true :-(

OK.


>
> [...]
>
>>> +	if (unlikely(!req_size || !pool))
>> same unlikely() check problem mentioned before.
>>> +		return -1;
>> Can we use an errno value instead for better diagnosibility?
>>> +
>>> +	data = pool->data;
>>> +
>>> +	if (data == NULL)
>>> +		return -1;
>> Same here (ENOMEM or ENXIO comes to mind).
>>> +
>>> +	if (unlikely(data->protected)) {
>>> +		WARN_ON(1);
>> Maybe re-write this with the check inside WARN_ON()?
>>> +		return -1;
>> Same here, how about a different errno value for this case?
> yes, to all of the above

OK.

>
> [...]
>
>>> +static void pmalloc_chunk_set_protection(struct gen_pool *pool,
>>> +
>>> +					 struct gen_pool_chunk *chunk,
>>> +					 void *data)
>>> +{
>>> +	const bool *flag = data;
>>> +	size_t chunk_size = chunk->end_addr + 1 - chunk->start_addr;
>>> +	unsigned long pages = chunk_size / PAGE_SIZE;
>>> +
>>> +	BUG_ON(chunk_size & (PAGE_SIZE - 1));
>> Re-think WARN_ON() for BUG_ON()?  And also check chunk as well, as it's
>> being used below?
> ok
>
>>> +
>>> +	if (*flag)
>>> +		set_memory_ro(chunk->start_addr, pages);
>>> +	else
>>> +		set_memory_rw(chunk->start_addr, pages);
>>> +}
>>> +
>>> +static int pmalloc_pool_set_protection(struct gen_pool *pool, bool protection)
>>> +{
>>> +	struct pmalloc_data *data;
>>> +	struct gen_pool_chunk *chunk;
>>> +
>>> +	if (unlikely(!pool))
>>> +		return -EINVAL;
>> This is example of what I'd perfer seeing in check_alloc_params().
> yes
>
>>> +
>>> +	data = pool->data;
>>> +
>>> +	if (unlikely(!data))
>>> +		return -EINVAL;
>> ENXIO or EIO or ENOMEM sound better?
> Why? At least based on he description from errno-base.h, EINVAL seemed
> the most appropriate:
>
> #define	EIO		 5	/* I/O error */
> #define	ENXIO		 6	/* No such device or address */
> #define	ENOMEM		12	/* Out of memory */
>
> #define	EINVAL		22	/* Invalid argument */
>
> If I was really pressed to change it, I'd rather pick:
>
> #define	EFAULT		14	/* Bad address */
>

OK, very reasonable.


>>> +
>>> +	if (unlikely(data->protected == protection)) {
>>> +		WARN_ON(1);
>> Better to put the check inside WARN_ON, me thinks...
> yes, I have no idea why I wrote it like that  :-(

No worries :-).


>
>>> +		return 0;
>>> +	}
>>> +
>>> +	data->protected = protection;
>>> +	list_for_each_entry(chunk, &(pool)->chunks, next_chunk)
>>> +		pmalloc_chunk_set_protection(pool, chunk, &protection);
>>> +	return 0;
>>> +}
>>> +
>>> +int pmalloc_protect_pool(struct gen_pool *pool)
>>> +{
>>> +	return pmalloc_pool_set_protection(pool, true);
>> Is pool == NULL being checked somewhere, similar to previous functions
>> in this patch?
> right.

OK.


>
>>> +}
>>> +
>>> +
>>> +static void pmalloc_chunk_free(struct gen_pool *pool,
>>> +			       struct gen_pool_chunk *chunk, void *data)
>>> +{
>> Wat is 'data' being used for? Looks unused.  Should parameters be
>> checked, like other ones?
>
> This is the iterator that is passed to genalloc
> genalloc defines the format for the iterator, because it *will* want to
> pass the pointer to the opaque data structure.

OK.


>
>>> +	untag_chunk(chunk);
>>> +	gen_pool_flush_chunk(pool, chunk);
>>> +	vfree_atomic((void *)chunk->start_addr);
>>> +}
>>> +
>>> +
>>> +int pmalloc_destroy_pool(struct gen_pool *pool)
>>> +{
>>> +	struct pmalloc_data *data;
>>> +
>>> +	if (unlikely(pool == NULL))
>>> +		return -EINVAL;
>>> +
>>> +	data = pool->data;
>>> +
>>> +	if (unlikely(data == NULL))
>>> +		return -EINVAL;
>> I'd use a different errno value since you already used it for pool.
> Thinking more about this, how about collapsing them?
> I do need to check for the value of data, before I dereference it,
> however the causes are very different:
>
> * wrong pool pointer - definitely possible
> * corrupted data structure (data member overwritten) - highly unlikely


That sounds good.


>>> +
>>> +	mutex_lock(&pmalloc_mutex);
>>> +	list_del(&data->node);
>>> +	mutex_unlock(&pmalloc_mutex);
>>> +
>>> +	if (likely(data->pool_kobject))
>>> +		pmalloc_disconnect(data, data->pool_kobject);
>>> +
>>> +	pmalloc_pool_set_protection(pool, false);
>>> +	gen_pool_for_each_chunk(pool, pmalloc_chunk_free, NULL);
>>> +	gen_pool_destroy(pool);
>>> +	kfree(data);
>> Does data need to be set to NULL in this case, as data is a member of
>> pool (pool->data)?  I'm worried about dangling pointer scenarios which
>> probably isn't good for security modules?
> The pool was destroyed in the previous step.
> There is nothing left that can be set to NULL.
> Unless we are expecting an use-after-free of the pool structure.
> But everything it was referring to is gone as well.
>
> If we really want to go after this case, then basically everything that
> has been allocated should be poisoned before being freed.
>
> Isn't it a bit too much?

I'd ask one of the maintainers (James or Serge) if it's too much, this 
is the Linux Security Module after all.  But it also could be a kernel 
hardening thing to do?

>
>>> +	return 0;
>>> +}
>>> +
>>> +/*
>>> + * When the sysfs is ready to receive registrations, connect all the
>>> + * pools previously created. Also enable further pools to be connected
>>> + * right away.
>>> + */
>>> +static int __init pmalloc_late_init(void)
>>> +{
>>> +	struct pmalloc_data *data, *n;
>>> +
>>> +	pmalloc_kobject = kobject_create_and_add("pmalloc", kernel_kobj);
>>> +
>>> +	mutex_lock(&pmalloc_mutex);
>>> +	pmalloc_list = &pmalloc_final_list;
>>> +
>>> +	if (likely(pmalloc_kobject != NULL)) {
>>> +		list_for_each_entry_safe(data, n, &pmalloc_tmp_list, node) {
>>> +			list_move(&data->node, &pmalloc_final_list);
>>> +			pmalloc_connect(data);
>>> +		}
>>> +	}
>> It would be nice to have the init() return an error value in case of
>> failure.
> ok
>
> --
> igor

Thanks,
Jay


--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index dcaa33e74b1c..b6c4cea9fbd8 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -121,6 +121,9 @@  extern unsigned long gen_pool_alloc_algo(struct gen_pool *, size_t,
 extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma);
 extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
+
+extern void gen_pool_flush_chunk(struct gen_pool *pool,
+				 struct gen_pool_chunk *chunk);
 extern void gen_pool_for_each_chunk(struct gen_pool *,
 	void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *);
 extern size_t gen_pool_avail(struct gen_pool *);
diff --git a/include/linux/pmalloc.h b/include/linux/pmalloc.h
new file mode 100644
index 000000000000..afc2068d5545
--- /dev/null
+++ b/include/linux/pmalloc.h
@@ -0,0 +1,242 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * pmalloc.h: Header for Protectable Memory Allocator
+ *
+ * (C) Copyright 2017 Huawei Technologies Co. Ltd.
+ * Author: Igor Stoppa <igor.stoppa@huawei.com>
+ */
+
+#ifndef _LINUX_PMALLOC_H
+#define _LINUX_PMALLOC_H
+
+
+#include <linux/genalloc.h>
+#include <linux/string.h>
+
+#define PMALLOC_DEFAULT_ALLOC_ORDER (-1)
+
+/*
+ * Library for dynamic allocation of pools of memory that can be,
+ * after initialization, marked as read-only.
+ *
+ * This is intended to complement __read_only_after_init, for those cases
+ * where either it is not possible to know the initialization value before
+ * init is completed, or the amount of data is variable and can be
+ * determined only at run-time.
+ *
+ * ***WARNING***
+ * The user of the API is expected to synchronize:
+ * 1) allocation,
+ * 2) writes to the allocated memory,
+ * 3) write protection of the pool,
+ * 4) freeing of the allocated memory, and
+ * 5) destruction of the pool.
+ *
+ * For a non-threaded scenario, this type of locking is not even required.
+ *
+ * Even if the library were to provide support for locking, point 2)
+ * would still depend on the user taking the lock.
+ */
+
+
+/**
+ * pmalloc_create_pool() - create a new protectable memory pool
+ * @name: the name of the pool, enforced to be unique
+ * @min_alloc_order: log2 of the minimum allocation size obtainable
+ *                   from the pool
+ *
+ * Creates a new (empty) memory pool for allocation of protectable
+ * memory. Memory will be allocated upon request (through pmalloc).
+ *
+ * Return:
+ * * pointer to the new pool	- success
+ * * NULL			- error
+ */
+struct gen_pool *pmalloc_create_pool(const char *name,
+					 int min_alloc_order);
+
+/**
+ * is_pmalloc_object() - validates the existence of an alleged object
+ * @ptr: address of the object
+ * @n: size of the object, in bytes
+ *
+ * Return:
+ * * 0		- the object does not belong to pmalloc
+ * * 1		- the object belongs to pmalloc
+ * * \-1	- the object overlaps pmalloc memory incorrectly
+ */
+int is_pmalloc_object(const void *ptr, const unsigned long n);
+
+/**
+ * pmalloc_prealloc() - tries to allocate a memory chunk of the requested size
+ * @pool: handle to the pool to be used for memory allocation
+ * @size: amount of memory (in bytes) requested
+ *
+ * Prepares a chunk of the requested size.
+ * This is intended to both minimize latency in later memory requests and
+ * avoid sleeping during allocation.
+ * Memory allocated with prealloc is stored in one single chunk, as
+ * opposed to what is allocated on-demand when pmalloc runs out of free
+ * space already existing in the pool and has to invoke vmalloc.
+ * One additional advantage of pre-allocating larger chunks of memory is
+ * that the total slack tends to be smaller.
+ *
+ * Return:
+ * * true	- the vmalloc call was successful
+ * * false	- error
+ */
+bool pmalloc_prealloc(struct gen_pool *pool, size_t size);
+
+/**
+ * pmalloc() - allocate protectable memory from a pool
+ * @pool: handle to the pool to be used for memory allocation
+ * @size: amount of memory (in bytes) requested
+ * @gfp: flags for page allocation
+ *
+ * Allocates memory from an unprotected pool. If the pool doesn't have
+ * enough memory, and the request did not include GFP_ATOMIC, an attempt
+ * is made to add a new chunk of memory to the pool
+ * (a multiple of PAGE_SIZE), in order to fit the new request.
+ * Otherwise, NULL is returned.
+ *
+ * Return:
+ * * pointer to the memory requested	- success
+ * * NULL				- either no memory available or
+ *					  pool already read-only
+ */
+void *pmalloc(struct gen_pool *pool, size_t size, gfp_t gfp);
+
+
+/**
+ * pzalloc() - zero-initialized version of pmalloc
+ * @pool: handle to the pool to be used for memory allocation
+ * @size: amount of memory (in bytes) requested
+ * @gfp: flags for page allocation
+ *
+ * Executes pmalloc, initializing the memory requested to 0,
+ * before returning the pointer to it.
+ *
+ * Return:
+ * * pointer to the memory requested	- success
+ * * NULL				- either no memory available or
+ *					  pool already read-only
+ */
+static inline void *pzalloc(struct gen_pool *pool, size_t size, gfp_t gfp)
+{
+	return pmalloc(pool, size, gfp | __GFP_ZERO);
+}
+
+/**
+ * pmalloc_array() - allocates an array according to the parameters
+ * @pool: handle to the pool to be used for memory allocation
+ * @n: number of elements in the array
+ * @size: amount of memory (in bytes) requested for each element
+ * @flags: flags for page allocation
+ *
+ * Executes pmalloc, if it has a chance to succeed.
+ *
+ * Return:
+ * * the pmalloc result	- success
+ * * NULL		- error
+ */
+static inline void *pmalloc_array(struct gen_pool *pool, size_t n,
+				  size_t size, gfp_t flags)
+{
+	if (unlikely(!(pool && n && size)))
+		return NULL;
+	return pmalloc(pool, n * size, flags);
+}
+
+/**
+ * pcalloc() - allocates a 0-initialized array according to the parameters
+ * @pool: handle to the pool to be used for memory allocation
+ * @n: number of elements in the array
+ * @size: amount of memory (in bytes) requested
+ * @flags: flags for page allocation
+ *
+ * Executes pmalloc_array, if it has a chance to succeed.
+ *
+ * Return:
+ * * the pmalloc result	- success
+ * * NULL		- error
+ */
+static inline void *pcalloc(struct gen_pool *pool, size_t n,
+			    size_t size, gfp_t flags)
+{
+	return pmalloc_array(pool, n, size, flags | __GFP_ZERO);
+}
+
+/**
+ * pstrdup() - duplicate a string, using pmalloc as allocator
+ * @pool: handle to the pool to be used for memory allocation
+ * @s: string to duplicate
+ * @gfp: flags for page allocation
+ *
+ * Generates a copy of the given string, allocating sufficient memory
+ * from the given pmalloc pool.
+ *
+ * Return:
+ * * pointer to the replica	- success
+ * * NULL			- error
+ */
+static inline char *pstrdup(struct gen_pool *pool, const char *s, gfp_t gfp)
+{
+	size_t len;
+	char *buf;
+
+	if (unlikely(pool == NULL || s == NULL))
+		return NULL;
+
+	len = strlen(s) + 1;
+	buf = pmalloc(pool, len, gfp);
+	if (likely(buf))
+		strncpy(buf, s, len);
+	return buf;
+}
+
+/**
+ * pmalloc_protect_pool() - turn a read/write pool read-only
+ * @pool: the pool to protect
+ *
+ * Write-protects all the memory chunks assigned to the pool.
+ * This prevents any further allocation.
+ *
+ * Return:
+ * * 0		- success
+ * * -EINVAL	- error
+ */
+int pmalloc_protect_pool(struct gen_pool *pool);
+
+/**
+ * pfree() - mark as unused memory that was previously in use
+ * @pool: handle to the pool to be used for memory allocation
+ * @addr: the beginning of the memory area to be freed
+ *
+ * The behavior of pfree is different, depending on the state of the
+ * protection.
+ * If the pool is not yet protected, the memory is marked as unused and
+ * will be available for further allocations.
+ * If the pool is already protected, the memory is marked as unused, but
+ * it will still be impossible to perform further allocation, because of
+ * the existing protection.
+ * The freed memory, in this case, will be truly released only when the
+ * pool is destroyed.
+ */
+static inline void pfree(struct gen_pool *pool, const void *addr)
+{
+	gen_pool_free(pool, (unsigned long)addr, 0);
+}
+
+/**
+ * pmalloc_destroy_pool() - destroys a pool and all the associated memory
+ * @pool: the pool to destroy
+ *
+ * All the memory that was allocated through pmalloc in the pool will be freed.
+ *
+ * Return:
+ * * 0		- success
+ * * -EINVAL	- error
+ */
+int pmalloc_destroy_pool(struct gen_pool *pool);
+
+#endif
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 1e5d8c392f15..116d280cca53 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -20,6 +20,7 @@  struct notifier_block;		/* in notifier.h */
 #define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
 #define VM_NO_GUARD		0x00000040      /* don't add guard page */
 #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
+#define VM_PMALLOC		0x00000100	/* pmalloc area - see docs */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 87f62f31b52f..24ed35035095 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -625,6 +625,33 @@  void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
 }
 EXPORT_SYMBOL(gen_pool_free);
 
+
+/**
+ * gen_pool_flush_chunk() - drops all the allocations from a specific chunk
+ * @pool:	the generic memory pool
+ * @chunk:	The chunk to wipe clear.
+ *
+ * This is meant to be called only while destroying a pool. It's up to the
+ * caller to avoid races, but really, at this point the pool should have
+ * already been retired and it should have become unavailable for any other
+ * sort of operation.
+ */
+void gen_pool_flush_chunk(struct gen_pool *pool,
+			  struct gen_pool_chunk *chunk)
+{
+	size_t size;
+
+	if (unlikely(!(pool && chunk)))
+		return;
+
+	size = chunk->end_addr + 1 - chunk->start_addr;
+	memset(chunk->entries, 0,
+	       DIV_ROUND_UP(size >> pool->min_alloc_order * BITS_PER_ENTRY,
+			    BITS_PER_BYTE));
+	atomic_long_set(&chunk->avail, size);
+}
+
+
 /**
  * gen_pool_for_each_chunk() - call func for every chunk of generic memory pool
  * @pool:	the generic memory pool
diff --git a/mm/Kconfig b/mm/Kconfig
index c782e8fb7235..be578fbdce6d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -760,3 +760,9 @@  config GUP_BENCHMARK
 	  performance of get_user_pages_fast().
 
 	  See tools/testing/selftests/vm/gup_benchmark.c
+
+config PROTECTABLE_MEMORY
+    bool
+    depends on ARCH_HAS_SET_MEMORY
+    select GENERIC_ALLOCATOR
+    default y
diff --git a/mm/Makefile b/mm/Makefile
index e669f02c5a54..959fdbdac118 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -65,6 +65,7 @@  obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_PROTECTABLE_MEMORY) += pmalloc.o
 obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/pmalloc.c b/mm/pmalloc.c
new file mode 100644
index 000000000000..abddba90a9f6
--- /dev/null
+++ b/mm/pmalloc.c
@@ -0,0 +1,499 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * pmalloc.c: Protectable Memory Allocator
+ *
+ * (C) Copyright 2017 Huawei Technologies Co. Ltd.
+ * Author: Igor Stoppa <igor.stoppa@huawei.com>
+ */
+
+#include <linux/printk.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/genalloc.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/atomic.h>
+#include <linux/rculist.h>
+#include <linux/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/page.h>
+
+#include <linux/pmalloc.h>
+/*
+ * pmalloc_data contains the data specific to a pmalloc pool,
+ * in a format compatible with the design of gen_alloc.
+ * Some of the fields are used for exposing the corresponding parameter
+ * to userspace, through sysfs.
+ */
+struct pmalloc_data {
+	struct gen_pool *pool;  /* Link back to the associated pool. */
+	bool protected;     /* Status of the pool: RO or RW. */
+	struct kobj_attribute attr_protected; /* Sysfs attribute. */
+	struct kobj_attribute attr_avail;     /* Sysfs attribute. */
+	struct kobj_attribute attr_size;      /* Sysfs attribute. */
+	struct kobj_attribute attr_chunks;    /* Sysfs attribute. */
+	struct kobject *pool_kobject;
+	struct list_head node; /* list of pools */
+};
+
+static LIST_HEAD(pmalloc_final_list);
+static LIST_HEAD(pmalloc_tmp_list);
+static struct list_head *pmalloc_list = &pmalloc_tmp_list;
+static DEFINE_MUTEX(pmalloc_mutex);
+static struct kobject *pmalloc_kobject;
+
+static ssize_t pmalloc_pool_show_protected(struct kobject *dev,
+					   struct kobj_attribute *attr,
+					   char *buf)
+{
+	struct pmalloc_data *data;
+
+	data = container_of(attr, struct pmalloc_data, attr_protected);
+	if (data->protected)
+		return sprintf(buf, "protected\n");
+	else
+		return sprintf(buf, "unprotected\n");
+}
+
+static ssize_t pmalloc_pool_show_avail(struct kobject *dev,
+				       struct kobj_attribute *attr,
+				       char *buf)
+{
+	struct pmalloc_data *data;
+
+	data = container_of(attr, struct pmalloc_data, attr_avail);
+	return sprintf(buf, "%lu\n",
+		       (unsigned long)gen_pool_avail(data->pool));
+}
+
+static ssize_t pmalloc_pool_show_size(struct kobject *dev,
+				      struct kobj_attribute *attr,
+				      char *buf)
+{
+	struct pmalloc_data *data;
+
+	data = container_of(attr, struct pmalloc_data, attr_size);
+	return sprintf(buf, "%lu\n",
+		       (unsigned long)gen_pool_size(data->pool));
+}
+
+static void pool_chunk_number(struct gen_pool *pool,
+			      struct gen_pool_chunk *chunk, void *data)
+{
+	unsigned long *counter = data;
+
+	(*counter)++;
+}
+
+static ssize_t pmalloc_pool_show_chunks(struct kobject *dev,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	struct pmalloc_data *data;
+	unsigned long chunks_num = 0;
+
+	data = container_of(attr, struct pmalloc_data, attr_chunks);
+	gen_pool_for_each_chunk(data->pool, pool_chunk_number, &chunks_num);
+	return sprintf(buf, "%lu\n", chunks_num);
+}
+
+/* Exposes the pool and its attributes through sysfs. */
+static struct kobject *pmalloc_connect(struct pmalloc_data *data)
+{
+	const struct attribute *attrs[] = {
+		&data->attr_protected.attr,
+		&data->attr_avail.attr,
+		&data->attr_size.attr,
+		&data->attr_chunks.attr,
+		NULL
+	};
+	struct kobject *kobj;
+
+	kobj = kobject_create_and_add(data->pool->name, pmalloc_kobject);
+	if (unlikely(!kobj))
+		return NULL;
+
+	if (unlikely(sysfs_create_files(kobj, attrs) < 0)) {
+		kobject_put(kobj);
+		kobj = NULL;
+	}
+	return kobj;
+}
+
+/* Removes the pool and its attributes from sysfs. */
+static void pmalloc_disconnect(struct pmalloc_data *data,
+			       struct kobject *kobj)
+{
+	const struct attribute *attrs[] = {
+		&data->attr_protected.attr,
+		&data->attr_avail.attr,
+		&data->attr_size.attr,
+		&data->attr_chunks.attr,
+		NULL
+	};
+
+	sysfs_remove_files(kobj, attrs);
+	kobject_put(kobj);
+}
+
+/* Declares an attribute of the pool. */
+#define pmalloc_attr_init(data, attr_name) \
+do { \
+	sysfs_attr_init(&data->attr_##attr_name.attr); \
+	data->attr_##attr_name.attr.name = #attr_name; \
+	data->attr_##attr_name.attr.mode = VERIFY_OCTAL_PERMISSIONS(0400); \
+	data->attr_##attr_name.show = pmalloc_pool_show_##attr_name; \
+} while (0)
+
+struct gen_pool *pmalloc_create_pool(const char *name, int min_alloc_order)
+{
+	struct gen_pool *pool;
+	const char *pool_name;
+	struct pmalloc_data *data;
+
+	if (!name) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (min_alloc_order < 0)
+		min_alloc_order = ilog2(sizeof(unsigned long));
+
+	pool = gen_pool_create(min_alloc_order, NUMA_NO_NODE);
+	if (unlikely(!pool))
+		return NULL;
+
+	mutex_lock(&pmalloc_mutex);
+	list_for_each_entry(data, pmalloc_list, node)
+		if (!strcmp(name, data->pool->name))
+			goto same_name_err;
+
+	pool_name = kstrdup(name, GFP_KERNEL);
+	if (unlikely(!pool_name))
+		goto name_alloc_err;
+
+	data = kzalloc(sizeof(struct pmalloc_data), GFP_KERNEL);
+	if (unlikely(!data))
+		goto data_alloc_err;
+
+	data->protected = false;
+	data->pool = pool;
+	pmalloc_attr_init(data, protected);
+	pmalloc_attr_init(data, avail);
+	pmalloc_attr_init(data, size);
+	pmalloc_attr_init(data, chunks);
+	pool->data = data;
+	pool->name = pool_name;
+
+	list_add(&data->node, pmalloc_list);
+	if (pmalloc_list == &pmalloc_final_list)
+		data->pool_kobject = pmalloc_connect(data);
+	mutex_unlock(&pmalloc_mutex);
+	return pool;
+
+data_alloc_err:
+	kfree(pool_name);
+name_alloc_err:
+same_name_err:
+	mutex_unlock(&pmalloc_mutex);
+	gen_pool_destroy(pool);
+	return NULL;
+}
+
+static inline int check_alloc_params(struct gen_pool *pool, size_t req_size)
+{
+	struct pmalloc_data *data;
+
+	if (unlikely(!req_size || !pool))
+		return -1;
+
+	data = pool->data;
+
+	if (data == NULL)
+		return -1;
+
+	if (unlikely(data->protected)) {
+		WARN_ON(1);
+		return -1;
+	}
+	return 0;
+}
+
+
+static inline bool chunk_tagging(void *chunk, bool tag)
+{
+	struct vm_struct *area;
+	struct page *page;
+
+	if (!is_vmalloc_addr(chunk))
+		return false;
+
+	page = vmalloc_to_page(chunk);
+	if (unlikely(!page))
+		return false;
+
+	area = page->area;
+	if (tag)
+		area->flags |= VM_PMALLOC;
+	else
+		area->flags &= ~VM_PMALLOC;
+	return true;
+}
+
+
+static inline bool tag_chunk(void *chunk)
+{
+	return chunk_tagging(chunk, true);
+}
+
+
+static inline bool untag_chunk(void *chunk)
+{
+	return chunk_tagging(chunk, false);
+}
+
+enum {
+	INVALID_PMALLOC_OBJECT = -1,
+	NOT_PMALLOC_OBJECT = 0,
+	VALID_PMALLOC_OBJECT = 1,
+};
+
+int is_pmalloc_object(const void *ptr, const unsigned long n)
+{
+	struct vm_struct *area;
+	struct page *page;
+	unsigned long area_start;
+	unsigned long area_end;
+	unsigned long object_start;
+	unsigned long object_end;
+
+
+	/*
+	 * is_pmalloc_object gets called pretty late, so chances are high
+	 * that the object is indeed of vmalloc type
+	 */
+	if (unlikely(!is_vmalloc_addr(ptr)))
+		return NOT_PMALLOC_OBJECT;
+
+	page = vmalloc_to_page(ptr);
+	if (unlikely(!page))
+		return NOT_PMALLOC_OBJECT;
+
+	area = page->area;
+
+	if (likely(!(area->flags & VM_PMALLOC)))
+		return NOT_PMALLOC_OBJECT;
+
+	area_start = (unsigned long)area->addr;
+	area_end = area_start + area->nr_pages * PAGE_SIZE - 1;
+	object_start = (unsigned long)ptr;
+	object_end = object_start + n - 1;
+
+	if (likely((area_start <= object_start) &&
+		   (object_end <= area_end)))
+		return VALID_PMALLOC_OBJECT;
+	else
+		return INVALID_PMALLOC_OBJECT;
+}
+
+
+bool pmalloc_prealloc(struct gen_pool *pool, size_t size)
+{
+	void *chunk;
+	size_t chunk_size;
+	bool add_error;
+
+	if (check_alloc_params(pool, size))
+		return false;
+
+	/* Expand pool */
+	chunk_size = roundup(size, PAGE_SIZE);
+	chunk = vmalloc(chunk_size);
+	if (unlikely(chunk == NULL))
+		return false;
+
+	/* Locking is already done inside gen_pool_add */
+	add_error = gen_pool_add(pool, (unsigned long)chunk, chunk_size,
+				 NUMA_NO_NODE);
+	if (unlikely(add_error != 0))
+		goto abort;
+
+	return true;
+abort:
+	vfree_atomic(chunk);
+	return false;
+
+}
+
+void *pmalloc(struct gen_pool *pool, size_t size, gfp_t gfp)
+{
+	void *chunk;
+	size_t chunk_size;
+	bool add_error;
+	unsigned long retval;
+
+	if (check_alloc_params(pool, size))
+		return NULL;
+
+retry_alloc_from_pool:
+	retval = gen_pool_alloc(pool, size);
+	if (retval)
+		goto return_allocation;
+
+	if (unlikely((gfp & __GFP_ATOMIC))) {
+		if (unlikely((gfp & __GFP_NOFAIL)))
+			goto retry_alloc_from_pool;
+		else
+			return NULL;
+	}
+
+	/* Expand pool */
+	chunk_size = roundup(size, PAGE_SIZE);
+	chunk = vmalloc(chunk_size);
+	if (unlikely(!chunk)) {
+		if (unlikely((gfp & __GFP_NOFAIL)))
+			goto retry_alloc_from_pool;
+		else
+			return NULL;
+	}
+	if (unlikely(!tag_chunk(chunk)))
+		goto free;
+
+	/* Locking is already done inside gen_pool_add */
+	add_error = gen_pool_add(pool, (unsigned long)chunk, chunk_size,
+				 NUMA_NO_NODE);
+	if (unlikely(add_error))
+		goto abort;
+
+	retval = gen_pool_alloc(pool, size);
+	if (retval) {
+return_allocation:
+		*(size_t *)retval = size;
+		if (gfp & __GFP_ZERO)
+			memset((void *)retval, 0, size);
+		return (void *)retval;
+	}
+	/*
+	 * Here there is no test for __GFP_NO_FAIL because, in case of
+	 * concurrent allocation, one thread might add a chunk to the
+	 * pool and this memory could be allocated by another thread,
+	 * before the first thread gets a chance to use it.
+	 * As long as vmalloc succeeds, it's ok to retry.
+	 */
+	goto retry_alloc_from_pool;
+abort:
+	untag_chunk(chunk);
+free:
+	vfree_atomic(chunk);
+	return NULL;
+}
+
+static void pmalloc_chunk_set_protection(struct gen_pool *pool,
+
+					 struct gen_pool_chunk *chunk,
+					 void *data)
+{
+	const bool *flag = data;
+	size_t chunk_size = chunk->end_addr + 1 - chunk->start_addr;
+	unsigned long pages = chunk_size / PAGE_SIZE;
+
+	BUG_ON(chunk_size & (PAGE_SIZE - 1));
+
+	if (*flag)
+		set_memory_ro(chunk->start_addr, pages);
+	else
+		set_memory_rw(chunk->start_addr, pages);
+}
+
+static int pmalloc_pool_set_protection(struct gen_pool *pool, bool protection)
+{
+	struct pmalloc_data *data;
+	struct gen_pool_chunk *chunk;
+
+	if (unlikely(!pool))
+		return -EINVAL;
+
+	data = pool->data;
+
+	if (unlikely(!data))
+		return -EINVAL;
+
+	if (unlikely(data->protected == protection)) {
+		WARN_ON(1);
+		return 0;
+	}
+
+	data->protected = protection;
+	list_for_each_entry(chunk, &(pool)->chunks, next_chunk)
+		pmalloc_chunk_set_protection(pool, chunk, &protection);
+	return 0;
+}
+
+int pmalloc_protect_pool(struct gen_pool *pool)
+{
+	return pmalloc_pool_set_protection(pool, true);
+}
+
+
+static void pmalloc_chunk_free(struct gen_pool *pool,
+			       struct gen_pool_chunk *chunk, void *data)
+{
+	untag_chunk(chunk);
+	gen_pool_flush_chunk(pool, chunk);
+	vfree_atomic((void *)chunk->start_addr);
+}
+
+
+int pmalloc_destroy_pool(struct gen_pool *pool)
+{
+	struct pmalloc_data *data;
+
+	if (unlikely(pool == NULL))
+		return -EINVAL;
+
+	data = pool->data;
+
+	if (unlikely(data == NULL))
+		return -EINVAL;
+
+	mutex_lock(&pmalloc_mutex);
+	list_del(&data->node);
+	mutex_unlock(&pmalloc_mutex);
+
+	if (likely(data->pool_kobject))
+		pmalloc_disconnect(data, data->pool_kobject);
+
+	pmalloc_pool_set_protection(pool, false);
+	gen_pool_for_each_chunk(pool, pmalloc_chunk_free, NULL);
+	gen_pool_destroy(pool);
+	kfree(data);
+	return 0;
+}
+
+/*
+ * When the sysfs is ready to receive registrations, connect all the
+ * pools previously created. Also enable further pools to be connected
+ * right away.
+ */
+static int __init pmalloc_late_init(void)
+{
+	struct pmalloc_data *data, *n;
+
+	pmalloc_kobject = kobject_create_and_add("pmalloc", kernel_kobj);
+
+	mutex_lock(&pmalloc_mutex);
+	pmalloc_list = &pmalloc_final_list;
+
+	if (likely(pmalloc_kobject != NULL)) {
+		list_for_each_entry_safe(data, n, &pmalloc_tmp_list, node) {
+			list_move(&data->node, &pmalloc_final_list);
+			pmalloc_connect(data);
+		}
+	}
+	mutex_unlock(&pmalloc_mutex);
+	return 0;
+}
+late_initcall(pmalloc_late_init);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index e9e9325f7638..946ce051e296 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -240,6 +240,36 @@  static inline void check_heap_object(const void *ptr, unsigned long n,
 	}
 }
 
+#ifdef CONFIG_PROTECTABLE_MEMORY
+
+int is_pmalloc_object(const void *ptr, const unsigned long n);
+
+static void check_pmalloc_object(const void *ptr, unsigned long n,
+				 bool to_user)
+{
+	int retv;
+
+	retv = is_pmalloc_object(ptr, n);
+	if (unlikely(retv)) {
+		if (unlikely(!to_user))
+			usercopy_abort("pmalloc",
+				       "trying to write to pmalloc object",
+				       to_user, (const unsigned long)ptr, n);
+		if (retv < 0)
+			usercopy_abort("pmalloc",
+				       "invalid pmalloc object",
+				       to_user, (const unsigned long)ptr, n);
+	}
+}
+
+#else
+
+static void check_pmalloc_object(const void *ptr, unsigned long n,
+				 bool to_user)
+{
+}
+#endif
+
 /*
  * Validates that the given object is:
  * - not bogus address
@@ -277,5 +307,8 @@  void __check_object_size(const void *ptr, unsigned long n, bool to_user)
 
 	/* Check for object in kernel to avoid text exposure. */
 	check_kernel_text_object((const unsigned long)ptr, n, to_user);
+
+	/* Check if object is from a pmalloc chunk. */
+	check_pmalloc_object(ptr, n, to_user);
 }
 EXPORT_SYMBOL(__check_object_size);