diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 93feb84444896..cdc46a501b857 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); Called before checking the reference count on each subsystem. This may be useful for subsystems which have some extra references even if -there are not tasks in the cgroup. +there are not tasks in the cgroup. If pre_destroy() returns error code, +rmdir() will fail with it. From this behavior, pre_destroy() can be +called multiple times against a cgroup. int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *task) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9a23bb0982057..7d824b80b3d78 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -135,6 +135,10 @@ enum { CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, + /* + * A thread in rmdir() is wating for this cgroup. + */ + CGRP_WAIT_ON_RMDIR, }; struct cgroup { @@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d3c521137425f..fc5e4a48582f6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ -static void cgroup_call_pre_destroy(struct cgroup *cgrp) +static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; + int ret = 0; + for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) - ss->pre_destroy(ss, cgrp); - return; + if (ss->pre_destroy) { + ret = ss->pre_destroy(ss, cgrp); + if (ret) + break; + } + return ret; } static void free_cgroup_rcu(struct rcu_head *obj) @@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } +/* + * A queue for waiters to do rmdir() cgroup. A tasks will sleep when + * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some + * reference to css->refcnt. In general, this refcnt is expected to goes down + * to zero, soon. + * + * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + */ +DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); + +static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +{ + if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + wake_up_all(&cgroup_rmdir_waitq); +} + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); put_css_set(cg); + + /* + * wake up rmdir() waiter. the rmdir should fail since the cgroup + * is no longer empty. + */ + cgroup_wakeup_rmdir_waiters(cgrp); return 0; } @@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; + DEFINE_WAIT(wait); + int ret; /* the vfs holds both inode->i_mutex already */ - +again: mutex_lock(&cgroup_mutex); if (atomic_read(&cgrp->count) != 0) { mutex_unlock(&cgroup_mutex); @@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. */ - cgroup_call_pre_destroy(cgrp); + ret = cgroup_call_pre_destroy(cgrp); + if (ret) + return ret; mutex_lock(&cgroup_mutex); parent = cgrp->parent; - - if (atomic_read(&cgrp->count) - || !list_empty(&cgrp->children) - || !cgroup_clear_css_refs(cgrp)) { + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } + /* + * css_put/get is provided for subsys to grab refcnt to css. In typical + * case, subsystem has no reference after pre_destroy(). But, under + * hierarchy management, some *temporal* refcnt can be hold. + * To avoid returning -EBUSY to a user, waitqueue is used. If subsys + * is really busy, it should return -EBUSY at pre_destroy(). wake_up + * is called when css_put() is called and refcnt goes down to 0. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); + + if (!cgroup_clear_css_refs(cgrp)) { + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + if (signal_pending(current)) + return -EINTR; + goto again; + } + /* NO css_tryget() can success after here. */ + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); @@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if ((atomic_dec_return(&css->refcnt) == 1) && - notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (atomic_dec_return(&css->refcnt) == 1) { + if (notify_on_release(cgrp)) { + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } + cgroup_wakeup_rmdir_waiters(cgrp); } rcu_read_unlock(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e4be9cb2a6a7..8ffec674c5acc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2272,11 +2272,12 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return ERR_PTR(-ENOMEM); } -static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, +static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - mem_cgroup_force_empty(mem, false); + + return mem_cgroup_force_empty(mem, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss,