/*1* kernel/cpuset.c2*3* Processor and Memory placement constraints for sets of tasks.4*5* Copyright (C) 2003 BULL SA.6* Copyright (C) 2004-2007 Silicon Graphics, Inc.7* Copyright (C) 2006 Google, Inc8*9* Portions derived from Patrick Mochel's sysfs code.10* sysfs is Copyright (c) 2001-3 Patrick Mochel11*12* 2003-10-10 Written by Simon Derr.13* 2003-10-22 Updates by Stephen Hemminger.14* 2004 May-July Rework by Paul Jackson.15* 2006 Rework by Paul Menage to use generic cgroups16* 2008 Rework of the scheduler domains and CPU hotplug handling17* by Max Krasnyansky18*19* This file is subject to the terms and conditions of the GNU General Public20* License. See the file COPYING in the main directory of the Linux21* distribution for more details.22*/23#include "cpuset-internal.h"2425#include <linux/init.h>26#include <linux/interrupt.h>27#include <linux/kernel.h>28#include <linux/mempolicy.h>29#include <linux/mm.h>30#include <linux/memory.h>31#include <linux/export.h>32#include <linux/rcupdate.h>33#include <linux/sched.h>34#include <linux/sched/deadline.h>35#include <linux/sched/mm.h>36#include <linux/sched/task.h>37#include <linux/security.h>38#include <linux/oom.h>39#include <linux/sched/isolation.h>40#include <linux/wait.h>41#include <linux/workqueue.h>42#include <linux/task_work.h>4344DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);45DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);4647/*48* There could be abnormal cpuset configurations for cpu or memory49* node binding, add this key to provide a quick low-cost judgment50* of the situation.51*/52DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);5354static const char * const perr_strings[] = {55[PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",56[PERR_INVPARENT] = "Parent is an invalid partition root",57[PERR_NOTPART] = "Parent is not a partition root",58[PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",59[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",60[PERR_HOTPLUG] = "No cpu available due to hotplug",61[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",62[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",63[PERR_ACCESS] = "Enable partition not permitted",64[PERR_REMOTE] = "Have remote partition underneath",65};6667/*68* For local partitions, update to subpartitions_cpus & isolated_cpus is done69* in update_parent_effective_cpumask(). For remote partitions, it is done in70* the remote_partition_*() and remote_cpus_update() helpers.71*/72/*73* Exclusive CPUs distributed out to local or remote sub-partitions of74* top_cpuset75*/76static cpumask_var_t subpartitions_cpus;7778/*79* Exclusive CPUs in isolated partitions80*/81static cpumask_var_t isolated_cpus;8283/*84* Housekeeping (HK_TYPE_DOMAIN) CPUs at boot85*/86static cpumask_var_t boot_hk_cpus;87static bool have_boot_isolcpus;8889/* List of remote partition root children */90static struct list_head remote_children;9192/*93* A flag to force sched domain rebuild at the end of an operation.94* It can be set in95* - update_partition_sd_lb()96* - update_cpumasks_hier()97* - cpuset_update_flag()98* - cpuset_hotplug_update_tasks()99* - cpuset_handle_hotplug()100*101* Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.102*103* Note that update_relax_domain_level() in cpuset-v1.c can still call104* rebuild_sched_domains_locked() directly without using this flag.105*/106static bool force_sd_rebuild;107108/*109* Partition root states:110*111* 0 - member (not a partition root)112* 1 - partition root113* 2 - partition root without load balancing (isolated)114* -1 - invalid partition root115* -2 - invalid isolated partition root116*117* There are 2 types of partitions - local or remote. Local partitions are118* those whose parents are partition root themselves. Setting of119* cpuset.cpus.exclusive are optional in setting up local partitions.120* Remote partitions are those whose parents are not partition roots. Passing121* down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor122* nodes are mandatory in creating a remote partition.123*124* For simplicity, a local partition can be created under a local or remote125* partition but a remote partition cannot have any partition root in its126* ancestor chain except the cgroup root.127*/128#define PRS_MEMBER 0129#define PRS_ROOT 1130#define PRS_ISOLATED 2131#define PRS_INVALID_ROOT -1132#define PRS_INVALID_ISOLATED -2133134/*135* Temporary cpumasks for working with partitions that are passed among136* functions to avoid memory allocation in inner functions.137*/138struct tmpmasks {139cpumask_var_t addmask, delmask; /* For partition root */140cpumask_var_t new_cpus; /* For update_cpumasks_hier() */141};142143void inc_dl_tasks_cs(struct task_struct *p)144{145struct cpuset *cs = task_cs(p);146147cs->nr_deadline_tasks++;148}149150void dec_dl_tasks_cs(struct task_struct *p)151{152struct cpuset *cs = task_cs(p);153154cs->nr_deadline_tasks--;155}156157static inline bool is_partition_valid(const struct cpuset *cs)158{159return cs->partition_root_state > 0;160}161162static inline bool is_partition_invalid(const struct cpuset *cs)163{164return cs->partition_root_state < 0;165}166167static inline bool cs_is_member(const struct cpuset *cs)168{169return cs->partition_root_state == PRS_MEMBER;170}171172/*173* Callers should hold callback_lock to modify partition_root_state.174*/175static inline void make_partition_invalid(struct cpuset *cs)176{177if (cs->partition_root_state > 0)178cs->partition_root_state = -cs->partition_root_state;179}180181/*182* Send notification event of whenever partition_root_state changes.183*/184static inline void notify_partition_change(struct cpuset *cs, int old_prs)185{186if (old_prs == cs->partition_root_state)187return;188cgroup_file_notify(&cs->partition_file);189190/* Reset prs_err if not invalid */191if (is_partition_valid(cs))192WRITE_ONCE(cs->prs_err, PERR_NONE);193}194195/*196* The top_cpuset is always synchronized to cpu_active_mask and we should avoid197* using cpu_online_mask as much as possible. An active CPU is always an online198* CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ199* during hotplug operations. A CPU is marked active at the last stage of CPU200* bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code201* will be called to update the sched domains so that the scheduler can move202* a normal task to a newly active CPU or remove tasks away from a newly203* inactivated CPU. The online bit is set much earlier in the CPU bringup204* process and cleared much later in CPU teardown.205*206* If cpu_online_mask is used while a hotunplug operation is happening in207* parallel, we may leave an offline CPU in cpu_allowed or some other masks.208*/209static struct cpuset top_cpuset = {210.flags = BIT(CS_CPU_EXCLUSIVE) |211BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),212.partition_root_state = PRS_ROOT,213.relax_domain_level = -1,214.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),215};216217/*218* There are two global locks guarding cpuset structures - cpuset_mutex and219* callback_lock. The cpuset code uses only cpuset_mutex. Other kernel220* subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset221* structures. Note that cpuset_mutex needs to be a mutex as it is used in222* paths that rely on priority inheritance (e.g. scheduler - on RT) for223* correctness.224*225* A task must hold both locks to modify cpusets. If a task holds226* cpuset_mutex, it blocks others, ensuring that it is the only task able to227* also acquire callback_lock and be able to modify cpusets. It can perform228* various checks on the cpuset structure first, knowing nothing will change.229* It can also allocate memory while just holding cpuset_mutex. While it is230* performing these checks, various callback routines can briefly acquire231* callback_lock to query cpusets. Once it is ready to make the changes, it232* takes callback_lock, blocking everyone else.233*234* Calls to the kernel memory allocator can not be made while holding235* callback_lock, as that would risk double tripping on callback_lock236* from one of the callbacks into the cpuset code from within237* __alloc_pages().238*239* If a task is only holding callback_lock, then it has read-only240* access to cpusets.241*242* Now, the task_struct fields mems_allowed and mempolicy may be changed243* by other task, we use alloc_lock in the task_struct fields to protect244* them.245*246* The cpuset_common_seq_show() handlers only hold callback_lock across247* small pieces of code, such as when reading out possibly multi-word248* cpumasks and nodemasks.249*/250251static DEFINE_MUTEX(cpuset_mutex);252253/**254* cpuset_lock - Acquire the global cpuset mutex255*256* This locks the global cpuset mutex to prevent modifications to cpuset257* hierarchy and configurations. This helper is not enough to make modification.258*/259void cpuset_lock(void)260{261mutex_lock(&cpuset_mutex);262}263264void cpuset_unlock(void)265{266mutex_unlock(&cpuset_mutex);267}268269/**270* cpuset_full_lock - Acquire full protection for cpuset modification271*272* Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex273* to safely modify cpuset data.274*/275void cpuset_full_lock(void)276{277cpus_read_lock();278mutex_lock(&cpuset_mutex);279}280281void cpuset_full_unlock(void)282{283mutex_unlock(&cpuset_mutex);284cpus_read_unlock();285}286287static DEFINE_SPINLOCK(callback_lock);288289void cpuset_callback_lock_irq(void)290{291spin_lock_irq(&callback_lock);292}293294void cpuset_callback_unlock_irq(void)295{296spin_unlock_irq(&callback_lock);297}298299static struct workqueue_struct *cpuset_migrate_mm_wq;300301static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);302303static inline void check_insane_mems_config(nodemask_t *nodes)304{305if (!cpusets_insane_config() &&306movable_only_nodes(nodes)) {307static_branch_enable_cpuslocked(&cpusets_insane_config_key);308pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"309"Cpuset allocations might fail even with a lot of memory available.\n",310nodemask_pr_args(nodes));311}312}313314/*315* decrease cs->attach_in_progress.316* wake_up cpuset_attach_wq if cs->attach_in_progress==0.317*/318static inline void dec_attach_in_progress_locked(struct cpuset *cs)319{320lockdep_assert_held(&cpuset_mutex);321322cs->attach_in_progress--;323if (!cs->attach_in_progress)324wake_up(&cpuset_attach_wq);325}326327static inline void dec_attach_in_progress(struct cpuset *cs)328{329mutex_lock(&cpuset_mutex);330dec_attach_in_progress_locked(cs);331mutex_unlock(&cpuset_mutex);332}333334static inline bool cpuset_v2(void)335{336return !IS_ENABLED(CONFIG_CPUSETS_V1) ||337cgroup_subsys_on_dfl(cpuset_cgrp_subsys);338}339340/*341* Cgroup v2 behavior is used on the "cpus" and "mems" control files when342* on default hierarchy or when the cpuset_v2_mode flag is set by mounting343* the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.344* With v2 behavior, "cpus" and "mems" are always what the users have345* requested and won't be changed by hotplug events. Only the effective346* cpus or mems will be affected.347*/348static inline bool is_in_v2_mode(void)349{350return cpuset_v2() ||351(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);352}353354/**355* partition_is_populated - check if partition has tasks356* @cs: partition root to be checked357* @excluded_child: a child cpuset to be excluded in task checking358* Return: true if there are tasks, false otherwise359*360* It is assumed that @cs is a valid partition root. @excluded_child should361* be non-NULL when this cpuset is going to become a partition itself.362*/363static inline bool partition_is_populated(struct cpuset *cs,364struct cpuset *excluded_child)365{366struct cgroup_subsys_state *css;367struct cpuset *child;368369if (cs->css.cgroup->nr_populated_csets)370return true;371if (!excluded_child && !cs->nr_subparts)372return cgroup_is_populated(cs->css.cgroup);373374rcu_read_lock();375cpuset_for_each_child(child, css, cs) {376if (child == excluded_child)377continue;378if (is_partition_valid(child))379continue;380if (cgroup_is_populated(child->css.cgroup)) {381rcu_read_unlock();382return true;383}384}385rcu_read_unlock();386return false;387}388389/*390* Return in pmask the portion of a task's cpusets's cpus_allowed that391* are online and are capable of running the task. If none are found,392* walk up the cpuset hierarchy until we find one that does have some393* appropriate cpus.394*395* One way or another, we guarantee to return some non-empty subset396* of cpu_active_mask.397*398* Call with callback_lock or cpuset_mutex held.399*/400static void guarantee_active_cpus(struct task_struct *tsk,401struct cpumask *pmask)402{403const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);404struct cpuset *cs;405406if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))407cpumask_copy(pmask, cpu_active_mask);408409rcu_read_lock();410cs = task_cs(tsk);411412while (!cpumask_intersects(cs->effective_cpus, pmask))413cs = parent_cs(cs);414415cpumask_and(pmask, pmask, cs->effective_cpus);416rcu_read_unlock();417}418419/*420* Return in *pmask the portion of a cpusets's mems_allowed that421* are online, with memory. If none are online with memory, walk422* up the cpuset hierarchy until we find one that does have some423* online mems. The top cpuset always has some mems online.424*425* One way or another, we guarantee to return some non-empty subset426* of node_states[N_MEMORY].427*428* Call with callback_lock or cpuset_mutex held.429*/430static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)431{432while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))433cs = parent_cs(cs);434nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);435}436437/**438* alloc_cpumasks - Allocate an array of cpumask variables439* @pmasks: Pointer to array of cpumask_var_t pointers440* @size: Number of cpumasks to allocate441* Return: 0 if successful, -ENOMEM otherwise.442*443* Allocates @size cpumasks and initializes them to empty. Returns 0 on444* success, -ENOMEM on allocation failure. On failure, any previously445* allocated cpumasks are freed.446*/447static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)448{449int i;450451for (i = 0; i < size; i++) {452if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {453while (--i >= 0)454free_cpumask_var(*pmasks[i]);455return -ENOMEM;456}457}458return 0;459}460461/**462* alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.463* @tmp: Pointer to tmpmasks structure to populate464* Return: 0 on success, -ENOMEM on allocation failure465*/466static inline int alloc_tmpmasks(struct tmpmasks *tmp)467{468/*469* Array of pointers to the three cpumask_var_t fields in tmpmasks.470* Note: Array size must match actual number of masks (3)471*/472cpumask_var_t *pmask[3] = {473&tmp->new_cpus,474&tmp->addmask,475&tmp->delmask476};477478return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));479}480481/**482* free_tmpmasks - free cpumasks in a tmpmasks structure483* @tmp: the tmpmasks structure pointer484*/485static inline void free_tmpmasks(struct tmpmasks *tmp)486{487if (!tmp)488return;489490free_cpumask_var(tmp->new_cpus);491free_cpumask_var(tmp->addmask);492free_cpumask_var(tmp->delmask);493}494495/**496* dup_or_alloc_cpuset - Duplicate or allocate a new cpuset497* @cs: Source cpuset to duplicate (NULL for a fresh allocation)498*499* Creates a new cpuset by either:500* 1. Duplicating an existing cpuset (if @cs is non-NULL), or501* 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)502*503* Return: Pointer to newly allocated cpuset on success, NULL on failure504*/505static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)506{507struct cpuset *trial;508509/* Allocate base structure */510trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :511kzalloc(sizeof(*cs), GFP_KERNEL);512if (!trial)513return NULL;514515/* Setup cpumask pointer array */516cpumask_var_t *pmask[4] = {517&trial->cpus_allowed,518&trial->effective_cpus,519&trial->effective_xcpus,520&trial->exclusive_cpus521};522523if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {524kfree(trial);525return NULL;526}527528/* Copy masks if duplicating */529if (cs) {530cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);531cpumask_copy(trial->effective_cpus, cs->effective_cpus);532cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);533cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);534}535536return trial;537}538539/**540* free_cpuset - free the cpuset541* @cs: the cpuset to be freed542*/543static inline void free_cpuset(struct cpuset *cs)544{545free_cpumask_var(cs->cpus_allowed);546free_cpumask_var(cs->effective_cpus);547free_cpumask_var(cs->effective_xcpus);548free_cpumask_var(cs->exclusive_cpus);549kfree(cs);550}551552/* Return user specified exclusive CPUs */553static inline struct cpumask *user_xcpus(struct cpuset *cs)554{555return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed556: cs->exclusive_cpus;557}558559static inline bool xcpus_empty(struct cpuset *cs)560{561return cpumask_empty(cs->cpus_allowed) &&562cpumask_empty(cs->exclusive_cpus);563}564565/*566* cpusets_are_exclusive() - check if two cpusets are exclusive567*568* Return true if exclusive, false if not569*/570static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)571{572struct cpumask *xcpus1 = user_xcpus(cs1);573struct cpumask *xcpus2 = user_xcpus(cs2);574575if (cpumask_intersects(xcpus1, xcpus2))576return false;577return true;578}579580/**581* cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts582* @cs1: first cpuset to check583* @cs2: second cpuset to check584*585* Returns: true if CPU exclusivity conflict exists, false otherwise586*587* Conflict detection rules:588* 1. If either cpuset is CPU exclusive, they must be mutually exclusive589* 2. exclusive_cpus masks cannot intersect between cpusets590* 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs591*/592static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)593{594/* If either cpuset is exclusive, check if they are mutually exclusive */595if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))596return !cpusets_are_exclusive(cs1, cs2);597598/* Exclusive_cpus cannot intersect */599if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))600return true;601602/* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */603if (!cpumask_empty(cs1->cpus_allowed) &&604cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))605return true;606607if (!cpumask_empty(cs2->cpus_allowed) &&608cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))609return true;610611return false;612}613614static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)615{616if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))617return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);618return false;619}620621/*622* validate_change() - Used to validate that any proposed cpuset change623* follows the structural rules for cpusets.624*625* If we replaced the flag and mask values of the current cpuset626* (cur) with those values in the trial cpuset (trial), would627* our various subset and exclusive rules still be valid? Presumes628* cpuset_mutex held.629*630* 'cur' is the address of an actual, in-use cpuset. Operations631* such as list traversal that depend on the actual address of the632* cpuset in the list must use cur below, not trial.633*634* 'trial' is the address of bulk structure copy of cur, with635* perhaps one or more of the fields cpus_allowed, mems_allowed,636* or flags changed to new, trial values.637*638* Return 0 if valid, -errno if not.639*/640641static int validate_change(struct cpuset *cur, struct cpuset *trial)642{643struct cgroup_subsys_state *css;644struct cpuset *c, *par;645int ret = 0;646647rcu_read_lock();648649if (!is_in_v2_mode())650ret = cpuset1_validate_change(cur, trial);651if (ret)652goto out;653654/* Remaining checks don't apply to root cpuset */655if (cur == &top_cpuset)656goto out;657658par = parent_cs(cur);659660/*661* Cpusets with tasks - existing or newly being attached - can't662* be changed to have empty cpus_allowed or mems_allowed.663*/664ret = -ENOSPC;665if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {666if (!cpumask_empty(cur->cpus_allowed) &&667cpumask_empty(trial->cpus_allowed))668goto out;669if (!nodes_empty(cur->mems_allowed) &&670nodes_empty(trial->mems_allowed))671goto out;672}673674/*675* We can't shrink if we won't have enough room for SCHED_DEADLINE676* tasks. This check is not done when scheduling is disabled as the677* users should know what they are doing.678*679* For v1, effective_cpus == cpus_allowed & user_xcpus() returns680* cpus_allowed.681*682* For v2, is_cpu_exclusive() & is_sched_load_balance() are true only683* for non-isolated partition root. At this point, the target684* effective_cpus isn't computed yet. user_xcpus() is the best685* approximation.686*687* TBD: May need to precompute the real effective_cpus here in case688* incorrect scheduling of SCHED_DEADLINE tasks in a partition689* becomes an issue.690*/691ret = -EBUSY;692if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&693!cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))694goto out;695696/*697* If either I or some sibling (!= me) is exclusive, we can't698* overlap. exclusive_cpus cannot overlap with each other if set.699*/700ret = -EINVAL;701cpuset_for_each_child(c, css, par) {702if (c == cur)703continue;704if (cpus_excl_conflict(trial, c))705goto out;706if (mems_excl_conflict(trial, c))707goto out;708}709710ret = 0;711out:712rcu_read_unlock();713return ret;714}715716#ifdef CONFIG_SMP717/*718* Helper routine for generate_sched_domains().719* Do cpusets a, b have overlapping effective cpus_allowed masks?720*/721static int cpusets_overlap(struct cpuset *a, struct cpuset *b)722{723return cpumask_intersects(a->effective_cpus, b->effective_cpus);724}725726static void727update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)728{729if (dattr->relax_domain_level < c->relax_domain_level)730dattr->relax_domain_level = c->relax_domain_level;731return;732}733734static void update_domain_attr_tree(struct sched_domain_attr *dattr,735struct cpuset *root_cs)736{737struct cpuset *cp;738struct cgroup_subsys_state *pos_css;739740rcu_read_lock();741cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {742/* skip the whole subtree if @cp doesn't have any CPU */743if (cpumask_empty(cp->cpus_allowed)) {744pos_css = css_rightmost_descendant(pos_css);745continue;746}747748if (is_sched_load_balance(cp))749update_domain_attr(dattr, cp);750}751rcu_read_unlock();752}753754/* Must be called with cpuset_mutex held. */755static inline int nr_cpusets(void)756{757/* jump label reference count + the top-level cpuset */758return static_key_count(&cpusets_enabled_key.key) + 1;759}760761/*762* generate_sched_domains()763*764* This function builds a partial partition of the systems CPUs765* A 'partial partition' is a set of non-overlapping subsets whose766* union is a subset of that set.767* The output of this function needs to be passed to kernel/sched/core.c768* partition_sched_domains() routine, which will rebuild the scheduler's769* load balancing domains (sched domains) as specified by that partial770* partition.771*772* See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst773* for a background explanation of this.774*775* Does not return errors, on the theory that the callers of this776* routine would rather not worry about failures to rebuild sched777* domains when operating in the severe memory shortage situations778* that could cause allocation failures below.779*780* Must be called with cpuset_mutex held.781*782* The three key local variables below are:783* cp - cpuset pointer, used (together with pos_css) to perform a784* top-down scan of all cpusets. For our purposes, rebuilding785* the schedulers sched domains, we can ignore !is_sched_load_786* balance cpusets.787* csa - (for CpuSet Array) Array of pointers to all the cpusets788* that need to be load balanced, for convenient iterative789* access by the subsequent code that finds the best partition,790* i.e the set of domains (subsets) of CPUs such that the791* cpus_allowed of every cpuset marked is_sched_load_balance792* is a subset of one of these domains, while there are as793* many such domains as possible, each as small as possible.794* doms - Conversion of 'csa' to an array of cpumasks, for passing to795* the kernel/sched/core.c routine partition_sched_domains() in a796* convenient format, that can be easily compared to the prior797* value to determine what partition elements (sched domains)798* were changed (added or removed.)799*800* Finding the best partition (set of domains):801* The double nested loops below over i, j scan over the load802* balanced cpusets (using the array of cpuset pointers in csa[])803* looking for pairs of cpusets that have overlapping cpus_allowed804* and merging them using a union-find algorithm.805*806* The union of the cpus_allowed masks from the set of all cpusets807* having the same root then form the one element of the partition808* (one sched domain) to be passed to partition_sched_domains().809*810*/811static int generate_sched_domains(cpumask_var_t **domains,812struct sched_domain_attr **attributes)813{814struct cpuset *cp; /* top-down scan of cpusets */815struct cpuset **csa; /* array of all cpuset ptrs */816int csn; /* how many cpuset ptrs in csa so far */817int i, j; /* indices for partition finding loops */818cpumask_var_t *doms; /* resulting partition; i.e. sched domains */819struct sched_domain_attr *dattr; /* attributes for custom domains */820int ndoms = 0; /* number of sched domains in result */821int nslot; /* next empty doms[] struct cpumask slot */822struct cgroup_subsys_state *pos_css;823bool root_load_balance = is_sched_load_balance(&top_cpuset);824bool cgrpv2 = cpuset_v2();825int nslot_update;826827doms = NULL;828dattr = NULL;829csa = NULL;830831/* Special case for the 99% of systems with one, full, sched domain */832if (root_load_balance && cpumask_empty(subpartitions_cpus)) {833single_root_domain:834ndoms = 1;835doms = alloc_sched_domains(ndoms);836if (!doms)837goto done;838839dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);840if (dattr) {841*dattr = SD_ATTR_INIT;842update_domain_attr_tree(dattr, &top_cpuset);843}844cpumask_and(doms[0], top_cpuset.effective_cpus,845housekeeping_cpumask(HK_TYPE_DOMAIN));846847goto done;848}849850csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);851if (!csa)852goto done;853csn = 0;854855rcu_read_lock();856if (root_load_balance)857csa[csn++] = &top_cpuset;858cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {859if (cp == &top_cpuset)860continue;861862if (cgrpv2)863goto v2;864865/*866* v1:867* Continue traversing beyond @cp iff @cp has some CPUs and868* isn't load balancing. The former is obvious. The869* latter: All child cpusets contain a subset of the870* parent's cpus, so just skip them, and then we call871* update_domain_attr_tree() to calc relax_domain_level of872* the corresponding sched domain.873*/874if (!cpumask_empty(cp->cpus_allowed) &&875!(is_sched_load_balance(cp) &&876cpumask_intersects(cp->cpus_allowed,877housekeeping_cpumask(HK_TYPE_DOMAIN))))878continue;879880if (is_sched_load_balance(cp) &&881!cpumask_empty(cp->effective_cpus))882csa[csn++] = cp;883884/* skip @cp's subtree */885pos_css = css_rightmost_descendant(pos_css);886continue;887888v2:889/*890* Only valid partition roots that are not isolated and with891* non-empty effective_cpus will be saved into csn[].892*/893if ((cp->partition_root_state == PRS_ROOT) &&894!cpumask_empty(cp->effective_cpus))895csa[csn++] = cp;896897/*898* Skip @cp's subtree if not a partition root and has no899* exclusive CPUs to be granted to child cpusets.900*/901if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))902pos_css = css_rightmost_descendant(pos_css);903}904rcu_read_unlock();905906/*907* If there are only isolated partitions underneath the cgroup root,908* we can optimize out unneeded sched domains scanning.909*/910if (root_load_balance && (csn == 1))911goto single_root_domain;912913for (i = 0; i < csn; i++)914uf_node_init(&csa[i]->node);915916/* Merge overlapping cpusets */917for (i = 0; i < csn; i++) {918for (j = i + 1; j < csn; j++) {919if (cpusets_overlap(csa[i], csa[j])) {920/*921* Cgroup v2 shouldn't pass down overlapping922* partition root cpusets.923*/924WARN_ON_ONCE(cgrpv2);925uf_union(&csa[i]->node, &csa[j]->node);926}927}928}929930/* Count the total number of domains */931for (i = 0; i < csn; i++) {932if (uf_find(&csa[i]->node) == &csa[i]->node)933ndoms++;934}935936/*937* Now we know how many domains to create.938* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.939*/940doms = alloc_sched_domains(ndoms);941if (!doms)942goto done;943944/*945* The rest of the code, including the scheduler, can deal with946* dattr==NULL case. No need to abort if alloc fails.947*/948dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),949GFP_KERNEL);950951/*952* Cgroup v2 doesn't support domain attributes, just set all of them953* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a954* subset of HK_TYPE_DOMAIN housekeeping CPUs.955*/956if (cgrpv2) {957for (i = 0; i < ndoms; i++) {958/*959* The top cpuset may contain some boot time isolated960* CPUs that need to be excluded from the sched domain.961*/962if (csa[i] == &top_cpuset)963cpumask_and(doms[i], csa[i]->effective_cpus,964housekeeping_cpumask(HK_TYPE_DOMAIN));965else966cpumask_copy(doms[i], csa[i]->effective_cpus);967if (dattr)968dattr[i] = SD_ATTR_INIT;969}970goto done;971}972973for (nslot = 0, i = 0; i < csn; i++) {974nslot_update = 0;975for (j = i; j < csn; j++) {976if (uf_find(&csa[j]->node) == &csa[i]->node) {977struct cpumask *dp = doms[nslot];978979if (i == j) {980nslot_update = 1;981cpumask_clear(dp);982if (dattr)983*(dattr + nslot) = SD_ATTR_INIT;984}985cpumask_or(dp, dp, csa[j]->effective_cpus);986cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));987if (dattr)988update_domain_attr_tree(dattr + nslot, csa[j]);989}990}991if (nslot_update)992nslot++;993}994BUG_ON(nslot != ndoms);995996done:997kfree(csa);998999/*1000* Fallback to the default domain if kmalloc() failed.1001* See comments in partition_sched_domains().1002*/1003if (doms == NULL)1004ndoms = 1;10051006*domains = doms;1007*attributes = dattr;1008return ndoms;1009}10101011static void dl_update_tasks_root_domain(struct cpuset *cs)1012{1013struct css_task_iter it;1014struct task_struct *task;10151016if (cs->nr_deadline_tasks == 0)1017return;10181019css_task_iter_start(&cs->css, 0, &it);10201021while ((task = css_task_iter_next(&it)))1022dl_add_task_root_domain(task);10231024css_task_iter_end(&it);1025}10261027void dl_rebuild_rd_accounting(void)1028{1029struct cpuset *cs = NULL;1030struct cgroup_subsys_state *pos_css;1031int cpu;1032u64 cookie = ++dl_cookie;10331034lockdep_assert_held(&cpuset_mutex);1035lockdep_assert_cpus_held();1036lockdep_assert_held(&sched_domains_mutex);10371038rcu_read_lock();10391040for_each_possible_cpu(cpu) {1041if (dl_bw_visited(cpu, cookie))1042continue;10431044dl_clear_root_domain_cpu(cpu);1045}10461047cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {10481049if (cpumask_empty(cs->effective_cpus)) {1050pos_css = css_rightmost_descendant(pos_css);1051continue;1052}10531054css_get(&cs->css);10551056rcu_read_unlock();10571058dl_update_tasks_root_domain(cs);10591060rcu_read_lock();1061css_put(&cs->css);1062}1063rcu_read_unlock();1064}10651066/*1067* Rebuild scheduler domains.1068*1069* If the flag 'sched_load_balance' of any cpuset with non-empty1070* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset1071* which has that flag enabled, or if any cpuset with a non-empty1072* 'cpus' is removed, then call this routine to rebuild the1073* scheduler's dynamic sched domains.1074*1075* Call with cpuset_mutex held. Takes cpus_read_lock().1076*/1077void rebuild_sched_domains_locked(void)1078{1079struct cgroup_subsys_state *pos_css;1080struct sched_domain_attr *attr;1081cpumask_var_t *doms;1082struct cpuset *cs;1083int ndoms;10841085lockdep_assert_cpus_held();1086lockdep_assert_held(&cpuset_mutex);1087force_sd_rebuild = false;10881089/*1090* If we have raced with CPU hotplug, return early to avoid1091* passing doms with offlined cpu to partition_sched_domains().1092* Anyways, cpuset_handle_hotplug() will rebuild sched domains.1093*1094* With no CPUs in any subpartitions, top_cpuset's effective CPUs1095* should be the same as the active CPUs, so checking only top_cpuset1096* is enough to detect racing CPU offlines.1097*/1098if (cpumask_empty(subpartitions_cpus) &&1099!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))1100return;11011102/*1103* With subpartition CPUs, however, the effective CPUs of a partition1104* root should be only a subset of the active CPUs. Since a CPU in any1105* partition root could be offlined, all must be checked.1106*/1107if (!cpumask_empty(subpartitions_cpus)) {1108rcu_read_lock();1109cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {1110if (!is_partition_valid(cs)) {1111pos_css = css_rightmost_descendant(pos_css);1112continue;1113}1114if (!cpumask_subset(cs->effective_cpus,1115cpu_active_mask)) {1116rcu_read_unlock();1117return;1118}1119}1120rcu_read_unlock();1121}11221123/* Generate domain masks and attrs */1124ndoms = generate_sched_domains(&doms, &attr);11251126/* Have scheduler rebuild the domains */1127partition_sched_domains(ndoms, doms, attr);1128}1129#else /* !CONFIG_SMP */1130void rebuild_sched_domains_locked(void)1131{1132}1133#endif /* CONFIG_SMP */11341135static void rebuild_sched_domains_cpuslocked(void)1136{1137mutex_lock(&cpuset_mutex);1138rebuild_sched_domains_locked();1139mutex_unlock(&cpuset_mutex);1140}11411142void rebuild_sched_domains(void)1143{1144cpus_read_lock();1145rebuild_sched_domains_cpuslocked();1146cpus_read_unlock();1147}11481149void cpuset_reset_sched_domains(void)1150{1151mutex_lock(&cpuset_mutex);1152partition_sched_domains(1, NULL, NULL);1153mutex_unlock(&cpuset_mutex);1154}11551156/**1157* cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.1158* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed1159* @new_cpus: the temp variable for the new effective_cpus mask1160*1161* Iterate through each task of @cs updating its cpus_allowed to the1162* effective cpuset's. As this function is called with cpuset_mutex held,1163* cpuset membership stays stable.1164*1165* For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus1166* to make sure all offline CPUs are also included as hotplug code won't1167* update cpumasks for tasks in top_cpuset.1168*1169* As task_cpu_possible_mask() can be task dependent in arm64, we have to1170* do cpu masking per task instead of doing it once for all.1171*/1172void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)1173{1174struct css_task_iter it;1175struct task_struct *task;1176bool top_cs = cs == &top_cpuset;11771178css_task_iter_start(&cs->css, 0, &it);1179while ((task = css_task_iter_next(&it))) {1180const struct cpumask *possible_mask = task_cpu_possible_mask(task);11811182if (top_cs) {1183/*1184* PF_NO_SETAFFINITY tasks are ignored.1185* All per cpu kthreads should have PF_NO_SETAFFINITY1186* flag set, see kthread_set_per_cpu().1187*/1188if (task->flags & PF_NO_SETAFFINITY)1189continue;1190cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);1191} else {1192cpumask_and(new_cpus, possible_mask, cs->effective_cpus);1193}1194set_cpus_allowed_ptr(task, new_cpus);1195}1196css_task_iter_end(&it);1197}11981199/**1200* compute_effective_cpumask - Compute the effective cpumask of the cpuset1201* @new_cpus: the temp variable for the new effective_cpus mask1202* @cs: the cpuset the need to recompute the new effective_cpus mask1203* @parent: the parent cpuset1204*1205* The result is valid only if the given cpuset isn't a partition root.1206*/1207static void compute_effective_cpumask(struct cpumask *new_cpus,1208struct cpuset *cs, struct cpuset *parent)1209{1210cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);1211}12121213/*1214* Commands for update_parent_effective_cpumask1215*/1216enum partition_cmd {1217partcmd_enable, /* Enable partition root */1218partcmd_enablei, /* Enable isolated partition root */1219partcmd_disable, /* Disable partition root */1220partcmd_update, /* Update parent's effective_cpus */1221partcmd_invalidate, /* Make partition invalid */1222};12231224static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,1225struct tmpmasks *tmp);12261227/*1228* Update partition exclusive flag1229*1230* Return: 0 if successful, an error code otherwise1231*/1232static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs)1233{1234bool exclusive = (new_prs > PRS_MEMBER);12351236if (exclusive && !is_cpu_exclusive(cs)) {1237if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))1238return PERR_NOTEXCL;1239} else if (!exclusive && is_cpu_exclusive(cs)) {1240/* Turning off CS_CPU_EXCLUSIVE will not return error */1241cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);1242}1243return 0;1244}12451246/*1247* Update partition load balance flag and/or rebuild sched domain1248*1249* Changing load balance flag will automatically call1250* rebuild_sched_domains_locked().1251* This function is for cgroup v2 only.1252*/1253static void update_partition_sd_lb(struct cpuset *cs, int old_prs)1254{1255int new_prs = cs->partition_root_state;1256bool rebuild_domains = (new_prs > 0) || (old_prs > 0);1257bool new_lb;12581259/*1260* If cs is not a valid partition root, the load balance state1261* will follow its parent.1262*/1263if (new_prs > 0) {1264new_lb = (new_prs != PRS_ISOLATED);1265} else {1266new_lb = is_sched_load_balance(parent_cs(cs));1267}1268if (new_lb != !!is_sched_load_balance(cs)) {1269rebuild_domains = true;1270if (new_lb)1271set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);1272else1273clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);1274}12751276if (rebuild_domains)1277cpuset_force_rebuild();1278}12791280/*1281* tasks_nocpu_error - Return true if tasks will have no effective_cpus1282*/1283static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,1284struct cpumask *xcpus)1285{1286/*1287* A populated partition (cs or parent) can't have empty effective_cpus1288*/1289return (cpumask_subset(parent->effective_cpus, xcpus) &&1290partition_is_populated(parent, cs)) ||1291(!cpumask_intersects(xcpus, cpu_active_mask) &&1292partition_is_populated(cs, NULL));1293}12941295static void reset_partition_data(struct cpuset *cs)1296{1297struct cpuset *parent = parent_cs(cs);12981299if (!cpuset_v2())1300return;13011302lockdep_assert_held(&callback_lock);13031304cs->nr_subparts = 0;1305if (cpumask_empty(cs->exclusive_cpus)) {1306cpumask_clear(cs->effective_xcpus);1307if (is_cpu_exclusive(cs))1308clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);1309}1310if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))1311cpumask_copy(cs->effective_cpus, parent->effective_cpus);1312}13131314/*1315* isolated_cpus_update - Update the isolated_cpus mask1316* @old_prs: old partition_root_state1317* @new_prs: new partition_root_state1318* @xcpus: exclusive CPUs with state change1319*/1320static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)1321{1322WARN_ON_ONCE(old_prs == new_prs);1323if (new_prs == PRS_ISOLATED)1324cpumask_or(isolated_cpus, isolated_cpus, xcpus);1325else1326cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);1327}13281329/*1330* partition_xcpus_add - Add new exclusive CPUs to partition1331* @new_prs: new partition_root_state1332* @parent: parent cpuset1333* @xcpus: exclusive CPUs to be added1334* Return: true if isolated_cpus modified, false otherwise1335*1336* Remote partition if parent == NULL1337*/1338static bool partition_xcpus_add(int new_prs, struct cpuset *parent,1339struct cpumask *xcpus)1340{1341bool isolcpus_updated;13421343WARN_ON_ONCE(new_prs < 0);1344lockdep_assert_held(&callback_lock);1345if (!parent)1346parent = &top_cpuset;134713481349if (parent == &top_cpuset)1350cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);13511352isolcpus_updated = (new_prs != parent->partition_root_state);1353if (isolcpus_updated)1354isolated_cpus_update(parent->partition_root_state, new_prs,1355xcpus);13561357cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);1358return isolcpus_updated;1359}13601361/*1362* partition_xcpus_del - Remove exclusive CPUs from partition1363* @old_prs: old partition_root_state1364* @parent: parent cpuset1365* @xcpus: exclusive CPUs to be removed1366* Return: true if isolated_cpus modified, false otherwise1367*1368* Remote partition if parent == NULL1369*/1370static bool partition_xcpus_del(int old_prs, struct cpuset *parent,1371struct cpumask *xcpus)1372{1373bool isolcpus_updated;13741375WARN_ON_ONCE(old_prs < 0);1376lockdep_assert_held(&callback_lock);1377if (!parent)1378parent = &top_cpuset;13791380if (parent == &top_cpuset)1381cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);13821383isolcpus_updated = (old_prs != parent->partition_root_state);1384if (isolcpus_updated)1385isolated_cpus_update(old_prs, parent->partition_root_state,1386xcpus);13871388cpumask_and(xcpus, xcpus, cpu_active_mask);1389cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);1390return isolcpus_updated;1391}13921393static void update_unbound_workqueue_cpumask(bool isolcpus_updated)1394{1395int ret;13961397lockdep_assert_cpus_held();13981399if (!isolcpus_updated)1400return;14011402ret = workqueue_unbound_exclude_cpumask(isolated_cpus);1403WARN_ON_ONCE(ret < 0);1404}14051406/**1407* cpuset_cpu_is_isolated - Check if the given CPU is isolated1408* @cpu: the CPU number to be checked1409* Return: true if CPU is used in an isolated partition, false otherwise1410*/1411bool cpuset_cpu_is_isolated(int cpu)1412{1413return cpumask_test_cpu(cpu, isolated_cpus);1414}1415EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);14161417/**1418* rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets1419* @parent: Parent cpuset containing all siblings1420* @cs: Current cpuset (will be skipped)1421* @excpus: exclusive effective CPU mask to modify1422*1423* This function ensures the given @excpus mask doesn't include any CPUs that1424* are exclusively allocated to sibling cpusets. It walks through all siblings1425* of @cs under @parent and removes their exclusive CPUs from @excpus.1426*/1427static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,1428struct cpumask *excpus)1429{1430struct cgroup_subsys_state *css;1431struct cpuset *sibling;1432int retval = 0;14331434if (cpumask_empty(excpus))1435return retval;14361437/*1438* Exclude exclusive CPUs from siblings1439*/1440rcu_read_lock();1441cpuset_for_each_child(sibling, css, parent) {1442if (sibling == cs)1443continue;14441445if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {1446cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);1447retval++;1448continue;1449}1450if (cpumask_intersects(excpus, sibling->effective_xcpus)) {1451cpumask_andnot(excpus, excpus, sibling->effective_xcpus);1452retval++;1453}1454}1455rcu_read_unlock();14561457return retval;1458}14591460/*1461* compute_excpus - compute effective exclusive CPUs1462* @cs: cpuset1463* @xcpus: effective exclusive CPUs value to be set1464* Return: 0 if there is no sibling conflict, > 0 otherwise1465*1466* If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets1467* and exclude their exclusive_cpus or effective_xcpus as well.1468*/1469static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)1470{1471struct cpuset *parent = parent_cs(cs);14721473cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);14741475if (!cpumask_empty(cs->exclusive_cpus))1476return 0;14771478return rm_siblings_excl_cpus(parent, cs, excpus);1479}14801481/*1482* compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset1483* @trialcs: The trial cpuset containing the proposed new configuration1484* @cs: The original cpuset that the trial configuration is based on1485* Return: 0 if successful with no sibling conflict, >0 if a conflict is found1486*1487* Computes the effective_xcpus for a trial configuration. @cs is provided to represent1488* the real cs.1489*/1490static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)1491{1492struct cpuset *parent = parent_cs(trialcs);1493struct cpumask *excpus = trialcs->effective_xcpus;14941495/* trialcs is member, cpuset.cpus has no impact to excpus */1496if (cs_is_member(cs))1497cpumask_and(excpus, trialcs->exclusive_cpus,1498parent->effective_xcpus);1499else1500cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);15011502return rm_siblings_excl_cpus(parent, cs, excpus);1503}15041505static inline bool is_remote_partition(struct cpuset *cs)1506{1507return !list_empty(&cs->remote_sibling);1508}15091510static inline bool is_local_partition(struct cpuset *cs)1511{1512return is_partition_valid(cs) && !is_remote_partition(cs);1513}15141515/*1516* remote_partition_enable - Enable current cpuset as a remote partition root1517* @cs: the cpuset to update1518* @new_prs: new partition_root_state1519* @tmp: temporary masks1520* Return: 0 if successful, errcode if error1521*1522* Enable the current cpuset to become a remote partition root taking CPUs1523* directly from the top cpuset. cpuset_mutex must be held by the caller.1524*/1525static int remote_partition_enable(struct cpuset *cs, int new_prs,1526struct tmpmasks *tmp)1527{1528bool isolcpus_updated;15291530/*1531* The user must have sysadmin privilege.1532*/1533if (!capable(CAP_SYS_ADMIN))1534return PERR_ACCESS;15351536/*1537* The requested exclusive_cpus must not be allocated to other1538* partitions and it can't use up all the root's effective_cpus.1539*1540* The effective_xcpus mask can contain offline CPUs, but there must1541* be at least one or more online CPUs present before it can be enabled.1542*1543* Note that creating a remote partition with any local partition root1544* above it or remote partition root underneath it is not allowed.1545*/1546compute_excpus(cs, tmp->new_cpus);1547WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));1548if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||1549cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))1550return PERR_INVCPUS;15511552spin_lock_irq(&callback_lock);1553isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);1554list_add(&cs->remote_sibling, &remote_children);1555cpumask_copy(cs->effective_xcpus, tmp->new_cpus);1556spin_unlock_irq(&callback_lock);1557update_unbound_workqueue_cpumask(isolcpus_updated);1558cpuset_force_rebuild();1559cs->prs_err = 0;15601561/*1562* Propagate changes in top_cpuset's effective_cpus down the hierarchy.1563*/1564cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);1565update_sibling_cpumasks(&top_cpuset, NULL, tmp);1566return 0;1567}15681569/*1570* remote_partition_disable - Remove current cpuset from remote partition list1571* @cs: the cpuset to update1572* @tmp: temporary masks1573*1574* The effective_cpus is also updated.1575*1576* cpuset_mutex must be held by the caller.1577*/1578static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)1579{1580bool isolcpus_updated;15811582WARN_ON_ONCE(!is_remote_partition(cs));1583WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));15841585spin_lock_irq(&callback_lock);1586list_del_init(&cs->remote_sibling);1587isolcpus_updated = partition_xcpus_del(cs->partition_root_state,1588NULL, cs->effective_xcpus);1589if (cs->prs_err)1590cs->partition_root_state = -cs->partition_root_state;1591else1592cs->partition_root_state = PRS_MEMBER;15931594/* effective_xcpus may need to be changed */1595compute_excpus(cs, cs->effective_xcpus);1596reset_partition_data(cs);1597spin_unlock_irq(&callback_lock);1598update_unbound_workqueue_cpumask(isolcpus_updated);1599cpuset_force_rebuild();16001601/*1602* Propagate changes in top_cpuset's effective_cpus down the hierarchy.1603*/1604cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);1605update_sibling_cpumasks(&top_cpuset, NULL, tmp);1606}16071608/*1609* remote_cpus_update - cpus_exclusive change of remote partition1610* @cs: the cpuset to be updated1611* @xcpus: the new exclusive_cpus mask, if non-NULL1612* @excpus: the new effective_xcpus mask1613* @tmp: temporary masks1614*1615* top_cpuset and subpartitions_cpus will be updated or partition can be1616* invalidated.1617*/1618static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,1619struct cpumask *excpus, struct tmpmasks *tmp)1620{1621bool adding, deleting;1622int prs = cs->partition_root_state;1623int isolcpus_updated = 0;16241625if (WARN_ON_ONCE(!is_remote_partition(cs)))1626return;16271628WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));16291630if (cpumask_empty(excpus)) {1631cs->prs_err = PERR_CPUSEMPTY;1632goto invalidate;1633}16341635adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);1636deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);16371638/*1639* Additions of remote CPUs is only allowed if those CPUs are1640* not allocated to other partitions and there are effective_cpus1641* left in the top cpuset.1642*/1643if (adding) {1644WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));1645if (!capable(CAP_SYS_ADMIN))1646cs->prs_err = PERR_ACCESS;1647else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||1648cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))1649cs->prs_err = PERR_NOCPUS;1650if (cs->prs_err)1651goto invalidate;1652}16531654spin_lock_irq(&callback_lock);1655if (adding)1656isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);1657if (deleting)1658isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);1659/*1660* Need to update effective_xcpus and exclusive_cpus now as1661* update_sibling_cpumasks() below may iterate back to the same cs.1662*/1663cpumask_copy(cs->effective_xcpus, excpus);1664if (xcpus)1665cpumask_copy(cs->exclusive_cpus, xcpus);1666spin_unlock_irq(&callback_lock);1667update_unbound_workqueue_cpumask(isolcpus_updated);1668if (adding || deleting)1669cpuset_force_rebuild();16701671/*1672* Propagate changes in top_cpuset's effective_cpus down the hierarchy.1673*/1674cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);1675update_sibling_cpumasks(&top_cpuset, NULL, tmp);1676return;16771678invalidate:1679remote_partition_disable(cs, tmp);1680}16811682/*1683* prstate_housekeeping_conflict - check for partition & housekeeping conflicts1684* @prstate: partition root state to be checked1685* @new_cpus: cpu mask1686* Return: true if there is conflict, false otherwise1687*1688* CPUs outside of boot_hk_cpus, if defined, can only be used in an1689* isolated partition.1690*/1691static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)1692{1693if (!have_boot_isolcpus)1694return false;16951696if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))1697return true;16981699return false;1700}17011702/**1703* update_parent_effective_cpumask - update effective_cpus mask of parent cpuset1704* @cs: The cpuset that requests change in partition root state1705* @cmd: Partition root state change command1706* @newmask: Optional new cpumask for partcmd_update1707* @tmp: Temporary addmask and delmask1708* Return: 0 or a partition root state error code1709*1710* For partcmd_enable*, the cpuset is being transformed from a non-partition1711* root to a partition root. The effective_xcpus (cpus_allowed if1712* effective_xcpus not set) mask of the given cpuset will be taken away from1713* parent's effective_cpus. The function will return 0 if all the CPUs listed1714* in effective_xcpus can be granted or an error code will be returned.1715*1716* For partcmd_disable, the cpuset is being transformed from a partition1717* root back to a non-partition root. Any CPUs in effective_xcpus will be1718* given back to parent's effective_cpus. 0 will always be returned.1719*1720* For partcmd_update, if the optional newmask is specified, the cpu list is1721* to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is1722* assumed to remain the same. The cpuset should either be a valid or invalid1723* partition root. The partition root state may change from valid to invalid1724* or vice versa. An error code will be returned if transitioning from1725* invalid to valid violates the exclusivity rule.1726*1727* For partcmd_invalidate, the current partition will be made invalid.1728*1729* The partcmd_enable* and partcmd_disable commands are used by1730* update_prstate(). An error code may be returned and the caller will check1731* for error.1732*1733* The partcmd_update command is used by update_cpumasks_hier() with newmask1734* NULL and update_cpumask() with newmask set. The partcmd_invalidate is used1735* by update_cpumask() with NULL newmask. In both cases, the callers won't1736* check for error and so partition_root_state and prs_err will be updated1737* directly.1738*/1739static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,1740struct cpumask *newmask,1741struct tmpmasks *tmp)1742{1743struct cpuset *parent = parent_cs(cs);1744int adding; /* Adding cpus to parent's effective_cpus */1745int deleting; /* Deleting cpus from parent's effective_cpus */1746int old_prs, new_prs;1747int part_error = PERR_NONE; /* Partition error? */1748int subparts_delta = 0;1749int isolcpus_updated = 0;1750struct cpumask *xcpus = user_xcpus(cs);1751bool nocpu;17521753lockdep_assert_held(&cpuset_mutex);1754WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */17551756/*1757* new_prs will only be changed for the partcmd_update and1758* partcmd_invalidate commands.1759*/1760adding = deleting = false;1761old_prs = new_prs = cs->partition_root_state;17621763if (cmd == partcmd_invalidate) {1764if (is_partition_invalid(cs))1765return 0;17661767/*1768* Make the current partition invalid.1769*/1770if (is_partition_valid(parent))1771adding = cpumask_and(tmp->addmask,1772xcpus, parent->effective_xcpus);1773if (old_prs > 0) {1774new_prs = -old_prs;1775subparts_delta--;1776}1777goto write_error;1778}17791780/*1781* The parent must be a partition root.1782* The new cpumask, if present, or the current cpus_allowed must1783* not be empty.1784*/1785if (!is_partition_valid(parent)) {1786return is_partition_invalid(parent)1787? PERR_INVPARENT : PERR_NOTPART;1788}1789if (!newmask && xcpus_empty(cs))1790return PERR_CPUSEMPTY;17911792nocpu = tasks_nocpu_error(parent, cs, xcpus);17931794if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {1795/*1796* Need to call compute_excpus() in case1797* exclusive_cpus not set. Sibling conflict should only happen1798* if exclusive_cpus isn't set.1799*/1800xcpus = tmp->delmask;1801if (compute_excpus(cs, xcpus))1802WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));1803new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;18041805/*1806* Enabling partition root is not allowed if its1807* effective_xcpus is empty.1808*/1809if (cpumask_empty(xcpus))1810return PERR_INVCPUS;18111812if (prstate_housekeeping_conflict(new_prs, xcpus))1813return PERR_HKEEPING;18141815if (tasks_nocpu_error(parent, cs, xcpus))1816return PERR_NOCPUS;18171818/*1819* This function will only be called when all the preliminary1820* checks have passed. At this point, the following condition1821* should hold.1822*1823* (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus1824*1825* Warn if it is not the case.1826*/1827cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);1828WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));18291830deleting = true;1831subparts_delta++;1832} else if (cmd == partcmd_disable) {1833/*1834* May need to add cpus back to parent's effective_cpus1835* (and maybe removed from subpartitions_cpus/isolated_cpus)1836* for valid partition root. xcpus may contain CPUs that1837* shouldn't be removed from the two global cpumasks.1838*/1839if (is_partition_valid(cs)) {1840cpumask_copy(tmp->addmask, cs->effective_xcpus);1841adding = true;1842subparts_delta--;1843}1844new_prs = PRS_MEMBER;1845} else if (newmask) {1846/*1847* Empty cpumask is not allowed1848*/1849if (cpumask_empty(newmask)) {1850part_error = PERR_CPUSEMPTY;1851goto write_error;1852}18531854/* Check newmask again, whether cpus are available for parent/cs */1855nocpu |= tasks_nocpu_error(parent, cs, newmask);18561857/*1858* partcmd_update with newmask:1859*1860* Compute add/delete mask to/from effective_cpus1861*1862* For valid partition:1863* addmask = exclusive_cpus & ~newmask1864* & parent->effective_xcpus1865* delmask = newmask & ~exclusive_cpus1866* & parent->effective_xcpus1867*1868* For invalid partition:1869* delmask = newmask & parent->effective_xcpus1870*/1871if (is_partition_invalid(cs)) {1872adding = false;1873deleting = cpumask_and(tmp->delmask,1874newmask, parent->effective_xcpus);1875} else {1876cpumask_andnot(tmp->addmask, xcpus, newmask);1877adding = cpumask_and(tmp->addmask, tmp->addmask,1878parent->effective_xcpus);18791880cpumask_andnot(tmp->delmask, newmask, xcpus);1881deleting = cpumask_and(tmp->delmask, tmp->delmask,1882parent->effective_xcpus);1883}1884/*1885* The new CPUs to be removed from parent's effective CPUs1886* must be present.1887*/1888if (deleting) {1889cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);1890WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));1891}18921893/*1894* Make partition invalid if parent's effective_cpus could1895* become empty and there are tasks in the parent.1896*/1897if (nocpu && (!adding ||1898!cpumask_intersects(tmp->addmask, cpu_active_mask))) {1899part_error = PERR_NOCPUS;1900deleting = false;1901adding = cpumask_and(tmp->addmask,1902xcpus, parent->effective_xcpus);1903}1904} else {1905/*1906* partcmd_update w/o newmask1907*1908* delmask = effective_xcpus & parent->effective_cpus1909*1910* This can be called from:1911* 1) update_cpumasks_hier()1912* 2) cpuset_hotplug_update_tasks()1913*1914* Check to see if it can be transitioned from valid to1915* invalid partition or vice versa.1916*1917* A partition error happens when parent has tasks and all1918* its effective CPUs will have to be distributed out.1919*/1920if (nocpu) {1921part_error = PERR_NOCPUS;1922if (is_partition_valid(cs))1923adding = cpumask_and(tmp->addmask,1924xcpus, parent->effective_xcpus);1925} else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&1926cpumask_subset(xcpus, parent->effective_xcpus)) {1927struct cgroup_subsys_state *css;1928struct cpuset *child;1929bool exclusive = true;19301931/*1932* Convert invalid partition to valid has to1933* pass the cpu exclusivity test.1934*/1935rcu_read_lock();1936cpuset_for_each_child(child, css, parent) {1937if (child == cs)1938continue;1939if (!cpusets_are_exclusive(cs, child)) {1940exclusive = false;1941break;1942}1943}1944rcu_read_unlock();1945if (exclusive)1946deleting = cpumask_and(tmp->delmask,1947xcpus, parent->effective_cpus);1948else1949part_error = PERR_NOTEXCL;1950}1951}19521953write_error:1954if (part_error)1955WRITE_ONCE(cs->prs_err, part_error);19561957if (cmd == partcmd_update) {1958/*1959* Check for possible transition between valid and invalid1960* partition root.1961*/1962switch (cs->partition_root_state) {1963case PRS_ROOT:1964case PRS_ISOLATED:1965if (part_error) {1966new_prs = -old_prs;1967subparts_delta--;1968}1969break;1970case PRS_INVALID_ROOT:1971case PRS_INVALID_ISOLATED:1972if (!part_error) {1973new_prs = -old_prs;1974subparts_delta++;1975}1976break;1977}1978}19791980if (!adding && !deleting && (new_prs == old_prs))1981return 0;19821983/*1984* Transitioning between invalid to valid or vice versa may require1985* changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,1986* validate_change() has already been successfully called and1987* CPU lists in cs haven't been updated yet. So defer it to later.1988*/1989if ((old_prs != new_prs) && (cmd != partcmd_update)) {1990int err = update_partition_exclusive_flag(cs, new_prs);19911992if (err)1993return err;1994}19951996/*1997* Change the parent's effective_cpus & effective_xcpus (top cpuset1998* only).1999*2000* Newly added CPUs will be removed from effective_cpus and2001* newly deleted ones will be added back to effective_cpus.2002*/2003spin_lock_irq(&callback_lock);2004if (old_prs != new_prs) {2005cs->partition_root_state = new_prs;2006if (new_prs <= 0)2007cs->nr_subparts = 0;2008}2009/*2010* Adding to parent's effective_cpus means deletion CPUs from cs2011* and vice versa.2012*/2013if (adding)2014isolcpus_updated += partition_xcpus_del(old_prs, parent,2015tmp->addmask);2016if (deleting)2017isolcpus_updated += partition_xcpus_add(new_prs, parent,2018tmp->delmask);20192020if (is_partition_valid(parent)) {2021parent->nr_subparts += subparts_delta;2022WARN_ON_ONCE(parent->nr_subparts < 0);2023}2024spin_unlock_irq(&callback_lock);2025update_unbound_workqueue_cpumask(isolcpus_updated);20262027if ((old_prs != new_prs) && (cmd == partcmd_update))2028update_partition_exclusive_flag(cs, new_prs);20292030if (adding || deleting) {2031cpuset_update_tasks_cpumask(parent, tmp->addmask);2032update_sibling_cpumasks(parent, cs, tmp);2033}20342035/*2036* For partcmd_update without newmask, it is being called from2037* cpuset_handle_hotplug(). Update the load balance flag and2038* scheduling domain accordingly.2039*/2040if ((cmd == partcmd_update) && !newmask)2041update_partition_sd_lb(cs, old_prs);20422043notify_partition_change(cs, old_prs);2044return 0;2045}20462047/**2048* compute_partition_effective_cpumask - compute effective_cpus for partition2049* @cs: partition root cpuset2050* @new_ecpus: previously computed effective_cpus to be updated2051*2052* Compute the effective_cpus of a partition root by scanning effective_xcpus2053* of child partition roots and excluding their effective_xcpus.2054*2055* This has the side effect of invalidating valid child partition roots,2056* if necessary. Since it is called from either cpuset_hotplug_update_tasks()2057* or update_cpumasks_hier() where parent and children are modified2058* successively, we don't need to call update_parent_effective_cpumask()2059* and the child's effective_cpus will be updated in later iterations.2060*2061* Note that rcu_read_lock() is assumed to be held.2062*/2063static void compute_partition_effective_cpumask(struct cpuset *cs,2064struct cpumask *new_ecpus)2065{2066struct cgroup_subsys_state *css;2067struct cpuset *child;2068bool populated = partition_is_populated(cs, NULL);20692070/*2071* Check child partition roots to see if they should be2072* invalidated when2073* 1) child effective_xcpus not a subset of new2074* excluisve_cpus2075* 2) All the effective_cpus will be used up and cp2076* has tasks2077*/2078compute_excpus(cs, new_ecpus);2079cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);20802081rcu_read_lock();2082cpuset_for_each_child(child, css, cs) {2083if (!is_partition_valid(child))2084continue;20852086/*2087* There shouldn't be a remote partition underneath another2088* partition root.2089*/2090WARN_ON_ONCE(is_remote_partition(child));2091child->prs_err = 0;2092if (!cpumask_subset(child->effective_xcpus,2093cs->effective_xcpus))2094child->prs_err = PERR_INVCPUS;2095else if (populated &&2096cpumask_subset(new_ecpus, child->effective_xcpus))2097child->prs_err = PERR_NOCPUS;20982099if (child->prs_err) {2100int old_prs = child->partition_root_state;21012102/*2103* Invalidate child partition2104*/2105spin_lock_irq(&callback_lock);2106make_partition_invalid(child);2107cs->nr_subparts--;2108child->nr_subparts = 0;2109spin_unlock_irq(&callback_lock);2110notify_partition_change(child, old_prs);2111continue;2112}2113cpumask_andnot(new_ecpus, new_ecpus,2114child->effective_xcpus);2115}2116rcu_read_unlock();2117}21182119/*2120* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree2121* @cs: the cpuset to consider2122* @tmp: temp variables for calculating effective_cpus & partition setup2123* @force: don't skip any descendant cpusets if set2124*2125* When configured cpumask is changed, the effective cpumasks of this cpuset2126* and all its descendants need to be updated.2127*2128* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.2129*2130* Called with cpuset_mutex held2131*/2132static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,2133bool force)2134{2135struct cpuset *cp;2136struct cgroup_subsys_state *pos_css;2137bool need_rebuild_sched_domains = false;2138int old_prs, new_prs;21392140rcu_read_lock();2141cpuset_for_each_descendant_pre(cp, pos_css, cs) {2142struct cpuset *parent = parent_cs(cp);2143bool remote = is_remote_partition(cp);2144bool update_parent = false;21452146old_prs = new_prs = cp->partition_root_state;21472148/*2149* For child remote partition root (!= cs), we need to call2150* remote_cpus_update() if effective_xcpus will be changed.2151* Otherwise, we can skip the whole subtree.2152*2153* remote_cpus_update() will reuse tmp->new_cpus only after2154* its value is being processed.2155*/2156if (remote && (cp != cs)) {2157compute_excpus(cp, tmp->new_cpus);2158if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {2159pos_css = css_rightmost_descendant(pos_css);2160continue;2161}2162rcu_read_unlock();2163remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);2164rcu_read_lock();21652166/* Remote partition may be invalidated */2167new_prs = cp->partition_root_state;2168remote = (new_prs == old_prs);2169}21702171if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))2172compute_partition_effective_cpumask(cp, tmp->new_cpus);2173else2174compute_effective_cpumask(tmp->new_cpus, cp, parent);21752176if (remote)2177goto get_css; /* Ready to update cpuset data */21782179/*2180* A partition with no effective_cpus is allowed as long as2181* there is no task associated with it. Call2182* update_parent_effective_cpumask() to check it.2183*/2184if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {2185update_parent = true;2186goto update_parent_effective;2187}21882189/*2190* If it becomes empty, inherit the effective mask of the2191* parent, which is guaranteed to have some CPUs unless2192* it is a partition root that has explicitly distributed2193* out all its CPUs.2194*/2195if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))2196cpumask_copy(tmp->new_cpus, parent->effective_cpus);21972198/*2199* Skip the whole subtree if2200* 1) the cpumask remains the same,2201* 2) has no partition root state,2202* 3) force flag not set, and2203* 4) for v2 load balance state same as its parent.2204*/2205if (!cp->partition_root_state && !force &&2206cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&2207(!cpuset_v2() ||2208(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {2209pos_css = css_rightmost_descendant(pos_css);2210continue;2211}22122213update_parent_effective:2214/*2215* update_parent_effective_cpumask() should have been called2216* for cs already in update_cpumask(). We should also call2217* cpuset_update_tasks_cpumask() again for tasks in the parent2218* cpuset if the parent's effective_cpus changes.2219*/2220if ((cp != cs) && old_prs) {2221switch (parent->partition_root_state) {2222case PRS_ROOT:2223case PRS_ISOLATED:2224update_parent = true;2225break;22262227default:2228/*2229* When parent is not a partition root or is2230* invalid, child partition roots become2231* invalid too.2232*/2233if (is_partition_valid(cp))2234new_prs = -cp->partition_root_state;2235WRITE_ONCE(cp->prs_err,2236is_partition_invalid(parent)2237? PERR_INVPARENT : PERR_NOTPART);2238break;2239}2240}2241get_css:2242if (!css_tryget_online(&cp->css))2243continue;2244rcu_read_unlock();22452246if (update_parent) {2247update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);2248/*2249* The cpuset partition_root_state may become2250* invalid. Capture it.2251*/2252new_prs = cp->partition_root_state;2253}22542255spin_lock_irq(&callback_lock);2256cpumask_copy(cp->effective_cpus, tmp->new_cpus);2257cp->partition_root_state = new_prs;2258if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))2259compute_excpus(cp, cp->effective_xcpus);22602261/*2262* Make sure effective_xcpus is properly set for a valid2263* partition root.2264*/2265if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))2266cpumask_and(cp->effective_xcpus,2267cp->cpus_allowed, parent->effective_xcpus);2268else if (new_prs < 0)2269reset_partition_data(cp);2270spin_unlock_irq(&callback_lock);22712272notify_partition_change(cp, old_prs);22732274WARN_ON(!is_in_v2_mode() &&2275!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));22762277cpuset_update_tasks_cpumask(cp, cp->effective_cpus);22782279/*2280* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE2281* from parent if current cpuset isn't a valid partition root2282* and their load balance states differ.2283*/2284if (cpuset_v2() && !is_partition_valid(cp) &&2285(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {2286if (is_sched_load_balance(parent))2287set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);2288else2289clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);2290}22912292/*2293* On legacy hierarchy, if the effective cpumask of any non-2294* empty cpuset is changed, we need to rebuild sched domains.2295* On default hierarchy, the cpuset needs to be a partition2296* root as well.2297*/2298if (!cpumask_empty(cp->cpus_allowed) &&2299is_sched_load_balance(cp) &&2300(!cpuset_v2() || is_partition_valid(cp)))2301need_rebuild_sched_domains = true;23022303rcu_read_lock();2304css_put(&cp->css);2305}2306rcu_read_unlock();23072308if (need_rebuild_sched_domains)2309cpuset_force_rebuild();2310}23112312/**2313* update_sibling_cpumasks - Update siblings cpumasks2314* @parent: Parent cpuset2315* @cs: Current cpuset2316* @tmp: Temp variables2317*/2318static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,2319struct tmpmasks *tmp)2320{2321struct cpuset *sibling;2322struct cgroup_subsys_state *pos_css;23232324lockdep_assert_held(&cpuset_mutex);23252326/*2327* Check all its siblings and call update_cpumasks_hier()2328* if their effective_cpus will need to be changed.2329*2330* It is possible a change in parent's effective_cpus2331* due to a change in a child partition's effective_xcpus will impact2332* its siblings even if they do not inherit parent's effective_cpus2333* directly.2334*2335* The update_cpumasks_hier() function may sleep. So we have to2336* release the RCU read lock before calling it.2337*/2338rcu_read_lock();2339cpuset_for_each_child(sibling, pos_css, parent) {2340if (sibling == cs)2341continue;2342if (!is_partition_valid(sibling)) {2343compute_effective_cpumask(tmp->new_cpus, sibling,2344parent);2345if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))2346continue;2347} else if (is_remote_partition(sibling)) {2348/*2349* Change in a sibling cpuset won't affect a remote2350* partition root.2351*/2352continue;2353}23542355if (!css_tryget_online(&sibling->css))2356continue;23572358rcu_read_unlock();2359update_cpumasks_hier(sibling, tmp, false);2360rcu_read_lock();2361css_put(&sibling->css);2362}2363rcu_read_unlock();2364}23652366static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)2367{2368int retval;23692370retval = cpulist_parse(buf, out_mask);2371if (retval < 0)2372return retval;2373if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))2374return -EINVAL;23752376return 0;2377}23782379/**2380* validate_partition - Validate a cpuset partition configuration2381* @cs: The cpuset to validate2382* @trialcs: The trial cpuset containing proposed configuration changes2383*2384* If any validation check fails, the appropriate error code is set in the2385* cpuset's prs_err field.2386*2387* Return: PRS error code (0 if valid, non-zero error code if invalid)2388*/2389static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)2390{2391struct cpuset *parent = parent_cs(cs);23922393if (cs_is_member(trialcs))2394return PERR_NONE;23952396if (cpumask_empty(trialcs->effective_xcpus))2397return PERR_INVCPUS;23982399if (prstate_housekeeping_conflict(trialcs->partition_root_state,2400trialcs->effective_xcpus))2401return PERR_HKEEPING;24022403if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))2404return PERR_NOCPUS;24052406return PERR_NONE;2407}24082409static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,2410struct tmpmasks *tmp)2411{2412int retval;2413struct cpuset *parent = parent_cs(cs);24142415retval = validate_change(cs, trialcs);24162417if ((retval == -EINVAL) && cpuset_v2()) {2418struct cgroup_subsys_state *css;2419struct cpuset *cp;24202421/*2422* The -EINVAL error code indicates that partition sibling2423* CPU exclusivity rule has been violated. We still allow2424* the cpumask change to proceed while invalidating the2425* partition. However, any conflicting sibling partitions2426* have to be marked as invalid too.2427*/2428trialcs->prs_err = PERR_NOTEXCL;2429rcu_read_lock();2430cpuset_for_each_child(cp, css, parent) {2431struct cpumask *xcpus = user_xcpus(trialcs);24322433if (is_partition_valid(cp) &&2434cpumask_intersects(xcpus, cp->effective_xcpus)) {2435rcu_read_unlock();2436update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);2437rcu_read_lock();2438}2439}2440rcu_read_unlock();2441retval = 0;2442}2443return retval;2444}24452446/**2447* partition_cpus_change - Handle partition state changes due to CPU mask updates2448* @cs: The target cpuset being modified2449* @trialcs: The trial cpuset containing proposed configuration changes2450* @tmp: Temporary masks for intermediate calculations2451*2452* This function handles partition state transitions triggered by CPU mask changes.2453* CPU modifications may cause a partition to be disabled or require state updates.2454*/2455static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,2456struct tmpmasks *tmp)2457{2458enum prs_errcode prs_err;24592460if (cs_is_member(cs))2461return;24622463prs_err = validate_partition(cs, trialcs);2464if (prs_err)2465trialcs->prs_err = cs->prs_err = prs_err;24662467if (is_remote_partition(cs)) {2468if (trialcs->prs_err)2469remote_partition_disable(cs, tmp);2470else2471remote_cpus_update(cs, trialcs->exclusive_cpus,2472trialcs->effective_xcpus, tmp);2473} else {2474if (trialcs->prs_err)2475update_parent_effective_cpumask(cs, partcmd_invalidate,2476NULL, tmp);2477else2478update_parent_effective_cpumask(cs, partcmd_update,2479trialcs->effective_xcpus, tmp);2480}2481}24822483/**2484* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it2485* @cs: the cpuset to consider2486* @trialcs: trial cpuset2487* @buf: buffer of cpu numbers written to this cpuset2488*/2489static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,2490const char *buf)2491{2492int retval;2493struct tmpmasks tmp;2494bool force = false;2495int old_prs = cs->partition_root_state;24962497retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);2498if (retval < 0)2499return retval;25002501/* Nothing to do if the cpus didn't change */2502if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))2503return 0;25042505if (alloc_tmpmasks(&tmp))2506return -ENOMEM;25072508compute_trialcs_excpus(trialcs, cs);2509trialcs->prs_err = PERR_NONE;25102511retval = cpus_allowed_validate_change(cs, trialcs, &tmp);2512if (retval < 0)2513goto out_free;25142515/*2516* Check all the descendants in update_cpumasks_hier() if2517* effective_xcpus is to be changed.2518*/2519force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);25202521partition_cpus_change(cs, trialcs, &tmp);25222523spin_lock_irq(&callback_lock);2524cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);2525cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);2526if ((old_prs > 0) && !is_partition_valid(cs))2527reset_partition_data(cs);2528spin_unlock_irq(&callback_lock);25292530/* effective_cpus/effective_xcpus will be updated here */2531update_cpumasks_hier(cs, &tmp, force);25322533/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */2534if (cs->partition_root_state)2535update_partition_sd_lb(cs, old_prs);2536out_free:2537free_tmpmasks(&tmp);2538return retval;2539}25402541/**2542* update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset2543* @cs: the cpuset to consider2544* @trialcs: trial cpuset2545* @buf: buffer of cpu numbers written to this cpuset2546*2547* The tasks' cpumask will be updated if cs is a valid partition root.2548*/2549static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,2550const char *buf)2551{2552int retval;2553struct tmpmasks tmp;2554bool force = false;2555int old_prs = cs->partition_root_state;25562557retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);2558if (retval < 0)2559return retval;25602561/* Nothing to do if the CPUs didn't change */2562if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))2563return 0;25642565/*2566* Reject the change if there is exclusive CPUs conflict with2567* the siblings.2568*/2569if (compute_trialcs_excpus(trialcs, cs))2570return -EINVAL;25712572/*2573* Check all the descendants in update_cpumasks_hier() if2574* effective_xcpus is to be changed.2575*/2576force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);25772578retval = validate_change(cs, trialcs);2579if (retval)2580return retval;25812582if (alloc_tmpmasks(&tmp))2583return -ENOMEM;25842585trialcs->prs_err = PERR_NONE;2586partition_cpus_change(cs, trialcs, &tmp);25872588spin_lock_irq(&callback_lock);2589cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);2590cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);2591if ((old_prs > 0) && !is_partition_valid(cs))2592reset_partition_data(cs);2593spin_unlock_irq(&callback_lock);25942595/*2596* Call update_cpumasks_hier() to update effective_cpus/effective_xcpus2597* of the subtree when it is a valid partition root or effective_xcpus2598* is updated.2599*/2600if (is_partition_valid(cs) || force)2601update_cpumasks_hier(cs, &tmp, force);26022603/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */2604if (cs->partition_root_state)2605update_partition_sd_lb(cs, old_prs);26062607free_tmpmasks(&tmp);2608return 0;2609}26102611/*2612* Migrate memory region from one set of nodes to another. This is2613* performed asynchronously as it can be called from process migration path2614* holding locks involved in process management. All mm migrations are2615* performed in the queued order and can be waited for by flushing2616* cpuset_migrate_mm_wq.2617*/26182619struct cpuset_migrate_mm_work {2620struct work_struct work;2621struct mm_struct *mm;2622nodemask_t from;2623nodemask_t to;2624};26252626static void cpuset_migrate_mm_workfn(struct work_struct *work)2627{2628struct cpuset_migrate_mm_work *mwork =2629container_of(work, struct cpuset_migrate_mm_work, work);26302631/* on a wq worker, no need to worry about %current's mems_allowed */2632do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);2633mmput(mwork->mm);2634kfree(mwork);2635}26362637static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,2638const nodemask_t *to)2639{2640struct cpuset_migrate_mm_work *mwork;26412642if (nodes_equal(*from, *to)) {2643mmput(mm);2644return;2645}26462647mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);2648if (mwork) {2649mwork->mm = mm;2650mwork->from = *from;2651mwork->to = *to;2652INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);2653queue_work(cpuset_migrate_mm_wq, &mwork->work);2654} else {2655mmput(mm);2656}2657}26582659static void flush_migrate_mm_task_workfn(struct callback_head *head)2660{2661flush_workqueue(cpuset_migrate_mm_wq);2662kfree(head);2663}26642665static void schedule_flush_migrate_mm(void)2666{2667struct callback_head *flush_cb;26682669flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);2670if (!flush_cb)2671return;26722673init_task_work(flush_cb, flush_migrate_mm_task_workfn);26742675if (task_work_add(current, flush_cb, TWA_RESUME))2676kfree(flush_cb);2677}26782679/*2680* cpuset_change_task_nodemask - change task's mems_allowed and mempolicy2681* @tsk: the task to change2682* @newmems: new nodes that the task will be set2683*2684* We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed2685* and rebind an eventual tasks' mempolicy. If the task is allocating in2686* parallel, it might temporarily see an empty intersection, which results in2687* a seqlock check and retry before OOM or allocation failure.2688*/2689static void cpuset_change_task_nodemask(struct task_struct *tsk,2690nodemask_t *newmems)2691{2692task_lock(tsk);26932694local_irq_disable();2695write_seqcount_begin(&tsk->mems_allowed_seq);26962697nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);2698mpol_rebind_task(tsk, newmems);2699tsk->mems_allowed = *newmems;27002701write_seqcount_end(&tsk->mems_allowed_seq);2702local_irq_enable();27032704task_unlock(tsk);2705}27062707static void *cpuset_being_rebound;27082709/**2710* cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.2711* @cs: the cpuset in which each task's mems_allowed mask needs to be changed2712*2713* Iterate through each task of @cs updating its mems_allowed to the2714* effective cpuset's. As this function is called with cpuset_mutex held,2715* cpuset membership stays stable.2716*/2717void cpuset_update_tasks_nodemask(struct cpuset *cs)2718{2719static nodemask_t newmems; /* protected by cpuset_mutex */2720struct css_task_iter it;2721struct task_struct *task;27222723cpuset_being_rebound = cs; /* causes mpol_dup() rebind */27242725guarantee_online_mems(cs, &newmems);27262727/*2728* The mpol_rebind_mm() call takes mmap_lock, which we couldn't2729* take while holding tasklist_lock. Forks can happen - the2730* mpol_dup() cpuset_being_rebound check will catch such forks,2731* and rebind their vma mempolicies too. Because we still hold2732* the global cpuset_mutex, we know that no other rebind effort2733* will be contending for the global variable cpuset_being_rebound.2734* It's ok if we rebind the same mm twice; mpol_rebind_mm()2735* is idempotent. Also migrate pages in each mm to new nodes.2736*/2737css_task_iter_start(&cs->css, 0, &it);2738while ((task = css_task_iter_next(&it))) {2739struct mm_struct *mm;2740bool migrate;27412742cpuset_change_task_nodemask(task, &newmems);27432744mm = get_task_mm(task);2745if (!mm)2746continue;27472748migrate = is_memory_migrate(cs);27492750mpol_rebind_mm(mm, &cs->mems_allowed);2751if (migrate)2752cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);2753else2754mmput(mm);2755}2756css_task_iter_end(&it);27572758/*2759* All the tasks' nodemasks have been updated, update2760* cs->old_mems_allowed.2761*/2762cs->old_mems_allowed = newmems;27632764/* We're done rebinding vmas to this cpuset's new mems_allowed. */2765cpuset_being_rebound = NULL;2766}27672768/*2769* update_nodemasks_hier - Update effective nodemasks and tasks in the subtree2770* @cs: the cpuset to consider2771* @new_mems: a temp variable for calculating new effective_mems2772*2773* When configured nodemask is changed, the effective nodemasks of this cpuset2774* and all its descendants need to be updated.2775*2776* On legacy hierarchy, effective_mems will be the same with mems_allowed.2777*2778* Called with cpuset_mutex held2779*/2780static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)2781{2782struct cpuset *cp;2783struct cgroup_subsys_state *pos_css;27842785rcu_read_lock();2786cpuset_for_each_descendant_pre(cp, pos_css, cs) {2787struct cpuset *parent = parent_cs(cp);27882789nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);27902791/*2792* If it becomes empty, inherit the effective mask of the2793* parent, which is guaranteed to have some MEMs.2794*/2795if (is_in_v2_mode() && nodes_empty(*new_mems))2796*new_mems = parent->effective_mems;27972798/* Skip the whole subtree if the nodemask remains the same. */2799if (nodes_equal(*new_mems, cp->effective_mems)) {2800pos_css = css_rightmost_descendant(pos_css);2801continue;2802}28032804if (!css_tryget_online(&cp->css))2805continue;2806rcu_read_unlock();28072808spin_lock_irq(&callback_lock);2809cp->effective_mems = *new_mems;2810spin_unlock_irq(&callback_lock);28112812WARN_ON(!is_in_v2_mode() &&2813!nodes_equal(cp->mems_allowed, cp->effective_mems));28142815cpuset_update_tasks_nodemask(cp);28162817rcu_read_lock();2818css_put(&cp->css);2819}2820rcu_read_unlock();2821}28222823/*2824* Handle user request to change the 'mems' memory placement2825* of a cpuset. Needs to validate the request, update the2826* cpusets mems_allowed, and for each task in the cpuset,2827* update mems_allowed and rebind task's mempolicy and any vma2828* mempolicies and if the cpuset is marked 'memory_migrate',2829* migrate the tasks pages to the new memory.2830*2831* Call with cpuset_mutex held. May take callback_lock during call.2832* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,2833* lock each such tasks mm->mmap_lock, scan its vma's and rebind2834* their mempolicies to the cpusets new mems_allowed.2835*/2836static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,2837const char *buf)2838{2839int retval;28402841/*2842* An empty mems_allowed is ok iff there are no tasks in the cpuset.2843* The validate_change() call ensures that cpusets with tasks have memory.2844*/2845retval = nodelist_parse(buf, trialcs->mems_allowed);2846if (retval < 0)2847goto done;28482849if (!nodes_subset(trialcs->mems_allowed,2850top_cpuset.mems_allowed)) {2851retval = -EINVAL;2852goto done;2853}28542855if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {2856retval = 0; /* Too easy - nothing to do */2857goto done;2858}2859retval = validate_change(cs, trialcs);2860if (retval < 0)2861goto done;28622863check_insane_mems_config(&trialcs->mems_allowed);28642865spin_lock_irq(&callback_lock);2866cs->mems_allowed = trialcs->mems_allowed;2867spin_unlock_irq(&callback_lock);28682869/* use trialcs->mems_allowed as a temp variable */2870update_nodemasks_hier(cs, &trialcs->mems_allowed);2871done:2872return retval;2873}28742875bool current_cpuset_is_being_rebound(void)2876{2877bool ret;28782879rcu_read_lock();2880ret = task_cs(current) == cpuset_being_rebound;2881rcu_read_unlock();28822883return ret;2884}28852886/*2887* cpuset_update_flag - read a 0 or a 1 in a file and update associated flag2888* bit: the bit to update (see cpuset_flagbits_t)2889* cs: the cpuset to update2890* turning_on: whether the flag is being set or cleared2891*2892* Call with cpuset_mutex held.2893*/28942895int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,2896int turning_on)2897{2898struct cpuset *trialcs;2899int balance_flag_changed;2900int spread_flag_changed;2901int err;29022903trialcs = dup_or_alloc_cpuset(cs);2904if (!trialcs)2905return -ENOMEM;29062907if (turning_on)2908set_bit(bit, &trialcs->flags);2909else2910clear_bit(bit, &trialcs->flags);29112912err = validate_change(cs, trialcs);2913if (err < 0)2914goto out;29152916balance_flag_changed = (is_sched_load_balance(cs) !=2917is_sched_load_balance(trialcs));29182919spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))2920|| (is_spread_page(cs) != is_spread_page(trialcs)));29212922spin_lock_irq(&callback_lock);2923cs->flags = trialcs->flags;2924spin_unlock_irq(&callback_lock);29252926if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {2927if (cpuset_v2())2928cpuset_force_rebuild();2929else2930rebuild_sched_domains_locked();2931}29322933if (spread_flag_changed)2934cpuset1_update_tasks_flags(cs);2935out:2936free_cpuset(trialcs);2937return err;2938}29392940/**2941* update_prstate - update partition_root_state2942* @cs: the cpuset to update2943* @new_prs: new partition root state2944* Return: 0 if successful, != 0 if error2945*2946* Call with cpuset_mutex held.2947*/2948static int update_prstate(struct cpuset *cs, int new_prs)2949{2950int err = PERR_NONE, old_prs = cs->partition_root_state;2951struct cpuset *parent = parent_cs(cs);2952struct tmpmasks tmpmask;2953bool isolcpus_updated = false;29542955if (old_prs == new_prs)2956return 0;29572958/*2959* Treat a previously invalid partition root as if it is a "member".2960*/2961if (new_prs && is_partition_invalid(cs))2962old_prs = PRS_MEMBER;29632964if (alloc_tmpmasks(&tmpmask))2965return -ENOMEM;29662967err = update_partition_exclusive_flag(cs, new_prs);2968if (err)2969goto out;29702971if (!old_prs) {2972/*2973* cpus_allowed and exclusive_cpus cannot be both empty.2974*/2975if (xcpus_empty(cs)) {2976err = PERR_CPUSEMPTY;2977goto out;2978}29792980/*2981* We don't support the creation of a new local partition with2982* a remote partition underneath it. This unsupported2983* setting can happen only if parent is the top_cpuset because2984* a remote partition cannot be created underneath an existing2985* local or remote partition.2986*/2987if ((parent == &top_cpuset) &&2988cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {2989err = PERR_REMOTE;2990goto out;2991}29922993/*2994* If parent is valid partition, enable local partiion.2995* Otherwise, enable a remote partition.2996*/2997if (is_partition_valid(parent)) {2998enum partition_cmd cmd = (new_prs == PRS_ROOT)2999? partcmd_enable : partcmd_enablei;30003001err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);3002} else {3003err = remote_partition_enable(cs, new_prs, &tmpmask);3004}3005} else if (old_prs && new_prs) {3006/*3007* A change in load balance state only, no change in cpumasks.3008* Need to update isolated_cpus.3009*/3010isolcpus_updated = true;3011} else {3012/*3013* Switching back to member is always allowed even if it3014* disables child partitions.3015*/3016if (is_remote_partition(cs))3017remote_partition_disable(cs, &tmpmask);3018else3019update_parent_effective_cpumask(cs, partcmd_disable,3020NULL, &tmpmask);30213022/*3023* Invalidation of child partitions will be done in3024* update_cpumasks_hier().3025*/3026}3027out:3028/*3029* Make partition invalid & disable CS_CPU_EXCLUSIVE if an error3030* happens.3031*/3032if (err) {3033new_prs = -new_prs;3034update_partition_exclusive_flag(cs, new_prs);3035}30363037spin_lock_irq(&callback_lock);3038cs->partition_root_state = new_prs;3039WRITE_ONCE(cs->prs_err, err);3040if (!is_partition_valid(cs))3041reset_partition_data(cs);3042else if (isolcpus_updated)3043isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);3044spin_unlock_irq(&callback_lock);3045update_unbound_workqueue_cpumask(isolcpus_updated);30463047/* Force update if switching back to member & update effective_xcpus */3048update_cpumasks_hier(cs, &tmpmask, !new_prs);30493050/* A newly created partition must have effective_xcpus set */3051WARN_ON_ONCE(!old_prs && (new_prs > 0)3052&& cpumask_empty(cs->effective_xcpus));30533054/* Update sched domains and load balance flag */3055update_partition_sd_lb(cs, old_prs);30563057notify_partition_change(cs, old_prs);3058if (force_sd_rebuild)3059rebuild_sched_domains_locked();3060free_tmpmasks(&tmpmask);3061return 0;3062}30633064static struct cpuset *cpuset_attach_old_cs;30653066/*3067* Check to see if a cpuset can accept a new task3068* For v1, cpus_allowed and mems_allowed can't be empty.3069* For v2, effective_cpus can't be empty.3070* Note that in v1, effective_cpus = cpus_allowed.3071*/3072static int cpuset_can_attach_check(struct cpuset *cs)3073{3074if (cpumask_empty(cs->effective_cpus) ||3075(!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))3076return -ENOSPC;3077return 0;3078}30793080static void reset_migrate_dl_data(struct cpuset *cs)3081{3082cs->nr_migrate_dl_tasks = 0;3083cs->sum_migrate_dl_bw = 0;3084}30853086/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */3087static int cpuset_can_attach(struct cgroup_taskset *tset)3088{3089struct cgroup_subsys_state *css;3090struct cpuset *cs, *oldcs;3091struct task_struct *task;3092bool cpus_updated, mems_updated;3093int ret;30943095/* used later by cpuset_attach() */3096cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));3097oldcs = cpuset_attach_old_cs;3098cs = css_cs(css);30993100mutex_lock(&cpuset_mutex);31013102/* Check to see if task is allowed in the cpuset */3103ret = cpuset_can_attach_check(cs);3104if (ret)3105goto out_unlock;31063107cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);3108mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);31093110cgroup_taskset_for_each(task, css, tset) {3111ret = task_can_attach(task);3112if (ret)3113goto out_unlock;31143115/*3116* Skip rights over task check in v2 when nothing changes,3117* migration permission derives from hierarchy ownership in3118* cgroup_procs_write_permission()).3119*/3120if (!cpuset_v2() || (cpus_updated || mems_updated)) {3121ret = security_task_setscheduler(task);3122if (ret)3123goto out_unlock;3124}31253126if (dl_task(task)) {3127cs->nr_migrate_dl_tasks++;3128cs->sum_migrate_dl_bw += task->dl.dl_bw;3129}3130}31313132if (!cs->nr_migrate_dl_tasks)3133goto out_success;31343135if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {3136int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);31373138if (unlikely(cpu >= nr_cpu_ids)) {3139reset_migrate_dl_data(cs);3140ret = -EINVAL;3141goto out_unlock;3142}31433144ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);3145if (ret) {3146reset_migrate_dl_data(cs);3147goto out_unlock;3148}3149}31503151out_success:3152/*3153* Mark attach is in progress. This makes validate_change() fail3154* changes which zero cpus/mems_allowed.3155*/3156cs->attach_in_progress++;3157out_unlock:3158mutex_unlock(&cpuset_mutex);3159return ret;3160}31613162static void cpuset_cancel_attach(struct cgroup_taskset *tset)3163{3164struct cgroup_subsys_state *css;3165struct cpuset *cs;31663167cgroup_taskset_first(tset, &css);3168cs = css_cs(css);31693170mutex_lock(&cpuset_mutex);3171dec_attach_in_progress_locked(cs);31723173if (cs->nr_migrate_dl_tasks) {3174int cpu = cpumask_any(cs->effective_cpus);31753176dl_bw_free(cpu, cs->sum_migrate_dl_bw);3177reset_migrate_dl_data(cs);3178}31793180mutex_unlock(&cpuset_mutex);3181}31823183/*3184* Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()3185* but we can't allocate it dynamically there. Define it global and3186* allocate from cpuset_init().3187*/3188static cpumask_var_t cpus_attach;3189static nodemask_t cpuset_attach_nodemask_to;31903191static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)3192{3193lockdep_assert_held(&cpuset_mutex);31943195if (cs != &top_cpuset)3196guarantee_active_cpus(task, cpus_attach);3197else3198cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),3199subpartitions_cpus);3200/*3201* can_attach beforehand should guarantee that this doesn't3202* fail. TODO: have a better way to handle failure here3203*/3204WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));32053206cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);3207cpuset1_update_task_spread_flags(cs, task);3208}32093210static void cpuset_attach(struct cgroup_taskset *tset)3211{3212struct task_struct *task;3213struct task_struct *leader;3214struct cgroup_subsys_state *css;3215struct cpuset *cs;3216struct cpuset *oldcs = cpuset_attach_old_cs;3217bool cpus_updated, mems_updated;3218bool queue_task_work = false;32193220cgroup_taskset_first(tset, &css);3221cs = css_cs(css);32223223lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */3224mutex_lock(&cpuset_mutex);3225cpus_updated = !cpumask_equal(cs->effective_cpus,3226oldcs->effective_cpus);3227mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);32283229/*3230* In the default hierarchy, enabling cpuset in the child cgroups3231* will trigger a number of cpuset_attach() calls with no change3232* in effective cpus and mems. In that case, we can optimize out3233* by skipping the task iteration and update.3234*/3235if (cpuset_v2() && !cpus_updated && !mems_updated) {3236cpuset_attach_nodemask_to = cs->effective_mems;3237goto out;3238}32393240guarantee_online_mems(cs, &cpuset_attach_nodemask_to);32413242cgroup_taskset_for_each(task, css, tset)3243cpuset_attach_task(cs, task);32443245/*3246* Change mm for all threadgroup leaders. This is expensive and may3247* sleep and should be moved outside migration path proper. Skip it3248* if there is no change in effective_mems and CS_MEMORY_MIGRATE is3249* not set.3250*/3251cpuset_attach_nodemask_to = cs->effective_mems;3252if (!is_memory_migrate(cs) && !mems_updated)3253goto out;32543255cgroup_taskset_for_each_leader(leader, css, tset) {3256struct mm_struct *mm = get_task_mm(leader);32573258if (mm) {3259mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);32603261/*3262* old_mems_allowed is the same with mems_allowed3263* here, except if this task is being moved3264* automatically due to hotplug. In that case3265* @mems_allowed has been updated and is empty, so3266* @old_mems_allowed is the right nodesets that we3267* migrate mm from.3268*/3269if (is_memory_migrate(cs)) {3270cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,3271&cpuset_attach_nodemask_to);3272queue_task_work = true;3273} else3274mmput(mm);3275}3276}32773278out:3279if (queue_task_work)3280schedule_flush_migrate_mm();3281cs->old_mems_allowed = cpuset_attach_nodemask_to;32823283if (cs->nr_migrate_dl_tasks) {3284cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;3285oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;3286reset_migrate_dl_data(cs);3287}32883289dec_attach_in_progress_locked(cs);32903291mutex_unlock(&cpuset_mutex);3292}32933294/*3295* Common handling for a write to a "cpus" or "mems" file.3296*/3297ssize_t cpuset_write_resmask(struct kernfs_open_file *of,3298char *buf, size_t nbytes, loff_t off)3299{3300struct cpuset *cs = css_cs(of_css(of));3301struct cpuset *trialcs;3302int retval = -ENODEV;33033304/* root is read-only */3305if (cs == &top_cpuset)3306return -EACCES;33073308buf = strstrip(buf);3309cpuset_full_lock();3310if (!is_cpuset_online(cs))3311goto out_unlock;33123313trialcs = dup_or_alloc_cpuset(cs);3314if (!trialcs) {3315retval = -ENOMEM;3316goto out_unlock;3317}33183319switch (of_cft(of)->private) {3320case FILE_CPULIST:3321retval = update_cpumask(cs, trialcs, buf);3322break;3323case FILE_EXCLUSIVE_CPULIST:3324retval = update_exclusive_cpumask(cs, trialcs, buf);3325break;3326case FILE_MEMLIST:3327retval = update_nodemask(cs, trialcs, buf);3328break;3329default:3330retval = -EINVAL;3331break;3332}33333334free_cpuset(trialcs);3335if (force_sd_rebuild)3336rebuild_sched_domains_locked();3337out_unlock:3338cpuset_full_unlock();3339if (of_cft(of)->private == FILE_MEMLIST)3340schedule_flush_migrate_mm();3341return retval ?: nbytes;3342}33433344/*3345* These ascii lists should be read in a single call, by using a user3346* buffer large enough to hold the entire map. If read in smaller3347* chunks, there is no guarantee of atomicity. Since the display format3348* used, list of ranges of sequential numbers, is variable length,3349* and since these maps can change value dynamically, one could read3350* gibberish by doing partial reads while a list was changing.3351*/3352int cpuset_common_seq_show(struct seq_file *sf, void *v)3353{3354struct cpuset *cs = css_cs(seq_css(sf));3355cpuset_filetype_t type = seq_cft(sf)->private;3356int ret = 0;33573358spin_lock_irq(&callback_lock);33593360switch (type) {3361case FILE_CPULIST:3362seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));3363break;3364case FILE_MEMLIST:3365seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));3366break;3367case FILE_EFFECTIVE_CPULIST:3368seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));3369break;3370case FILE_EFFECTIVE_MEMLIST:3371seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));3372break;3373case FILE_EXCLUSIVE_CPULIST:3374seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));3375break;3376case FILE_EFFECTIVE_XCPULIST:3377seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));3378break;3379case FILE_SUBPARTS_CPULIST:3380seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));3381break;3382case FILE_ISOLATED_CPULIST:3383seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));3384break;3385default:3386ret = -EINVAL;3387}33883389spin_unlock_irq(&callback_lock);3390return ret;3391}33923393static int cpuset_partition_show(struct seq_file *seq, void *v)3394{3395struct cpuset *cs = css_cs(seq_css(seq));3396const char *err, *type = NULL;33973398switch (cs->partition_root_state) {3399case PRS_ROOT:3400seq_puts(seq, "root\n");3401break;3402case PRS_ISOLATED:3403seq_puts(seq, "isolated\n");3404break;3405case PRS_MEMBER:3406seq_puts(seq, "member\n");3407break;3408case PRS_INVALID_ROOT:3409type = "root";3410fallthrough;3411case PRS_INVALID_ISOLATED:3412if (!type)3413type = "isolated";3414err = perr_strings[READ_ONCE(cs->prs_err)];3415if (err)3416seq_printf(seq, "%s invalid (%s)\n", type, err);3417else3418seq_printf(seq, "%s invalid\n", type);3419break;3420}3421return 0;3422}34233424static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,3425size_t nbytes, loff_t off)3426{3427struct cpuset *cs = css_cs(of_css(of));3428int val;3429int retval = -ENODEV;34303431buf = strstrip(buf);34323433if (!strcmp(buf, "root"))3434val = PRS_ROOT;3435else if (!strcmp(buf, "member"))3436val = PRS_MEMBER;3437else if (!strcmp(buf, "isolated"))3438val = PRS_ISOLATED;3439else3440return -EINVAL;34413442cpuset_full_lock();3443if (is_cpuset_online(cs))3444retval = update_prstate(cs, val);3445cpuset_full_unlock();3446return retval ?: nbytes;3447}34483449/*3450* This is currently a minimal set for the default hierarchy. It can be3451* expanded later on by migrating more features and control files from v1.3452*/3453static struct cftype dfl_files[] = {3454{3455.name = "cpus",3456.seq_show = cpuset_common_seq_show,3457.write = cpuset_write_resmask,3458.max_write_len = (100U + 6 * NR_CPUS),3459.private = FILE_CPULIST,3460.flags = CFTYPE_NOT_ON_ROOT,3461},34623463{3464.name = "mems",3465.seq_show = cpuset_common_seq_show,3466.write = cpuset_write_resmask,3467.max_write_len = (100U + 6 * MAX_NUMNODES),3468.private = FILE_MEMLIST,3469.flags = CFTYPE_NOT_ON_ROOT,3470},34713472{3473.name = "cpus.effective",3474.seq_show = cpuset_common_seq_show,3475.private = FILE_EFFECTIVE_CPULIST,3476},34773478{3479.name = "mems.effective",3480.seq_show = cpuset_common_seq_show,3481.private = FILE_EFFECTIVE_MEMLIST,3482},34833484{3485.name = "cpus.partition",3486.seq_show = cpuset_partition_show,3487.write = cpuset_partition_write,3488.private = FILE_PARTITION_ROOT,3489.flags = CFTYPE_NOT_ON_ROOT,3490.file_offset = offsetof(struct cpuset, partition_file),3491},34923493{3494.name = "cpus.exclusive",3495.seq_show = cpuset_common_seq_show,3496.write = cpuset_write_resmask,3497.max_write_len = (100U + 6 * NR_CPUS),3498.private = FILE_EXCLUSIVE_CPULIST,3499.flags = CFTYPE_NOT_ON_ROOT,3500},35013502{3503.name = "cpus.exclusive.effective",3504.seq_show = cpuset_common_seq_show,3505.private = FILE_EFFECTIVE_XCPULIST,3506.flags = CFTYPE_NOT_ON_ROOT,3507},35083509{3510.name = "cpus.subpartitions",3511.seq_show = cpuset_common_seq_show,3512.private = FILE_SUBPARTS_CPULIST,3513.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,3514},35153516{3517.name = "cpus.isolated",3518.seq_show = cpuset_common_seq_show,3519.private = FILE_ISOLATED_CPULIST,3520.flags = CFTYPE_ONLY_ON_ROOT,3521},35223523{ } /* terminate */3524};352535263527/**3528* cpuset_css_alloc - Allocate a cpuset css3529* @parent_css: Parent css of the control group that the new cpuset will be3530* part of3531* Return: cpuset css on success, -ENOMEM on failure.3532*3533* Allocate and initialize a new cpuset css, for non-NULL @parent_css, return3534* top cpuset css otherwise.3535*/3536static struct cgroup_subsys_state *3537cpuset_css_alloc(struct cgroup_subsys_state *parent_css)3538{3539struct cpuset *cs;35403541if (!parent_css)3542return &top_cpuset.css;35433544cs = dup_or_alloc_cpuset(NULL);3545if (!cs)3546return ERR_PTR(-ENOMEM);35473548__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);3549fmeter_init(&cs->fmeter);3550cs->relax_domain_level = -1;3551INIT_LIST_HEAD(&cs->remote_sibling);35523553/* Set CS_MEMORY_MIGRATE for default hierarchy */3554if (cpuset_v2())3555__set_bit(CS_MEMORY_MIGRATE, &cs->flags);35563557return &cs->css;3558}35593560static int cpuset_css_online(struct cgroup_subsys_state *css)3561{3562struct cpuset *cs = css_cs(css);3563struct cpuset *parent = parent_cs(cs);3564struct cpuset *tmp_cs;3565struct cgroup_subsys_state *pos_css;35663567if (!parent)3568return 0;35693570cpuset_full_lock();3571if (is_spread_page(parent))3572set_bit(CS_SPREAD_PAGE, &cs->flags);3573if (is_spread_slab(parent))3574set_bit(CS_SPREAD_SLAB, &cs->flags);3575/*3576* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated3577*/3578if (cpuset_v2() && !is_sched_load_balance(parent))3579clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);35803581cpuset_inc();35823583spin_lock_irq(&callback_lock);3584if (is_in_v2_mode()) {3585cpumask_copy(cs->effective_cpus, parent->effective_cpus);3586cs->effective_mems = parent->effective_mems;3587}3588spin_unlock_irq(&callback_lock);35893590if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))3591goto out_unlock;35923593/*3594* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is3595* set. This flag handling is implemented in cgroup core for3596* historical reasons - the flag may be specified during mount.3597*3598* Currently, if any sibling cpusets have exclusive cpus or mem, we3599* refuse to clone the configuration - thereby refusing the task to3600* be entered, and as a result refusing the sys_unshare() or3601* clone() which initiated it. If this becomes a problem for some3602* users who wish to allow that scenario, then this could be3603* changed to grant parent->cpus_allowed-sibling_cpus_exclusive3604* (and likewise for mems) to the new cgroup.3605*/3606rcu_read_lock();3607cpuset_for_each_child(tmp_cs, pos_css, parent) {3608if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {3609rcu_read_unlock();3610goto out_unlock;3611}3612}3613rcu_read_unlock();36143615spin_lock_irq(&callback_lock);3616cs->mems_allowed = parent->mems_allowed;3617cs->effective_mems = parent->mems_allowed;3618cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);3619cpumask_copy(cs->effective_cpus, parent->cpus_allowed);3620spin_unlock_irq(&callback_lock);3621out_unlock:3622cpuset_full_unlock();3623return 0;3624}36253626/*3627* If the cpuset being removed has its flag 'sched_load_balance'3628* enabled, then simulate turning sched_load_balance off, which3629* will call rebuild_sched_domains_locked(). That is not needed3630* in the default hierarchy where only changes in partition3631* will cause repartitioning.3632*/3633static void cpuset_css_offline(struct cgroup_subsys_state *css)3634{3635struct cpuset *cs = css_cs(css);36363637cpuset_full_lock();3638if (!cpuset_v2() && is_sched_load_balance(cs))3639cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);36403641cpuset_dec();3642cpuset_full_unlock();3643}36443645/*3646* If a dying cpuset has the 'cpus.partition' enabled, turn it off by3647* changing it back to member to free its exclusive CPUs back to the pool to3648* be used by other online cpusets.3649*/3650static void cpuset_css_killed(struct cgroup_subsys_state *css)3651{3652struct cpuset *cs = css_cs(css);36533654cpuset_full_lock();3655/* Reset valid partition back to member */3656if (is_partition_valid(cs))3657update_prstate(cs, PRS_MEMBER);3658cpuset_full_unlock();3659}36603661static void cpuset_css_free(struct cgroup_subsys_state *css)3662{3663struct cpuset *cs = css_cs(css);36643665free_cpuset(cs);3666}36673668static void cpuset_bind(struct cgroup_subsys_state *root_css)3669{3670mutex_lock(&cpuset_mutex);3671spin_lock_irq(&callback_lock);36723673if (is_in_v2_mode()) {3674cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);3675cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);3676top_cpuset.mems_allowed = node_possible_map;3677} else {3678cpumask_copy(top_cpuset.cpus_allowed,3679top_cpuset.effective_cpus);3680top_cpuset.mems_allowed = top_cpuset.effective_mems;3681}36823683spin_unlock_irq(&callback_lock);3684mutex_unlock(&cpuset_mutex);3685}36863687/*3688* In case the child is cloned into a cpuset different from its parent,3689* additional checks are done to see if the move is allowed.3690*/3691static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)3692{3693struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);3694bool same_cs;3695int ret;36963697rcu_read_lock();3698same_cs = (cs == task_cs(current));3699rcu_read_unlock();37003701if (same_cs)3702return 0;37033704lockdep_assert_held(&cgroup_mutex);3705mutex_lock(&cpuset_mutex);37063707/* Check to see if task is allowed in the cpuset */3708ret = cpuset_can_attach_check(cs);3709if (ret)3710goto out_unlock;37113712ret = task_can_attach(task);3713if (ret)3714goto out_unlock;37153716ret = security_task_setscheduler(task);3717if (ret)3718goto out_unlock;37193720/*3721* Mark attach is in progress. This makes validate_change() fail3722* changes which zero cpus/mems_allowed.3723*/3724cs->attach_in_progress++;3725out_unlock:3726mutex_unlock(&cpuset_mutex);3727return ret;3728}37293730static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)3731{3732struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);3733bool same_cs;37343735rcu_read_lock();3736same_cs = (cs == task_cs(current));3737rcu_read_unlock();37383739if (same_cs)3740return;37413742dec_attach_in_progress(cs);3743}37443745/*3746* Make sure the new task conform to the current state of its parent,3747* which could have been changed by cpuset just after it inherits the3748* state from the parent and before it sits on the cgroup's task list.3749*/3750static void cpuset_fork(struct task_struct *task)3751{3752struct cpuset *cs;3753bool same_cs;37543755rcu_read_lock();3756cs = task_cs(task);3757same_cs = (cs == task_cs(current));3758rcu_read_unlock();37593760if (same_cs) {3761if (cs == &top_cpuset)3762return;37633764set_cpus_allowed_ptr(task, current->cpus_ptr);3765task->mems_allowed = current->mems_allowed;3766return;3767}37683769/* CLONE_INTO_CGROUP */3770mutex_lock(&cpuset_mutex);3771guarantee_online_mems(cs, &cpuset_attach_nodemask_to);3772cpuset_attach_task(cs, task);37733774dec_attach_in_progress_locked(cs);3775mutex_unlock(&cpuset_mutex);3776}37773778struct cgroup_subsys cpuset_cgrp_subsys = {3779.css_alloc = cpuset_css_alloc,3780.css_online = cpuset_css_online,3781.css_offline = cpuset_css_offline,3782.css_killed = cpuset_css_killed,3783.css_free = cpuset_css_free,3784.can_attach = cpuset_can_attach,3785.cancel_attach = cpuset_cancel_attach,3786.attach = cpuset_attach,3787.bind = cpuset_bind,3788.can_fork = cpuset_can_fork,3789.cancel_fork = cpuset_cancel_fork,3790.fork = cpuset_fork,3791#ifdef CONFIG_CPUSETS_V13792.legacy_cftypes = cpuset1_files,3793#endif3794.dfl_cftypes = dfl_files,3795.early_init = true,3796.threaded = true,3797};37983799/**3800* cpuset_init - initialize cpusets at system boot3801*3802* Description: Initialize top_cpuset3803**/38043805int __init cpuset_init(void)3806{3807BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));3808BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));3809BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));3810BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));3811BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));3812BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));38133814cpumask_setall(top_cpuset.cpus_allowed);3815nodes_setall(top_cpuset.mems_allowed);3816cpumask_setall(top_cpuset.effective_cpus);3817cpumask_setall(top_cpuset.effective_xcpus);3818cpumask_setall(top_cpuset.exclusive_cpus);3819nodes_setall(top_cpuset.effective_mems);38203821fmeter_init(&top_cpuset.fmeter);3822INIT_LIST_HEAD(&remote_children);38233824BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));38253826have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);3827if (have_boot_isolcpus) {3828BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));3829cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));3830cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);3831}38323833return 0;3834}38353836static void3837hotplug_update_tasks(struct cpuset *cs,3838struct cpumask *new_cpus, nodemask_t *new_mems,3839bool cpus_updated, bool mems_updated)3840{3841/* A partition root is allowed to have empty effective cpus */3842if (cpumask_empty(new_cpus) && !is_partition_valid(cs))3843cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);3844if (nodes_empty(*new_mems))3845*new_mems = parent_cs(cs)->effective_mems;38463847spin_lock_irq(&callback_lock);3848cpumask_copy(cs->effective_cpus, new_cpus);3849cs->effective_mems = *new_mems;3850spin_unlock_irq(&callback_lock);38513852if (cpus_updated)3853cpuset_update_tasks_cpumask(cs, new_cpus);3854if (mems_updated)3855cpuset_update_tasks_nodemask(cs);3856}38573858void cpuset_force_rebuild(void)3859{3860force_sd_rebuild = true;3861}38623863/**3864* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug3865* @cs: cpuset in interest3866* @tmp: the tmpmasks structure pointer3867*3868* Compare @cs's cpu and mem masks against top_cpuset and if some have gone3869* offline, update @cs accordingly. If @cs ends up with no CPU or memory,3870* all its tasks are moved to the nearest ancestor with both resources.3871*/3872static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)3873{3874static cpumask_t new_cpus;3875static nodemask_t new_mems;3876bool cpus_updated;3877bool mems_updated;3878bool remote;3879int partcmd = -1;3880struct cpuset *parent;3881retry:3882wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);38833884mutex_lock(&cpuset_mutex);38853886/*3887* We have raced with task attaching. We wait until attaching3888* is finished, so we won't attach a task to an empty cpuset.3889*/3890if (cs->attach_in_progress) {3891mutex_unlock(&cpuset_mutex);3892goto retry;3893}38943895parent = parent_cs(cs);3896compute_effective_cpumask(&new_cpus, cs, parent);3897nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);38983899if (!tmp || !cs->partition_root_state)3900goto update_tasks;39013902/*3903* Compute effective_cpus for valid partition root, may invalidate3904* child partition roots if necessary.3905*/3906remote = is_remote_partition(cs);3907if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))3908compute_partition_effective_cpumask(cs, &new_cpus);39093910if (remote && cpumask_empty(&new_cpus) &&3911partition_is_populated(cs, NULL)) {3912cs->prs_err = PERR_HOTPLUG;3913remote_partition_disable(cs, tmp);3914compute_effective_cpumask(&new_cpus, cs, parent);3915remote = false;3916}39173918/*3919* Force the partition to become invalid if either one of3920* the following conditions hold:3921* 1) empty effective cpus but not valid empty partition.3922* 2) parent is invalid or doesn't grant any cpus to child3923* partitions.3924*/3925if (is_local_partition(cs) && (!is_partition_valid(parent) ||3926tasks_nocpu_error(parent, cs, &new_cpus)))3927partcmd = partcmd_invalidate;3928/*3929* On the other hand, an invalid partition root may be transitioned3930* back to a regular one with a non-empty effective xcpus.3931*/3932else if (is_partition_valid(parent) && is_partition_invalid(cs) &&3933!cpumask_empty(cs->effective_xcpus))3934partcmd = partcmd_update;39353936if (partcmd >= 0) {3937update_parent_effective_cpumask(cs, partcmd, NULL, tmp);3938if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {3939compute_partition_effective_cpumask(cs, &new_cpus);3940cpuset_force_rebuild();3941}3942}39433944update_tasks:3945cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);3946mems_updated = !nodes_equal(new_mems, cs->effective_mems);3947if (!cpus_updated && !mems_updated)3948goto unlock; /* Hotplug doesn't affect this cpuset */39493950if (mems_updated)3951check_insane_mems_config(&new_mems);39523953if (is_in_v2_mode())3954hotplug_update_tasks(cs, &new_cpus, &new_mems,3955cpus_updated, mems_updated);3956else3957cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,3958cpus_updated, mems_updated);39593960unlock:3961mutex_unlock(&cpuset_mutex);3962}39633964/**3965* cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset3966*3967* This function is called after either CPU or memory configuration has3968* changed and updates cpuset accordingly. The top_cpuset is always3969* synchronized to cpu_active_mask and N_MEMORY, which is necessary in3970* order to make cpusets transparent (of no affect) on systems that are3971* actively using CPU hotplug but making no active use of cpusets.3972*3973* Non-root cpusets are only affected by offlining. If any CPUs or memory3974* nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on3975* all descendants.3976*3977* Note that CPU offlining during suspend is ignored. We don't modify3978* cpusets across suspend/resume cycles at all.3979*3980* CPU / memory hotplug is handled synchronously.3981*/3982static void cpuset_handle_hotplug(void)3983{3984static cpumask_t new_cpus;3985static nodemask_t new_mems;3986bool cpus_updated, mems_updated;3987bool on_dfl = is_in_v2_mode();3988struct tmpmasks tmp, *ptmp = NULL;39893990if (on_dfl && !alloc_tmpmasks(&tmp))3991ptmp = &tmp;39923993lockdep_assert_cpus_held();3994mutex_lock(&cpuset_mutex);39953996/* fetch the available cpus/mems and find out which changed how */3997cpumask_copy(&new_cpus, cpu_active_mask);3998new_mems = node_states[N_MEMORY];39994000/*4001* If subpartitions_cpus is populated, it is likely that the check4002* below will produce a false positive on cpus_updated when the cpu4003* list isn't changed. It is extra work, but it is better to be safe.4004*/4005cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||4006!cpumask_empty(subpartitions_cpus);4007mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);40084009/* For v1, synchronize cpus_allowed to cpu_active_mask */4010if (cpus_updated) {4011cpuset_force_rebuild();4012spin_lock_irq(&callback_lock);4013if (!on_dfl)4014cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);4015/*4016* Make sure that CPUs allocated to child partitions4017* do not show up in effective_cpus. If no CPU is left,4018* we clear the subpartitions_cpus & let the child partitions4019* fight for the CPUs again.4020*/4021if (!cpumask_empty(subpartitions_cpus)) {4022if (cpumask_subset(&new_cpus, subpartitions_cpus)) {4023top_cpuset.nr_subparts = 0;4024cpumask_clear(subpartitions_cpus);4025} else {4026cpumask_andnot(&new_cpus, &new_cpus,4027subpartitions_cpus);4028}4029}4030cpumask_copy(top_cpuset.effective_cpus, &new_cpus);4031spin_unlock_irq(&callback_lock);4032/* we don't mess with cpumasks of tasks in top_cpuset */4033}40344035/* synchronize mems_allowed to N_MEMORY */4036if (mems_updated) {4037spin_lock_irq(&callback_lock);4038if (!on_dfl)4039top_cpuset.mems_allowed = new_mems;4040top_cpuset.effective_mems = new_mems;4041spin_unlock_irq(&callback_lock);4042cpuset_update_tasks_nodemask(&top_cpuset);4043}40444045mutex_unlock(&cpuset_mutex);40464047/* if cpus or mems changed, we need to propagate to descendants */4048if (cpus_updated || mems_updated) {4049struct cpuset *cs;4050struct cgroup_subsys_state *pos_css;40514052rcu_read_lock();4053cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {4054if (cs == &top_cpuset || !css_tryget_online(&cs->css))4055continue;4056rcu_read_unlock();40574058cpuset_hotplug_update_tasks(cs, ptmp);40594060rcu_read_lock();4061css_put(&cs->css);4062}4063rcu_read_unlock();4064}40654066/* rebuild sched domains if necessary */4067if (force_sd_rebuild)4068rebuild_sched_domains_cpuslocked();40694070free_tmpmasks(ptmp);4071}40724073void cpuset_update_active_cpus(void)4074{4075/*4076* We're inside cpu hotplug critical region which usually nests4077* inside cgroup synchronization. Bounce actual hotplug processing4078* to a work item to avoid reverse locking order.4079*/4080cpuset_handle_hotplug();4081}40824083/*4084* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].4085* Call this routine anytime after node_states[N_MEMORY] changes.4086* See cpuset_update_active_cpus() for CPU hotplug handling.4087*/4088static int cpuset_track_online_nodes(struct notifier_block *self,4089unsigned long action, void *arg)4090{4091cpuset_handle_hotplug();4092return NOTIFY_OK;4093}40944095/**4096* cpuset_init_smp - initialize cpus_allowed4097*4098* Description: Finish top cpuset after cpu, node maps are initialized4099*/4100void __init cpuset_init_smp(void)4101{4102/*4103* cpus_allowd/mems_allowed set to v2 values in the initial4104* cpuset_bind() call will be reset to v1 values in another4105* cpuset_bind() call when v1 cpuset is mounted.4106*/4107top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;41084109cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);4110top_cpuset.effective_mems = node_states[N_MEMORY];41114112hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);41134114cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);4115BUG_ON(!cpuset_migrate_mm_wq);4116}41174118/**4119* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.4120* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.4121* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.4122*4123* Description: Returns the cpumask_var_t cpus_allowed of the cpuset4124* attached to the specified @tsk. Guaranteed to return some non-empty4125* subset of cpu_active_mask, even if this means going outside the4126* tasks cpuset, except when the task is in the top cpuset.4127**/41284129void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)4130{4131unsigned long flags;4132struct cpuset *cs;41334134spin_lock_irqsave(&callback_lock, flags);41354136cs = task_cs(tsk);4137if (cs != &top_cpuset)4138guarantee_active_cpus(tsk, pmask);4139/*4140* Tasks in the top cpuset won't get update to their cpumasks4141* when a hotplug online/offline event happens. So we include all4142* offline cpus in the allowed cpu list.4143*/4144if ((cs == &top_cpuset) || cpumask_empty(pmask)) {4145const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);41464147/*4148* We first exclude cpus allocated to partitions. If there is no4149* allowable online cpu left, we fall back to all possible cpus.4150*/4151cpumask_andnot(pmask, possible_mask, subpartitions_cpus);4152if (!cpumask_intersects(pmask, cpu_active_mask))4153cpumask_copy(pmask, possible_mask);4154}41554156spin_unlock_irqrestore(&callback_lock, flags);4157}41584159/**4160* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.4161* @tsk: pointer to task_struct with which the scheduler is struggling4162*4163* Description: In the case that the scheduler cannot find an allowed cpu in4164* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy4165* mode however, this value is the same as task_cs(tsk)->effective_cpus,4166* which will not contain a sane cpumask during cases such as cpu hotplugging.4167* This is the absolute last resort for the scheduler and it is only used if4168* _every_ other avenue has been traveled.4169*4170* Returns true if the affinity of @tsk was changed, false otherwise.4171**/41724173bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)4174{4175const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);4176const struct cpumask *cs_mask;4177bool changed = false;41784179rcu_read_lock();4180cs_mask = task_cs(tsk)->cpus_allowed;4181if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {4182do_set_cpus_allowed(tsk, cs_mask);4183changed = true;4184}4185rcu_read_unlock();41864187/*4188* We own tsk->cpus_allowed, nobody can change it under us.4189*4190* But we used cs && cs->cpus_allowed lockless and thus can4191* race with cgroup_attach_task() or update_cpumask() and get4192* the wrong tsk->cpus_allowed. However, both cases imply the4193* subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()4194* which takes task_rq_lock().4195*4196* If we are called after it dropped the lock we must see all4197* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary4198* set any mask even if it is not right from task_cs() pov,4199* the pending set_cpus_allowed_ptr() will fix things.4200*4201* select_fallback_rq() will fix things ups and set cpu_possible_mask4202* if required.4203*/4204return changed;4205}42064207void __init cpuset_init_current_mems_allowed(void)4208{4209nodes_setall(current->mems_allowed);4210}42114212/**4213* cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.4214* @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.4215*4216* Description: Returns the nodemask_t mems_allowed of the cpuset4217* attached to the specified @tsk. Guaranteed to return some non-empty4218* subset of node_states[N_MEMORY], even if this means going outside the4219* tasks cpuset.4220**/42214222nodemask_t cpuset_mems_allowed(struct task_struct *tsk)4223{4224nodemask_t mask;4225unsigned long flags;42264227spin_lock_irqsave(&callback_lock, flags);4228guarantee_online_mems(task_cs(tsk), &mask);4229spin_unlock_irqrestore(&callback_lock, flags);42304231return mask;4232}42334234/**4235* cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed4236* @nodemask: the nodemask to be checked4237*4238* Are any of the nodes in the nodemask allowed in current->mems_allowed?4239*/4240int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)4241{4242return nodes_intersects(*nodemask, current->mems_allowed);4243}42444245/*4246* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or4247* mem_hardwall ancestor to the specified cpuset. Call holding4248* callback_lock. If no ancestor is mem_exclusive or mem_hardwall4249* (an unusual configuration), then returns the root cpuset.4250*/4251static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)4252{4253while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))4254cs = parent_cs(cs);4255return cs;4256}42574258/*4259* cpuset_current_node_allowed - Can current task allocate on a memory node?4260* @node: is this an allowed node?4261* @gfp_mask: memory allocation flags4262*4263* If we're in interrupt, yes, we can always allocate. If @node is set in4264* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this4265* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,4266* yes. If current has access to memory reserves as an oom victim, yes.4267* Otherwise, no.4268*4269* GFP_USER allocations are marked with the __GFP_HARDWALL bit,4270* and do not allow allocations outside the current tasks cpuset4271* unless the task has been OOM killed.4272* GFP_KERNEL allocations are not so marked, so can escape to the4273* nearest enclosing hardwalled ancestor cpuset.4274*4275* Scanning up parent cpusets requires callback_lock. The4276* __alloc_pages() routine only calls here with __GFP_HARDWALL bit4277* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the4278* current tasks mems_allowed came up empty on the first pass over4279* the zonelist. So only GFP_KERNEL allocations, if all nodes in the4280* cpuset are short of memory, might require taking the callback_lock.4281*4282* The first call here from mm/page_alloc:get_page_from_freelist()4283* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,4284* so no allocation on a node outside the cpuset is allowed (unless4285* in interrupt, of course).4286*4287* The second pass through get_page_from_freelist() doesn't even call4288* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()4289* variable 'wait' is not set, and the bit ALLOC_CPUSET is not set4290* in alloc_flags. That logic and the checks below have the combined4291* affect that:4292* in_interrupt - any node ok (current task context irrelevant)4293* GFP_ATOMIC - any node ok4294* tsk_is_oom_victim - any node ok4295* GFP_KERNEL - any node in enclosing hardwalled cpuset ok4296* GFP_USER - only nodes in current tasks mems allowed ok.4297*/4298bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)4299{4300struct cpuset *cs; /* current cpuset ancestors */4301bool allowed; /* is allocation in zone z allowed? */4302unsigned long flags;43034304if (in_interrupt())4305return true;4306if (node_isset(node, current->mems_allowed))4307return true;4308/*4309* Allow tasks that have access to memory reserves because they have4310* been OOM killed to get memory anywhere.4311*/4312if (unlikely(tsk_is_oom_victim(current)))4313return true;4314if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */4315return false;43164317if (current->flags & PF_EXITING) /* Let dying task have memory */4318return true;43194320/* Not hardwall and node outside mems_allowed: scan up cpusets */4321spin_lock_irqsave(&callback_lock, flags);43224323cs = nearest_hardwall_ancestor(task_cs(current));4324allowed = node_isset(node, cs->mems_allowed);43254326spin_unlock_irqrestore(&callback_lock, flags);4327return allowed;4328}43294330bool cpuset_node_allowed(struct cgroup *cgroup, int nid)4331{4332struct cgroup_subsys_state *css;4333struct cpuset *cs;4334bool allowed;43354336/*4337* In v1, mem_cgroup and cpuset are unlikely in the same hierarchy4338* and mems_allowed is likely to be empty even if we could get to it,4339* so return true to avoid taking a global lock on the empty check.4340*/4341if (!cpuset_v2())4342return true;43434344css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);4345if (!css)4346return true;43474348/*4349* Normally, accessing effective_mems would require the cpuset_mutex4350* or callback_lock - but node_isset is atomic and the reference4351* taken via cgroup_get_e_css is sufficient to protect css.4352*4353* Since this interface is intended for use by migration paths, we4354* relax locking here to avoid taking global locks - while accepting4355* there may be rare scenarios where the result may be innaccurate.4356*4357* Reclaim and migration are subject to these same race conditions, and4358* cannot make strong isolation guarantees, so this is acceptable.4359*/4360cs = container_of(css, struct cpuset, css);4361allowed = node_isset(nid, cs->effective_mems);4362css_put(css);4363return allowed;4364}43654366/**4367* cpuset_spread_node() - On which node to begin search for a page4368* @rotor: round robin rotor4369*4370* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for4371* tasks in a cpuset with is_spread_page or is_spread_slab set),4372* and if the memory allocation used cpuset_mem_spread_node()4373* to determine on which node to start looking, as it will for4374* certain page cache or slab cache pages such as used for file4375* system buffers and inode caches, then instead of starting on the4376* local node to look for a free page, rather spread the starting4377* node around the tasks mems_allowed nodes.4378*4379* We don't have to worry about the returned node being offline4380* because "it can't happen", and even if it did, it would be ok.4381*4382* The routines calling guarantee_online_mems() are careful to4383* only set nodes in task->mems_allowed that are online. So it4384* should not be possible for the following code to return an4385* offline node. But if it did, that would be ok, as this routine4386* is not returning the node where the allocation must be, only4387* the node where the search should start. The zonelist passed to4388* __alloc_pages() will include all nodes. If the slab allocator4389* is passed an offline node, it will fall back to the local node.4390* See kmem_cache_alloc_node().4391*/4392static int cpuset_spread_node(int *rotor)4393{4394return *rotor = next_node_in(*rotor, current->mems_allowed);4395}43964397/**4398* cpuset_mem_spread_node() - On which node to begin search for a file page4399*/4400int cpuset_mem_spread_node(void)4401{4402if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)4403current->cpuset_mem_spread_rotor =4404node_random(¤t->mems_allowed);44054406return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);4407}44084409/**4410* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?4411* @tsk1: pointer to task_struct of some task.4412* @tsk2: pointer to task_struct of some other task.4413*4414* Description: Return true if @tsk1's mems_allowed intersects the4415* mems_allowed of @tsk2. Used by the OOM killer to determine if4416* one of the task's memory usage might impact the memory available4417* to the other.4418**/44194420int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,4421const struct task_struct *tsk2)4422{4423return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);4424}44254426/**4427* cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed4428*4429* Description: Prints current's name, cpuset name, and cached copy of its4430* mems_allowed to the kernel log.4431*/4432void cpuset_print_current_mems_allowed(void)4433{4434struct cgroup *cgrp;44354436rcu_read_lock();44374438cgrp = task_cs(current)->css.cgroup;4439pr_cont(",cpuset=");4440pr_cont_cgroup_name(cgrp);4441pr_cont(",mems_allowed=%*pbl",4442nodemask_pr_args(¤t->mems_allowed));44434444rcu_read_unlock();4445}44464447/* Display task mems_allowed in /proc/<pid>/status file. */4448void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)4449{4450seq_printf(m, "Mems_allowed:\t%*pb\n",4451nodemask_pr_args(&task->mems_allowed));4452seq_printf(m, "Mems_allowed_list:\t%*pbl\n",4453nodemask_pr_args(&task->mems_allowed));4454}445544564457