Path: blob/master/arch/powerpc/platforms/pseries/mobility.c
29274 views
// SPDX-License-Identifier: GPL-2.0-only1/*2* Support for Partition Mobility/Migration3*4* Copyright (C) 2010 Nathan Fontenot5* Copyright (C) 2010 IBM Corporation6*/789#define pr_fmt(fmt) "mobility: " fmt1011#include <linux/cpu.h>12#include <linux/kernel.h>13#include <linux/kobject.h>14#include <linux/nmi.h>15#include <linux/sched.h>16#include <linux/smp.h>17#include <linux/stat.h>18#include <linux/stop_machine.h>19#include <linux/completion.h>20#include <linux/device.h>21#include <linux/delay.h>22#include <linux/slab.h>23#include <linux/stringify.h>2425#include <asm/machdep.h>26#include <asm/nmi.h>27#include <asm/rtas.h>28#include "pseries.h"29#include "vas.h" /* vas_migration_handler() */30#include "papr-hvpipe.h" /* hvpipe_migration_handler() */31#include "../../kernel/cacheinfo.h"3233static struct kobject *mobility_kobj;3435struct update_props_workarea {36__be32 phandle;37__be32 state;38__be64 reserved;39__be32 nprops;40} __packed;4142#define NODE_ACTION_MASK 0xff00000043#define NODE_COUNT_MASK 0x00ffffff4445#define DELETE_DT_NODE 0x0100000046#define UPDATE_DT_NODE 0x0200000047#define ADD_DT_NODE 0x030000004849#define MIGRATION_SCOPE (1)50#define PRRN_SCOPE -25152#ifdef CONFIG_PPC_WATCHDOG53static unsigned int nmi_wd_lpm_factor = 200;5455#ifdef CONFIG_SYSCTL56static const struct ctl_table nmi_wd_lpm_factor_ctl_table[] = {57{58.procname = "nmi_wd_lpm_factor",59.data = &nmi_wd_lpm_factor,60.maxlen = sizeof(int),61.mode = 0644,62.proc_handler = proc_douintvec_minmax,63},64};6566static int __init register_nmi_wd_lpm_factor_sysctl(void)67{68register_sysctl("kernel", nmi_wd_lpm_factor_ctl_table);6970return 0;71}72device_initcall(register_nmi_wd_lpm_factor_sysctl);73#endif /* CONFIG_SYSCTL */74#endif /* CONFIG_PPC_WATCHDOG */7576static int mobility_rtas_call(int token, char *buf, s32 scope)77{78int rc;7980spin_lock(&rtas_data_buf_lock);8182memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);83rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);84memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);8586spin_unlock(&rtas_data_buf_lock);87return rc;88}8990static int delete_dt_node(struct device_node *dn)91{92struct device_node *pdn;93bool is_platfac;9495pdn = of_get_parent(dn);96is_platfac = of_node_is_type(dn, "ibm,platform-facilities") ||97of_node_is_type(pdn, "ibm,platform-facilities");98of_node_put(pdn);99100/*101* The drivers that bind to nodes in the platform-facilities102* hierarchy don't support node removal, and the removal directive103* from firmware is always followed by an add of an equivalent104* node. The capability (e.g. RNG, encryption, compression)105* represented by the node is never interrupted by the migration.106* So ignore changes to this part of the tree.107*/108if (is_platfac) {109pr_notice("ignoring remove operation for %pOFfp\n", dn);110return 0;111}112113pr_debug("removing node %pOFfp\n", dn);114dlpar_detach_node(dn);115return 0;116}117118static int update_dt_property(struct device_node *dn, struct property **prop,119const char *name, u32 vd, char *value)120{121struct property *new_prop = *prop;122int more = 0;123124/* A negative 'vd' value indicates that only part of the new property125* value is contained in the buffer and we need to call126* ibm,update-properties again to get the rest of the value.127*128* A negative value is also the two's compliment of the actual value.129*/130if (vd & 0x80000000) {131vd = ~vd + 1;132more = 1;133}134135if (new_prop) {136/* partial property fixup */137char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);138if (!new_data)139return -ENOMEM;140141memcpy(new_data, new_prop->value, new_prop->length);142memcpy(new_data + new_prop->length, value, vd);143144kfree(new_prop->value);145new_prop->value = new_data;146new_prop->length += vd;147} else {148new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);149if (!new_prop)150return -ENOMEM;151152new_prop->name = kstrdup(name, GFP_KERNEL);153if (!new_prop->name) {154kfree(new_prop);155return -ENOMEM;156}157158new_prop->length = vd;159new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);160if (!new_prop->value) {161kfree(new_prop->name);162kfree(new_prop);163return -ENOMEM;164}165166memcpy(new_prop->value, value, vd);167*prop = new_prop;168}169170if (!more) {171pr_debug("updating node %pOF property %s\n", dn, name);172of_update_property(dn, new_prop);173*prop = NULL;174}175176return 0;177}178179static int update_dt_node(struct device_node *dn, s32 scope)180{181struct update_props_workarea *upwa;182struct property *prop = NULL;183int i, rc, rtas_rc;184char *prop_data;185char *rtas_buf;186int update_properties_token;187u32 nprops;188u32 vd;189190update_properties_token = rtas_function_token(RTAS_FN_IBM_UPDATE_PROPERTIES);191if (update_properties_token == RTAS_UNKNOWN_SERVICE)192return -EINVAL;193194rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);195if (!rtas_buf)196return -ENOMEM;197198upwa = (struct update_props_workarea *)&rtas_buf[0];199upwa->phandle = cpu_to_be32(dn->phandle);200201do {202rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,203scope);204if (rtas_rc < 0)205break;206207prop_data = rtas_buf + sizeof(*upwa);208nprops = be32_to_cpu(upwa->nprops);209210/* On the first call to ibm,update-properties for a node the211* first property value descriptor contains an empty212* property name, the property value length encoded as u32,213* and the property value is the node path being updated.214*/215if (*prop_data == 0) {216prop_data++;217vd = be32_to_cpu(*(__be32 *)prop_data);218prop_data += vd + sizeof(vd);219nprops--;220}221222for (i = 0; i < nprops; i++) {223char *prop_name;224225prop_name = prop_data;226prop_data += strlen(prop_name) + 1;227vd = be32_to_cpu(*(__be32 *)prop_data);228prop_data += sizeof(vd);229230switch (vd) {231case 0x00000000:232/* name only property, nothing to do */233break;234235case 0x80000000:236of_remove_property(dn, of_find_property(dn,237prop_name, NULL));238prop = NULL;239break;240241default:242rc = update_dt_property(dn, &prop, prop_name,243vd, prop_data);244if (rc) {245pr_err("updating %s property failed: %d\n",246prop_name, rc);247}248249prop_data += vd;250break;251}252253cond_resched();254}255256cond_resched();257} while (rtas_rc == 1);258259kfree(rtas_buf);260return 0;261}262263static int add_dt_node(struct device_node *parent_dn, __be32 drc_index)264{265struct device_node *dn;266int rc;267268dn = dlpar_configure_connector(drc_index, parent_dn);269if (!dn)270return -ENOENT;271272/*273* Since delete_dt_node() ignores this node type, this is the274* necessary counterpart. We also know that a platform-facilities275* node returned from dlpar_configure_connector() has children276* attached, and dlpar_attach_node() only adds the parent, leaking277* the children. So ignore these on the add side for now.278*/279if (of_node_is_type(dn, "ibm,platform-facilities")) {280pr_notice("ignoring add operation for %pOF\n", dn);281dlpar_free_cc_nodes(dn);282return 0;283}284285rc = dlpar_attach_node(dn, parent_dn);286if (rc)287dlpar_free_cc_nodes(dn);288289pr_debug("added node %pOFfp\n", dn);290291return rc;292}293294static int pseries_devicetree_update(s32 scope)295{296char *rtas_buf;297__be32 *data;298int update_nodes_token;299int rc;300301update_nodes_token = rtas_function_token(RTAS_FN_IBM_UPDATE_NODES);302if (update_nodes_token == RTAS_UNKNOWN_SERVICE)303return 0;304305rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);306if (!rtas_buf)307return -ENOMEM;308309do {310rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);311if (rc && rc != 1)312break;313314data = (__be32 *)rtas_buf + 4;315while (be32_to_cpu(*data) & NODE_ACTION_MASK) {316int i;317u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;318u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;319320data++;321322for (i = 0; i < node_count; i++) {323struct device_node *np;324__be32 phandle = *data++;325__be32 drc_index;326327np = of_find_node_by_phandle(be32_to_cpu(phandle));328if (!np) {329pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n",330be32_to_cpu(phandle), action);331continue;332}333334switch (action) {335case DELETE_DT_NODE:336delete_dt_node(np);337break;338case UPDATE_DT_NODE:339update_dt_node(np, scope);340break;341case ADD_DT_NODE:342drc_index = *data++;343add_dt_node(np, drc_index);344break;345}346347of_node_put(np);348cond_resched();349}350}351352cond_resched();353} while (rc == 1);354355kfree(rtas_buf);356return rc;357}358359void post_mobility_fixup(void)360{361int rc;362363rtas_activate_firmware();364365/*366* We don't want CPUs to go online/offline while the device367* tree is being updated.368*/369cpus_read_lock();370371/*372* It's common for the destination firmware to replace cache373* nodes. Release all of the cacheinfo hierarchy's references374* before updating the device tree.375*/376cacheinfo_teardown();377378rc = pseries_devicetree_update(MIGRATION_SCOPE);379if (rc)380pr_err("device tree update failed: %d\n", rc);381382cacheinfo_rebuild();383384cpus_read_unlock();385386/* Possibly switch to a new L1 flush type */387pseries_setup_security_mitigations();388389/* Reinitialise system information for hv-24x7 */390read_24x7_sys_info();391392return;393}394395static int poll_vasi_state(u64 handle, unsigned long *res)396{397unsigned long retbuf[PLPAR_HCALL_BUFSIZE];398long hvrc;399int ret;400401hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle);402switch (hvrc) {403case H_SUCCESS:404ret = 0;405*res = retbuf[0];406break;407case H_PARAMETER:408ret = -EINVAL;409break;410case H_FUNCTION:411ret = -EOPNOTSUPP;412break;413case H_HARDWARE:414default:415pr_err("unexpected H_VASI_STATE result %ld\n", hvrc);416ret = -EIO;417break;418}419return ret;420}421422static int wait_for_vasi_session_suspending(u64 handle)423{424unsigned long state;425int ret;426427/*428* Wait for transition from H_VASI_ENABLED to429* H_VASI_SUSPENDING. Treat anything else as an error.430*/431while (true) {432ret = poll_vasi_state(handle, &state);433434if (ret != 0 || state == H_VASI_SUSPENDING) {435break;436} else if (state == H_VASI_ENABLED) {437ssleep(1);438} else {439pr_err("unexpected H_VASI_STATE result %lu\n", state);440ret = -EIO;441break;442}443}444445/*446* Proceed even if H_VASI_STATE is unavailable. If H_JOIN or447* ibm,suspend-me are also unimplemented, we'll recover then.448*/449if (ret == -EOPNOTSUPP)450ret = 0;451452return ret;453}454455static void wait_for_vasi_session_completed(u64 handle)456{457unsigned long state = 0;458int ret;459460pr_info("waiting for memory transfer to complete...\n");461462/*463* Wait for transition from H_VASI_RESUMED to H_VASI_COMPLETED.464*/465while (true) {466ret = poll_vasi_state(handle, &state);467468/*469* If the memory transfer is already complete and the migration470* has been cleaned up by the hypervisor, H_PARAMETER is return,471* which is translate in EINVAL by poll_vasi_state().472*/473if (ret == -EINVAL || (!ret && state == H_VASI_COMPLETED)) {474pr_info("memory transfer completed.\n");475break;476}477478if (ret) {479pr_err("H_VASI_STATE return error (%d)\n", ret);480break;481}482483if (state != H_VASI_RESUMED) {484pr_err("unexpected H_VASI_STATE result %lu\n", state);485break;486}487488msleep(500);489}490}491492static void prod_single(unsigned int target_cpu)493{494long hvrc;495int hwid;496497hwid = get_hard_smp_processor_id(target_cpu);498hvrc = plpar_hcall_norets(H_PROD, hwid);499if (hvrc == H_SUCCESS)500return;501pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",502target_cpu, hwid, hvrc);503}504505static void prod_others(void)506{507unsigned int cpu;508509for_each_online_cpu(cpu) {510if (cpu != smp_processor_id())511prod_single(cpu);512}513}514515static u16 clamp_slb_size(void)516{517#ifdef CONFIG_PPC_64S_HASH_MMU518u16 prev = mmu_slb_size;519520slb_set_size(SLB_MIN_SIZE);521522return prev;523#else524return 0;525#endif526}527528static int do_suspend(void)529{530u16 saved_slb_size;531int status;532int ret;533534pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());535536/*537* The destination processor model may have fewer SLB entries538* than the source. We reduce mmu_slb_size to a safe minimum539* before suspending in order to minimize the possibility of540* programming non-existent entries on the destination. If541* suspend fails, we restore it before returning. On success542* the OF reconfig path will update it from the new device543* tree after resuming on the destination.544*/545saved_slb_size = clamp_slb_size();546547ret = rtas_ibm_suspend_me(&status);548if (ret != 0) {549pr_err("ibm,suspend-me error: %d\n", status);550slb_set_size(saved_slb_size);551}552553return ret;554}555556/**557* struct pseries_suspend_info - State shared between CPUs for join/suspend.558* @counter: Threads are to increment this upon resuming from suspend559* or if an error is received from H_JOIN. The thread which performs560* the first increment (i.e. sets it to 1) is responsible for561* waking the other threads.562* @done: False if join/suspend is in progress. True if the operation is563* complete (successful or not).564*/565struct pseries_suspend_info {566atomic_t counter;567bool done;568};569570static int do_join(void *arg)571{572struct pseries_suspend_info *info = arg;573atomic_t *counter = &info->counter;574long hvrc;575int ret;576577retry:578/* Must ensure MSR.EE off for H_JOIN. */579hard_irq_disable();580hvrc = plpar_hcall_norets(H_JOIN);581582switch (hvrc) {583case H_CONTINUE:584/*585* All other CPUs are offline or in H_JOIN. This CPU586* attempts the suspend.587*/588ret = do_suspend();589break;590case H_SUCCESS:591/*592* The suspend is complete and this cpu has received a593* prod, or we've received a stray prod from unrelated594* code (e.g. paravirt spinlocks) and we need to join595* again.596*597* This barrier orders the return from H_JOIN above vs598* the load of info->done. It pairs with the barrier599* in the wakeup/prod path below.600*/601smp_mb();602if (READ_ONCE(info->done) == false) {603pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying",604smp_processor_id());605goto retry;606}607ret = 0;608break;609case H_BAD_MODE:610case H_HARDWARE:611default:612ret = -EIO;613pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",614hvrc, smp_processor_id());615break;616}617618if (atomic_inc_return(counter) == 1) {619pr_info("CPU %u waking all threads\n", smp_processor_id());620WRITE_ONCE(info->done, true);621/*622* This barrier orders the store to info->done vs subsequent623* H_PRODs to wake the other CPUs. It pairs with the barrier624* in the H_SUCCESS case above.625*/626smp_mb();627prod_others();628}629/*630* Execution may have been suspended for several seconds, so reset631* the watchdogs. touch_nmi_watchdog() also touches the soft lockup632* watchdog.633*/634rcu_cpu_stall_reset();635touch_nmi_watchdog();636637return ret;638}639640/*641* Abort reason code byte 0. We use only the 'Migrating partition' value.642*/643enum vasi_aborting_entity {644ORCHESTRATOR = 1,645VSP_SOURCE = 2,646PARTITION_FIRMWARE = 3,647PLATFORM_FIRMWARE = 4,648VSP_TARGET = 5,649MIGRATING_PARTITION = 6,650};651652static void pseries_cancel_migration(u64 handle, int err)653{654u32 reason_code;655u32 detail;656u8 entity;657long hvrc;658659entity = MIGRATING_PARTITION;660detail = abs(err) & 0xffffff;661reason_code = (entity << 24) | detail;662663hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle,664H_VASI_SIGNAL_CANCEL, reason_code);665if (hvrc)666pr_err("H_VASI_SIGNAL error: %ld\n", hvrc);667}668669static int pseries_suspend(u64 handle)670{671const unsigned int max_attempts = 5;672unsigned int retry_interval_ms = 1;673unsigned int attempt = 1;674int ret;675676while (true) {677struct pseries_suspend_info info;678unsigned long vasi_state;679int vasi_err;680681info = (struct pseries_suspend_info) {682.counter = ATOMIC_INIT(0),683.done = false,684};685686ret = stop_machine(do_join, &info, cpu_online_mask);687if (ret == 0)688break;689/*690* Encountered an error. If the VASI stream is still691* in Suspending state, it's likely a transient692* condition related to some device in the partition693* and we can retry in the hope that the cause has694* cleared after some delay.695*696* A better design would allow drivers etc to prepare697* for the suspend and avoid conditions which prevent698* the suspend from succeeding. For now, we have this699* mitigation.700*/701pr_notice("Partition suspend attempt %u of %u error: %d\n",702attempt, max_attempts, ret);703704if (attempt == max_attempts)705break;706707vasi_err = poll_vasi_state(handle, &vasi_state);708if (vasi_err == 0) {709if (vasi_state != H_VASI_SUSPENDING) {710pr_notice("VASI state %lu after failed suspend\n",711vasi_state);712break;713}714} else if (vasi_err != -EOPNOTSUPP) {715pr_err("VASI state poll error: %d", vasi_err);716break;717}718719pr_notice("Will retry partition suspend after %u ms\n",720retry_interval_ms);721722msleep(retry_interval_ms);723retry_interval_ms *= 10;724attempt++;725}726727return ret;728}729730static int pseries_migrate_partition(u64 handle)731{732int ret;733unsigned int factor = 0;734735#ifdef CONFIG_PPC_WATCHDOG736factor = nmi_wd_lpm_factor;737#endif738/*739* When the migration is initiated, the hypervisor changes VAS740* mappings to prepare before OS gets the notification and741* closes all VAS windows. NX generates continuous faults during742* this time and the user space can not differentiate these743* faults from the migration event. So reduce this time window744* by closing VAS windows at the beginning of this function.745*/746vas_migration_handler(VAS_SUSPEND);747hvpipe_migration_handler(HVPIPE_SUSPEND);748749ret = wait_for_vasi_session_suspending(handle);750if (ret)751goto out;752753if (factor)754watchdog_hardlockup_set_timeout_pct(factor);755756ret = pseries_suspend(handle);757if (ret == 0) {758post_mobility_fixup();759/*760* Wait until the memory transfer is complete, so that the user761* space process returns from the syscall after the transfer is762* complete. This allows the user hooks to be executed at the763* right time.764*/765wait_for_vasi_session_completed(handle);766} else767pseries_cancel_migration(handle, ret);768769if (factor)770watchdog_hardlockup_set_timeout_pct(0);771772out:773vas_migration_handler(VAS_RESUME);774hvpipe_migration_handler(HVPIPE_RESUME);775776return ret;777}778779int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)780{781return pseries_migrate_partition(handle);782}783784static ssize_t migration_store(const struct class *class,785const struct class_attribute *attr, const char *buf,786size_t count)787{788u64 streamid;789int rc;790791rc = kstrtou64(buf, 0, &streamid);792if (rc)793return rc;794795rc = pseries_migrate_partition(streamid);796if (rc)797return rc;798799return count;800}801802/*803* Used by drmgr to determine the kernel behavior of the migration interface.804*805* Version 1: Performs all PAPR requirements for migration including806* firmware activation and device tree update.807*/808#define MIGRATION_API_VERSION 1809810static CLASS_ATTR_WO(migration);811static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION));812813static int __init mobility_sysfs_init(void)814{815int rc;816817mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);818if (!mobility_kobj)819return -ENOMEM;820821rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);822if (rc)823pr_err("unable to create migration sysfs file (%d)\n", rc);824825rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr);826if (rc)827pr_err("unable to create api_version sysfs file (%d)\n", rc);828829return 0;830}831machine_device_initcall(pseries, mobility_sysfs_init);832833834