// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support for Partition Mobility/Migration
 *
 * Copyright (C) 2010 Nathan Fontenot
 * Copyright (C) 2010 IBM Corporation
 */
8
Nathan Lynch494a66f2019-06-27 00:30:43 -05009
10#define pr_fmt(fmt) "mobility: " fmt
11
Nathan Lynche59a1752019-06-11 23:45:05 -050012#include <linux/cpu.h>
Nathan Fontenot410bccf2010-09-10 09:42:36 +000013#include <linux/kernel.h>
14#include <linux/kobject.h>
Nathan Lynch9327dc02020-12-07 15:51:44 -060015#include <linux/nmi.h>
Nathan Lynchccfb5bd2019-08-02 14:29:26 -050016#include <linux/sched.h>
Nathan Fontenot410bccf2010-09-10 09:42:36 +000017#include <linux/smp.h>
Paul Gortmakerb56eade2011-05-27 13:27:45 -040018#include <linux/stat.h>
Nathan Lynch9327dc02020-12-07 15:51:44 -060019#include <linux/stop_machine.h>
Nathan Fontenot410bccf2010-09-10 09:42:36 +000020#include <linux/completion.h>
21#include <linux/device.h>
22#include <linux/delay.h>
23#include <linux/slab.h>
Christophe Leroy5c35a022018-07-05 16:24:59 +000024#include <linux/stringify.h>
Nathan Fontenot410bccf2010-09-10 09:42:36 +000025
Michael Ellerman8e83e902014-07-16 12:02:43 +100026#include <asm/machdep.h>
Nathan Fontenot410bccf2010-09-10 09:42:36 +000027#include <asm/rtas.h>
28#include "pseries.h"
Nathan Lynche610a462019-06-11 23:45:06 -050029#include "../../kernel/cacheinfo.h"
Nathan Fontenot410bccf2010-09-10 09:42:36 +000030
31static struct kobject *mobility_kobj;
32
33struct update_props_workarea {
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -080034 __be32 phandle;
35 __be32 state;
36 __be64 reserved;
37 __be32 nprops;
Tyrel Datwylerd0ef4402013-08-14 22:23:47 -070038} __packed;
Nathan Fontenot410bccf2010-09-10 09:42:36 +000039
/*
 * Each action word returned by ibm,update-nodes packs the action in the
 * top byte and the number of affected nodes in the low 24 bits.
 */
#define NODE_ACTION_MASK	0xff000000
#define NODE_COUNT_MASK		0x00ffffff

/* Values of the action byte, already shifted into place. */
#define DELETE_DT_NODE	0x01000000
#define UPDATE_DT_NODE	0x02000000
#define ADD_DT_NODE	0x03000000

/*
 * Scope argument handed to the update RTAS calls. The negative value is
 * parenthesized so the macro expands safely inside any expression.
 */
#define MIGRATION_SCOPE	(1)
#define PRRN_SCOPE	(-2)
Nathan Fontenot762ec152013-04-24 05:47:11 +000049
50static int mobility_rtas_call(int token, char *buf, s32 scope)
Nathan Fontenot410bccf2010-09-10 09:42:36 +000051{
52 int rc;
53
54 spin_lock(&rtas_data_buf_lock);
55
56 memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
Nathan Fontenot762ec152013-04-24 05:47:11 +000057 rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
Nathan Fontenot410bccf2010-09-10 09:42:36 +000058 memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
59
60 spin_unlock(&rtas_data_buf_lock);
61 return rc;
62}
63
Nathan Lynch2efd7f62020-12-07 15:52:00 -060064static int delete_dt_node(struct device_node *dn)
Nathan Fontenot410bccf2010-09-10 09:42:36 +000065{
Nathan Lynch319fa1a2021-10-20 14:47:03 -050066 struct device_node *pdn;
67 bool is_platfac;
68
69 pdn = of_get_parent(dn);
70 is_platfac = of_node_is_type(dn, "ibm,platform-facilities") ||
71 of_node_is_type(pdn, "ibm,platform-facilities");
72 of_node_put(pdn);
73
74 /*
75 * The drivers that bind to nodes in the platform-facilities
76 * hierarchy don't support node removal, and the removal directive
77 * from firmware is always followed by an add of an equivalent
78 * node. The capability (e.g. RNG, encryption, compression)
79 * represented by the node is never interrupted by the migration.
80 * So ignore changes to this part of the tree.
81 */
82 if (is_platfac) {
83 pr_notice("ignoring remove operation for %pOFfp\n", dn);
84 return 0;
85 }
86
Nathan Lynch5d8b1f92019-06-27 00:30:44 -050087 pr_debug("removing node %pOFfp\n", dn);
Nathan Fontenot410bccf2010-09-10 09:42:36 +000088 dlpar_detach_node(dn);
89 return 0;
90}
91
92static int update_dt_property(struct device_node *dn, struct property **prop,
93 const char *name, u32 vd, char *value)
94{
95 struct property *new_prop = *prop;
Nathan Fontenot410bccf2010-09-10 09:42:36 +000096 int more = 0;
97
98 /* A negative 'vd' value indicates that only part of the new property
99 * value is contained in the buffer and we need to call
100 * ibm,update-properties again to get the rest of the value.
101 *
102 * A negative value is also the two's compliment of the actual value.
103 */
104 if (vd & 0x80000000) {
105 vd = ~vd + 1;
106 more = 1;
107 }
108
109 if (new_prop) {
110 /* partial property fixup */
111 char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
112 if (!new_data)
113 return -ENOMEM;
114
115 memcpy(new_data, new_prop->value, new_prop->length);
116 memcpy(new_data + new_prop->length, value, vd);
117
118 kfree(new_prop->value);
119 new_prop->value = new_data;
120 new_prop->length += vd;
121 } else {
122 new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
123 if (!new_prop)
124 return -ENOMEM;
125
126 new_prop->name = kstrdup(name, GFP_KERNEL);
127 if (!new_prop->name) {
128 kfree(new_prop);
129 return -ENOMEM;
130 }
131
132 new_prop->length = vd;
133 new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
134 if (!new_prop->value) {
135 kfree(new_prop->name);
136 kfree(new_prop);
137 return -ENOMEM;
138 }
139
140 memcpy(new_prop->value, value, vd);
141 *prop = new_prop;
142 }
143
144 if (!more) {
Nathan Lynch5d8b1f92019-06-27 00:30:44 -0500145 pr_debug("updating node %pOF property %s\n", dn, name);
Nathan Fontenot79d1c712012-10-02 16:58:46 +0000146 of_update_property(dn, new_prop);
Tyrel Datwylerd8e533b2013-08-14 22:23:45 -0700147 *prop = NULL;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000148 }
149
150 return 0;
151}
152
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600153static int update_dt_node(struct device_node *dn, s32 scope)
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000154{
155 struct update_props_workarea *upwa;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000156 struct property *prop = NULL;
Tyrel Datwyler638a4052013-08-14 22:23:46 -0700157 int i, rc, rtas_rc;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000158 char *prop_data;
159 char *rtas_buf;
160 int update_properties_token;
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800161 u32 nprops;
Nathan Fontenot2e9b7b02013-04-24 05:49:36 +0000162 u32 vd;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000163
164 update_properties_token = rtas_token("ibm,update-properties");
165 if (update_properties_token == RTAS_UNKNOWN_SERVICE)
166 return -EINVAL;
167
168 rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
169 if (!rtas_buf)
170 return -ENOMEM;
171
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000172 upwa = (struct update_props_workarea *)&rtas_buf[0];
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600173 upwa->phandle = cpu_to_be32(dn->phandle);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000174
175 do {
Tyrel Datwyler638a4052013-08-14 22:23:46 -0700176 rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
Nathan Fontenot762ec152013-04-24 05:47:11 +0000177 scope);
Tyrel Datwyler638a4052013-08-14 22:23:46 -0700178 if (rtas_rc < 0)
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000179 break;
180
181 prop_data = rtas_buf + sizeof(*upwa);
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800182 nprops = be32_to_cpu(upwa->nprops);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000183
Tyrel Datwylerc8f5a572013-08-14 22:23:48 -0700184 /* On the first call to ibm,update-properties for a node the
185 * the first property value descriptor contains an empty
186 * property name, the property value length encoded as u32,
187 * and the property value is the node path being updated.
Nathan Fontenot2e9b7b02013-04-24 05:49:36 +0000188 */
Tyrel Datwylerc8f5a572013-08-14 22:23:48 -0700189 if (*prop_data == 0) {
190 prop_data++;
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800191 vd = be32_to_cpu(*(__be32 *)prop_data);
Tyrel Datwylerc8f5a572013-08-14 22:23:48 -0700192 prop_data += vd + sizeof(vd);
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800193 nprops--;
Tyrel Datwylerc8f5a572013-08-14 22:23:48 -0700194 }
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000195
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800196 for (i = 0; i < nprops; i++) {
Nathan Fontenot2e9b7b02013-04-24 05:49:36 +0000197 char *prop_name;
198
199 prop_name = prop_data;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000200 prop_data += strlen(prop_name) + 1;
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800201 vd = be32_to_cpu(*(__be32 *)prop_data);
Nathan Fontenot2e9b7b02013-04-24 05:49:36 +0000202 prop_data += sizeof(vd);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000203
204 switch (vd) {
205 case 0x00000000:
206 /* name only property, nothing to do */
207 break;
208
209 case 0x80000000:
Suraj Jitindar Singh925e2d12016-04-28 15:34:55 +1000210 of_remove_property(dn, of_find_property(dn,
211 prop_name, NULL));
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000212 prop = NULL;
213 break;
214
215 default:
216 rc = update_dt_property(dn, &prop, prop_name,
217 vd, prop_data);
218 if (rc) {
Nathan Lynch2d5be6f2020-12-07 15:51:41 -0600219 pr_err("updating %s property failed: %d\n",
220 prop_name, rc);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000221 }
222
223 prop_data += vd;
Nathan Lynchaa5e5c92020-12-07 15:51:40 -0600224 break;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000225 }
Nathan Lynchccfb5bd2019-08-02 14:29:26 -0500226
227 cond_resched();
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000228 }
Nathan Lynchccfb5bd2019-08-02 14:29:26 -0500229
230 cond_resched();
Tyrel Datwyler638a4052013-08-14 22:23:46 -0700231 } while (rtas_rc == 1);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000232
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000233 kfree(rtas_buf);
234 return 0;
235}
236
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600237static int add_dt_node(struct device_node *parent_dn, __be32 drc_index)
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000238{
239 struct device_node *dn;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000240 int rc;
241
Tyrel Datwyler8d5ff322013-08-14 22:23:50 -0700242 dn = dlpar_configure_connector(drc_index, parent_dn);
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600243 if (!dn)
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000244 return -ENOENT;
245
Nathan Lynch319fa1a2021-10-20 14:47:03 -0500246 /*
247 * Since delete_dt_node() ignores this node type, this is the
248 * necessary counterpart. We also know that a platform-facilities
249 * node returned from dlpar_configure_connector() has children
250 * attached, and dlpar_attach_node() only adds the parent, leaking
251 * the children. So ignore these on the add side for now.
252 */
253 if (of_node_is_type(dn, "ibm,platform-facilities")) {
254 pr_notice("ignoring add operation for %pOF\n", dn);
255 dlpar_free_cc_nodes(dn);
256 return 0;
257 }
258
Rob Herring215ee762017-08-21 10:16:49 -0500259 rc = dlpar_attach_node(dn, parent_dn);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000260 if (rc)
261 dlpar_free_cc_nodes(dn);
262
Nathan Lynch5d8b1f92019-06-27 00:30:44 -0500263 pr_debug("added node %pOFfp\n", dn);
264
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000265 return rc;
266}
267
Nathan Fontenot762ec152013-04-24 05:47:11 +0000268int pseries_devicetree_update(s32 scope)
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000269{
270 char *rtas_buf;
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800271 __be32 *data;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000272 int update_nodes_token;
273 int rc;
274
275 update_nodes_token = rtas_token("ibm,update-nodes");
276 if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
Nathan Lynchb06a6712020-12-07 15:51:39 -0600277 return 0;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000278
279 rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
280 if (!rtas_buf)
281 return -ENOMEM;
282
283 do {
Nathan Fontenot762ec152013-04-24 05:47:11 +0000284 rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000285 if (rc && rc != 1)
286 break;
287
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800288 data = (__be32 *)rtas_buf + 4;
289 while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000290 int i;
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800291 u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
292 u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000293
294 data++;
295
296 for (i = 0; i < node_count; i++) {
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600297 struct device_node *np;
Tyrel Datwylerf6ff0412015-03-04 11:59:33 -0800298 __be32 phandle = *data++;
299 __be32 drc_index;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000300
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600301 np = of_find_node_by_phandle(be32_to_cpu(phandle));
302 if (!np) {
303 pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n",
304 be32_to_cpu(phandle), action);
305 continue;
306 }
307
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000308 switch (action) {
309 case DELETE_DT_NODE:
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600310 delete_dt_node(np);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000311 break;
312 case UPDATE_DT_NODE:
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600313 update_dt_node(np, scope);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000314 break;
315 case ADD_DT_NODE:
316 drc_index = *data++;
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600317 add_dt_node(np, drc_index);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000318 break;
319 }
Nathan Lynchccfb5bd2019-08-02 14:29:26 -0500320
Nathan Lynch2efd7f62020-12-07 15:52:00 -0600321 of_node_put(np);
Nathan Lynchccfb5bd2019-08-02 14:29:26 -0500322 cond_resched();
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000323 }
324 }
Nathan Lynchccfb5bd2019-08-02 14:29:26 -0500325
326 cond_resched();
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000327 } while (rc == 1);
328
329 kfree(rtas_buf);
330 return rc;
331}
332
333void post_mobility_fixup(void)
334{
335 int rc;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000336
Nathan Lynchc3ae9782020-12-07 15:51:42 -0600337 rtas_activate_firmware();
Haren Myneni39a33b52014-02-19 12:56:52 -0800338
Nathan Lynche59a1752019-06-11 23:45:05 -0500339 /*
340 * We don't want CPUs to go online/offline while the device
341 * tree is being updated.
342 */
343 cpus_read_lock();
344
Nathan Lynche610a462019-06-11 23:45:06 -0500345 /*
346 * It's common for the destination firmware to replace cache
347 * nodes. Release all of the cacheinfo hierarchy's references
348 * before updating the device tree.
349 */
350 cacheinfo_teardown();
351
Haren Myneni39a33b52014-02-19 12:56:52 -0800352 rc = pseries_devicetree_update(MIGRATION_SCOPE);
353 if (rc)
Nathan Lynch2d5be6f2020-12-07 15:51:41 -0600354 pr_err("device tree update failed: %d\n", rc);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000355
Nathan Lynche610a462019-06-11 23:45:06 -0500356 cacheinfo_rebuild();
357
Nathan Lynche59a1752019-06-11 23:45:05 -0500358 cpus_read_unlock();
359
Daniel Axtensda631f72020-11-17 16:59:16 +1100360 /* Possibly switch to a new L1 flush type */
361 pseries_setup_security_mitigations();
Michael Ellerman921bc6c2018-03-14 19:40:42 -0300362
Kajol Jain373b3732020-05-25 16:13:07 +0530363 /* Reinitialise system information for hv-24x7 */
364 read_24x7_sys_info();
365
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000366 return;
367}
368
Nathan Lynchd9213312020-12-07 15:51:43 -0600369static int poll_vasi_state(u64 handle, unsigned long *res)
370{
371 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
372 long hvrc;
373 int ret;
374
375 hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle);
376 switch (hvrc) {
377 case H_SUCCESS:
378 ret = 0;
379 *res = retbuf[0];
380 break;
381 case H_PARAMETER:
382 ret = -EINVAL;
383 break;
384 case H_FUNCTION:
385 ret = -EOPNOTSUPP;
386 break;
387 case H_HARDWARE:
388 default:
389 pr_err("unexpected H_VASI_STATE result %ld\n", hvrc);
390 ret = -EIO;
391 break;
392 }
393 return ret;
394}
395
396static int wait_for_vasi_session_suspending(u64 handle)
397{
398 unsigned long state;
399 int ret;
400
401 /*
402 * Wait for transition from H_VASI_ENABLED to
403 * H_VASI_SUSPENDING. Treat anything else as an error.
404 */
405 while (true) {
406 ret = poll_vasi_state(handle, &state);
407
408 if (ret != 0 || state == H_VASI_SUSPENDING) {
409 break;
410 } else if (state == H_VASI_ENABLED) {
411 ssleep(1);
412 } else {
413 pr_err("unexpected H_VASI_STATE result %lu\n", state);
414 ret = -EIO;
415 break;
416 }
417 }
418
419 /*
420 * Proceed even if H_VASI_STATE is unavailable. If H_JOIN or
421 * ibm,suspend-me are also unimplemented, we'll recover then.
422 */
423 if (ret == -EOPNOTSUPP)
424 ret = 0;
425
426 return ret;
427}
428
Nathan Lynch9327dc02020-12-07 15:51:44 -0600429static void prod_single(unsigned int target_cpu)
430{
431 long hvrc;
432 int hwid;
433
434 hwid = get_hard_smp_processor_id(target_cpu);
435 hvrc = plpar_hcall_norets(H_PROD, hwid);
436 if (hvrc == H_SUCCESS)
437 return;
438 pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
439 target_cpu, hwid, hvrc);
440}
441
442static void prod_others(void)
443{
444 unsigned int cpu;
445
446 for_each_online_cpu(cpu) {
447 if (cpu != smp_processor_id())
448 prod_single(cpu);
449 }
450}
451
452static u16 clamp_slb_size(void)
453{
Nicholas Piggin387e2202021-12-02 00:41:52 +1000454#ifdef CONFIG_PPC_64S_HASH_MMU
Nathan Lynch9327dc02020-12-07 15:51:44 -0600455 u16 prev = mmu_slb_size;
456
457 slb_set_size(SLB_MIN_SIZE);
458
459 return prev;
Nicholas Piggin387e2202021-12-02 00:41:52 +1000460#else
461 return 0;
462#endif
Nathan Lynch9327dc02020-12-07 15:51:44 -0600463}
464
465static int do_suspend(void)
466{
467 u16 saved_slb_size;
468 int status;
469 int ret;
470
471 pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());
472
473 /*
474 * The destination processor model may have fewer SLB entries
475 * than the source. We reduce mmu_slb_size to a safe minimum
476 * before suspending in order to minimize the possibility of
477 * programming non-existent entries on the destination. If
478 * suspend fails, we restore it before returning. On success
479 * the OF reconfig path will update it from the new device
480 * tree after resuming on the destination.
481 */
482 saved_slb_size = clamp_slb_size();
483
484 ret = rtas_ibm_suspend_me(&status);
485 if (ret != 0) {
486 pr_err("ibm,suspend-me error: %d\n", status);
487 slb_set_size(saved_slb_size);
488 }
489
490 return ret;
491}
492
Nathan Lynche834df62021-03-15 03:00:44 -0500493/**
494 * struct pseries_suspend_info - State shared between CPUs for join/suspend.
495 * @counter: Threads are to increment this upon resuming from suspend
496 * or if an error is received from H_JOIN. The thread which performs
497 * the first increment (i.e. sets it to 1) is responsible for
498 * waking the other threads.
Nathan Lynch274cb1c2021-03-15 03:00:45 -0500499 * @done: False if join/suspend is in progress. True if the operation is
500 * complete (successful or not).
Nathan Lynche834df62021-03-15 03:00:44 -0500501 */
502struct pseries_suspend_info {
503 atomic_t counter;
Nathan Lynch274cb1c2021-03-15 03:00:45 -0500504 bool done;
Nathan Lynche834df62021-03-15 03:00:44 -0500505};
506
Nathan Lynch9327dc02020-12-07 15:51:44 -0600507static int do_join(void *arg)
508{
Nathan Lynche834df62021-03-15 03:00:44 -0500509 struct pseries_suspend_info *info = arg;
510 atomic_t *counter = &info->counter;
Nathan Lynch9327dc02020-12-07 15:51:44 -0600511 long hvrc;
512 int ret;
513
Nathan Lynch274cb1c2021-03-15 03:00:45 -0500514retry:
Nathan Lynch9327dc02020-12-07 15:51:44 -0600515 /* Must ensure MSR.EE off for H_JOIN. */
516 hard_irq_disable();
517 hvrc = plpar_hcall_norets(H_JOIN);
518
519 switch (hvrc) {
520 case H_CONTINUE:
521 /*
522 * All other CPUs are offline or in H_JOIN. This CPU
523 * attempts the suspend.
524 */
525 ret = do_suspend();
526 break;
527 case H_SUCCESS:
528 /*
529 * The suspend is complete and this cpu has received a
Nathan Lynch274cb1c2021-03-15 03:00:45 -0500530 * prod, or we've received a stray prod from unrelated
531 * code (e.g. paravirt spinlocks) and we need to join
532 * again.
533 *
534 * This barrier orders the return from H_JOIN above vs
535 * the load of info->done. It pairs with the barrier
536 * in the wakeup/prod path below.
Nathan Lynch9327dc02020-12-07 15:51:44 -0600537 */
Nathan Lynch274cb1c2021-03-15 03:00:45 -0500538 smp_mb();
539 if (READ_ONCE(info->done) == false) {
540 pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying",
541 smp_processor_id());
542 goto retry;
543 }
Nathan Lynch9327dc02020-12-07 15:51:44 -0600544 ret = 0;
545 break;
546 case H_BAD_MODE:
547 case H_HARDWARE:
548 default:
549 ret = -EIO;
550 pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
551 hvrc, smp_processor_id());
552 break;
553 }
554
555 if (atomic_inc_return(counter) == 1) {
556 pr_info("CPU %u waking all threads\n", smp_processor_id());
Nathan Lynch274cb1c2021-03-15 03:00:45 -0500557 WRITE_ONCE(info->done, true);
558 /*
559 * This barrier orders the store to info->done vs subsequent
560 * H_PRODs to wake the other CPUs. It pairs with the barrier
561 * in the H_SUCCESS case above.
562 */
563 smp_mb();
Nathan Lynch9327dc02020-12-07 15:51:44 -0600564 prod_others();
565 }
566 /*
567 * Execution may have been suspended for several seconds, so
568 * reset the watchdog.
569 */
570 touch_nmi_watchdog();
571 return ret;
572}
573
/*
 * Abort reason code byte 0: the entity requesting the abort.
 * Only MIGRATING_PARTITION is used by this file.
 */
enum vasi_aborting_entity {
	ORCHESTRATOR		= 1,
	VSP_SOURCE		= 2,
	PARTITION_FIRMWARE	= 3,
	PLATFORM_FIRMWARE	= 4,
	VSP_TARGET		= 5,
	MIGRATING_PARTITION	= 6,
};
585
586static void pseries_cancel_migration(u64 handle, int err)
587{
588 u32 reason_code;
589 u32 detail;
590 u8 entity;
591 long hvrc;
592
593 entity = MIGRATING_PARTITION;
594 detail = abs(err) & 0xffffff;
595 reason_code = (entity << 24) | detail;
596
597 hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle,
598 H_VASI_SIGNAL_CANCEL, reason_code);
599 if (hvrc)
600 pr_err("H_VASI_SIGNAL error: %ld\n", hvrc);
601}
602
Nathan Lynchaeca35b2020-12-07 15:51:46 -0600603static int pseries_suspend(u64 handle)
604{
605 const unsigned int max_attempts = 5;
606 unsigned int retry_interval_ms = 1;
607 unsigned int attempt = 1;
608 int ret;
609
610 while (true) {
Nathan Lynche834df62021-03-15 03:00:44 -0500611 struct pseries_suspend_info info;
Nathan Lynchaeca35b2020-12-07 15:51:46 -0600612 unsigned long vasi_state;
613 int vasi_err;
614
Nathan Lynche834df62021-03-15 03:00:44 -0500615 info = (struct pseries_suspend_info) {
616 .counter = ATOMIC_INIT(0),
Nathan Lynch274cb1c2021-03-15 03:00:45 -0500617 .done = false,
Nathan Lynche834df62021-03-15 03:00:44 -0500618 };
619
620 ret = stop_machine(do_join, &info, cpu_online_mask);
Nathan Lynchaeca35b2020-12-07 15:51:46 -0600621 if (ret == 0)
622 break;
623 /*
624 * Encountered an error. If the VASI stream is still
625 * in Suspending state, it's likely a transient
626 * condition related to some device in the partition
627 * and we can retry in the hope that the cause has
628 * cleared after some delay.
629 *
630 * A better design would allow drivers etc to prepare
631 * for the suspend and avoid conditions which prevent
632 * the suspend from succeeding. For now, we have this
633 * mitigation.
634 */
635 pr_notice("Partition suspend attempt %u of %u error: %d\n",
636 attempt, max_attempts, ret);
637
638 if (attempt == max_attempts)
639 break;
640
641 vasi_err = poll_vasi_state(handle, &vasi_state);
642 if (vasi_err == 0) {
643 if (vasi_state != H_VASI_SUSPENDING) {
644 pr_notice("VASI state %lu after failed suspend\n",
645 vasi_state);
646 break;
647 }
648 } else if (vasi_err != -EOPNOTSUPP) {
649 pr_err("VASI state poll error: %d", vasi_err);
650 break;
651 }
652
653 pr_notice("Will retry partition suspend after %u ms\n",
654 retry_interval_ms);
655
656 msleep(retry_interval_ms);
657 retry_interval_ms *= 10;
658 attempt++;
659 }
660
661 return ret;
662}
663
Nathan Lynch9327dc02020-12-07 15:51:44 -0600664static int pseries_migrate_partition(u64 handle)
665{
Nathan Lynch9327dc02020-12-07 15:51:44 -0600666 int ret;
667
668 ret = wait_for_vasi_session_suspending(handle);
669 if (ret)
670 return ret;
671
Nathan Lynchaeca35b2020-12-07 15:51:46 -0600672 ret = pseries_suspend(handle);
Nathan Lynch9327dc02020-12-07 15:51:44 -0600673 if (ret == 0)
674 post_mobility_fixup();
Nathan Lynch37cddc72020-12-07 15:51:45 -0600675 else
676 pseries_cancel_migration(handle, ret);
Nathan Lynch9327dc02020-12-07 15:51:44 -0600677
678 return ret;
679}
680
Nathan Lynch4d756892020-12-07 15:51:47 -0600681int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)
682{
683 return pseries_migrate_partition(handle);
684}
685
Greg Kroah-Hartman6f428092017-06-06 15:32:03 +0200686static ssize_t migration_store(struct class *class,
687 struct class_attribute *attr, const char *buf,
688 size_t count)
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000689{
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000690 u64 streamid;
691 int rc;
692
Daniel Walter1618bd52014-08-08 14:24:01 -0700693 rc = kstrtou64(buf, 0, &streamid);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000694 if (rc)
695 return rc;
696
Nathan Lynch9327dc02020-12-07 15:51:44 -0600697 rc = pseries_migrate_partition(streamid);
Nathan Lynchd9213312020-12-07 15:51:43 -0600698 if (rc)
699 return rc;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000700
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000701 return count;
702}
703
Tyrel Datwyler288a2982015-03-04 18:25:38 -0800704/*
705 * Used by drmgr to determine the kernel behavior of the migration interface.
706 *
707 * Version 1: Performs all PAPR requirements for migration including
708 * firmware activation and device tree update.
709 */
710#define MIGRATION_API_VERSION 1
711
Greg Kroah-Hartman6f428092017-06-06 15:32:03 +0200712static CLASS_ATTR_WO(migration);
Russell Currey57ad583f2017-01-12 14:54:13 +1100713static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION));
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000714
715static int __init mobility_sysfs_init(void)
716{
717 int rc;
718
719 mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
720 if (!mobility_kobj)
721 return -ENOMEM;
722
723 rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
Tyrel Datwyler288a2982015-03-04 18:25:38 -0800724 if (rc)
Nathan Lynch494a66f2019-06-27 00:30:43 -0500725 pr_err("unable to create migration sysfs file (%d)\n", rc);
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000726
Tyrel Datwyler288a2982015-03-04 18:25:38 -0800727 rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr);
728 if (rc)
Nathan Lynch494a66f2019-06-27 00:30:43 -0500729 pr_err("unable to create api_version sysfs file (%d)\n", rc);
Tyrel Datwyler288a2982015-03-04 18:25:38 -0800730
731 return 0;
Nathan Fontenot410bccf2010-09-10 09:42:36 +0000732}
Michael Ellerman8e83e902014-07-16 12:02:43 +1000733machine_device_initcall(pseries, mobility_sysfs_init);