/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
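/* e.g. when drbd is built into the kernel, "drbd.minor_count=8" on the
 * kernel command line would set this (the value 8 is only an illustration) */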
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list; the next pointer is the private
   member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular, doubly linked list of requests
 * attached.
 */
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
	INIT_LIST_HEAD(&mdev->barrier_acked_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
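	/* "x ?: y" is the GNU "elvis" shorthand: it evaluates to x unless x
	 * is 0, in which case it yields y -- so br_number skips over 0 */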
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch object, this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corrupting the
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violate write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_splice_init(&b->requests, &mdev->barrier_acked_requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}



/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io, abort_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

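			/* _req_mod() reports the kind of request as a bit
			 * mask; the shifts turn each flag into a 0/1 count */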
			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}

	/* Actions operating on the disk state also want to work on
	   requests that got barrier acked. */
	switch (what) {
	case abort_disk_io:
	case fail_frozen_disk_io:
	case restart_frozen_disk_io:
		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			_req_mod(req, what);
		}

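		/* intentionally falls through to the break below */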
	case connection_lost_while_pending:
	case resend:
		break;
	default:
		dev_err(DEV, "what = %d in _tl_restart()\n", what);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyway! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
enum sanitize_state_warnings {
	NO_WARNING,
	ABORTED_ONLINE_VERIFY,
	ABORTED_RESYNC,
	CONNECTION_LOST_NEGOTIATING,
	IMPLICITLY_UPGRADED_DISK,
	IMPLICITLY_UPGRADED_PDSK,
};
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

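	/* these bits get set once the peer's answer to our cluster wide
	 * state change request has arrived (see the asender/receiver path) */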
	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR;  /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}
579/**
580 * drbd_req_state() - Perform an eventually cluster wide state change
581 * @mdev: DRBD device.
582 * @mask: mask of state bits to change.
583 * @val: value of new state bits.
584 * @f: flags
585 *
586 * Should not be called directly, use drbd_request_state() or
587 * _drbd_request_state().
588 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100589static enum drbd_state_rv
590drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
591 union drbd_state val, enum chg_state_flags f)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700592{
593 struct completion done;
594 unsigned long flags;
595 union drbd_state os, ns;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100596 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700597
598 init_completion(&done);
599
600 if (f & CS_SERIALIZE)
601 mutex_lock(&mdev->state_mutex);
602
603 spin_lock_irqsave(&mdev->req_lock, flags);
604 os = mdev->state;
605 ns.i = (os.i & ~mask.i) | val.i;
606 ns = sanitize_state(mdev, os, ns, NULL);
607
608 if (cl_wide_st_chg(mdev, os, ns)) {
609 rv = is_valid_state(mdev, ns);
610 if (rv == SS_SUCCESS)
611 rv = is_valid_state_transition(mdev, ns, os);
612 spin_unlock_irqrestore(&mdev->req_lock, flags);
613
614 if (rv < SS_SUCCESS) {
615 if (f & CS_VERBOSE)
616 print_st_err(mdev, os, ns, rv);
617 goto abort;
618 }
619
620 drbd_state_lock(mdev);
621 if (!drbd_send_state_req(mdev, mask, val)) {
622 drbd_state_unlock(mdev);
623 rv = SS_CW_FAILED_BY_PEER;
624 if (f & CS_VERBOSE)
625 print_st_err(mdev, os, ns, rv);
626 goto abort;
627 }
628
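		/* sleep until _req_st_cond() returns something other than
		 * SS_UNKNOWN_ERROR, its "keep waiting" value */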
		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
		  union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
		rv = SS_CONNECTED_OUTDATES;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
{
	static const char *msg_table[] = {
		[NO_WARNING] = "",
		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
		[ABORTED_RESYNC] = "Resync aborted.",
		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
	};

	if (warn != NO_WARNING)
		dev_warn(DEV, "%s\n", msg_table[warn]);
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn:	receives a warning about implicit state changes (may be NULL).
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	if (warn)
		*warn = NO_WARNING;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow Network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn)
			*warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			if (warn)
				*warn = CONNECTION_LOST_NEGOTIATING;
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection state on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
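	/* clamp disk and pdsk into the window allowed by the connection state */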
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_DISK;
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_PDSK;
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion that will be completed after after_state_ch() has finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	enum sanitize_state_warnings ssw;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &ssw);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	print_sanitize_warnings(mdev, ssw);

	{
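		/* build a one-line, human readable summary of every state
		 * field that actually changed, and log it below */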
		char *pbp, pb[300];
		pbp = pb;
		*pbp = 0;
		if (ns.role != os.role)
			pbp += sprintf(pbp, "role( %s -> %s ) ",
				       drbd_role_str(os.role),
				       drbd_role_str(ns.role));
		if (ns.peer != os.peer)
			pbp += sprintf(pbp, "peer( %s -> %s ) ",
				       drbd_role_str(os.peer),
				       drbd_role_str(ns.peer));
		if (ns.conn != os.conn)
			pbp += sprintf(pbp, "conn( %s -> %s ) ",
				       drbd_conn_str(os.conn),
				       drbd_conn_str(ns.conn));
		if (ns.disk != os.disk)
			pbp += sprintf(pbp, "disk( %s -> %s ) ",
				       drbd_disk_str(os.disk),
				       drbd_disk_str(ns.disk));
		if (ns.pdsk != os.pdsk)
			pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
				       drbd_disk_str(os.pdsk),
				       drbd_disk_str(ns.pdsk));
		if (is_susp(ns) != is_susp(os))
			pbp += sprintf(pbp, "susp( %d -> %d ) ",
				       is_susp(os),
				       is_susp(ns));
		if (ns.aftr_isp != os.aftr_isp)
			pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
				       os.aftr_isp,
				       ns.aftr_isp);
		if (ns.peer_isp != os.peer_isp)
			pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
				       os.peer_isp,
				       ns.peer_isp);
		if (ns.user_isp != os.user_isp)
			pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
				       os.user_isp,
				       ns.user_isp);
		dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
			       int (*io_fn)(struct drbd_conf *),
			       char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current == mdev->worker.task);

	/* open coded non-blocking drbd_suspend_io(mdev); */
	set_bit(SUSPEND_IO, &mdev->flags);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

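	/* nsm starts out with all bits set; the suspend bits to be dropped are
	 * cleared below, and nsm is later ANDed with the then-current state */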
	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
			what = resend;

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = restart_frozen_disk_io;

		if (what != nothing)
			nsm.susp_nod = 0;
	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}

	/* Became sync source.  With protocol >= 96, we still need to send out
	 * the sync uuid now. Need to do that before any drbd_send_state, or
	 * the other side may go "paused sync" before receiving the sync uuids,
	 * which is unexpected. */
	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
		drbd_gen_and_send_sync_uuid(mdev);
		put_ldev(mdev);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	/* No point in queuing send_bitmap if we don't have a connection
	 * anymore, so check also the _current_ state, not only the new state
	 * at the time this work was queued. */
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
	    mdev->state.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
				"send_bitmap (WFBitMapS)",
				BM_LOCKED_TEST_ALLOWED);

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					set_bit(NEW_CUR_UUID, &mdev->flags);
				} else {
					drbd_uuid_new_current(mdev);
					drbd_send_uuids(mdev);
				}
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
			drbd_uuid_new_current(mdev);
			drbd_send_uuids(mdev);
		}

		/* D_DISKLESS Peer becomes secondary */
		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
			/* We may still be Primary ourselves.
			 * No harm done if the bitmap still changes,
			 * redirtied pages will follow later. */
			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
		put_ldev(mdev);
	}

Lars Ellenberg06d33e92010-12-18 17:00:59 +01001503 /* Write out all changed bits on demote.
1504	 * Though, no need to do that just yet
1505 * if there is a resync going on still */
1506 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1507 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001508 /* No changes to the bitmap expected this time, so assert that,
1509 * even though no harm was done if it did change. */
1510 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1511 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001512 put_ldev(mdev);
1513 }
1514
1515 /* Last part of the attaching process ... */
1516 if (ns.conn >= C_CONNECTED &&
1517 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001518 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001519 drbd_send_uuids(mdev);
1520 drbd_send_state(mdev);
1521 }
1522
1523 /* We want to pause/continue resync, tell peer. */
1524 if (ns.conn >= C_CONNECTED &&
1525 ((os.aftr_isp != ns.aftr_isp) ||
1526 (os.user_isp != ns.user_isp)))
1527 drbd_send_state(mdev);
1528
1529 /* In case one of the isp bits got set, suspend other devices. */
1530 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1531 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1532 suspend_other_sg(mdev);
1533
1534	/* Make sure the peer gets informed about any state
1535	   changes (ISP bits) that happened while we were in WFReportParams. */
1536 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1537 drbd_send_state(mdev);
1538
Philipp Reisner67531712010-10-27 12:21:30 +02001539 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1540 drbd_send_state(mdev);
1541
Philipp Reisnerb411b362009-09-25 16:07:19 -07001542	/* We are in the process of starting a full sync... */
1543 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1544 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001545 /* no other bitmap changes expected during this phase */
1546 drbd_queue_bitmap_io(mdev,
1547 &drbd_bmio_set_n_write, &abw_start_sync,
1548 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001549
1550	/* We are invalidating ourselves... */
1551 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1552 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001553 /* other bitmap operation expected during this phase */
1554 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1555 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001556
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001557 /* first half of local IO error, failure to attach,
1558 * or administrative detach */
1559 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1560 enum drbd_io_error_p eh;
1561 int was_io_error;
1562 /* corresponding get_ldev was in __drbd_set_state, to serialize
1563 * our cleanup here with the transition to D_DISKLESS,
1564		 * so it is safe to dereference ldev here. */
1565 eh = mdev->ldev->dc.on_io_error;
1566 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1567
1568 /* current state still has to be D_FAILED,
1569 * there is only one way out: to D_DISKLESS,
1570 * and that may only happen after our put_ldev below. */
1571 if (mdev->state.disk != D_FAILED)
1572 dev_err(DEV,
1573 "ASSERT FAILED: disk is %s during detach\n",
1574 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001575
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001576 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001577 dev_info(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001578
1579 drbd_rs_cancel_all(mdev);
1580
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001581 /* In case we want to get something to stable storage still,
1582 * this may be the last chance.
1583 * Following put_ldev may transition to D_DISKLESS. */
1584 drbd_md_sync(mdev);
1585 put_ldev(mdev);
1586
1587 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001588 drbd_khelper(mdev, "local-io-error");
1589 }
1590
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001591 /* second half of local IO error, failure to attach,
1592 * or administrative detach,
1593 * after local_cnt references have reached zero again */
1594 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1595 /* We must still be diskless,
1596 * re-attach has to be serialized with this! */
1597 if (mdev->state.disk != D_DISKLESS)
1598 dev_err(DEV,
1599 "ASSERT FAILED: disk is %s while going diskless\n",
1600 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001601
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001602 mdev->rs_total = 0;
1603 mdev->rs_failed = 0;
1604 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001605
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001606 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001607 dev_info(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001608		/* corresponding get_ldev in __drbd_set_state;
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001609 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001610 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001611 }
1612
Philipp Reisner738a84b2011-03-03 00:21:30 +01001613	/* Notify peer that I had a local IO error and did not detach. */
1614 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1615 drbd_send_state(mdev);
1616
Philipp Reisnerb411b362009-09-25 16:07:19 -07001617 /* Disks got bigger while they were detached */
1618 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1619 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1620 if (ns.conn == C_CONNECTED)
1621 resync_after_online_grow(mdev);
1622 }
1623
1624 /* A resync finished or aborted, wake paused devices... */
1625 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1626 (os.peer_isp && !ns.peer_isp) ||
1627 (os.user_isp && !ns.user_isp))
1628 resume_next_sg(mdev);
1629
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001630 /* sync target done with resync. Explicitly notify peer, even though
1631 * it should (at least for non-empty resyncs) already know itself. */
1632 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1633 drbd_send_state(mdev);
1634
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001635 /* This triggers bitmap writeout of potentially still unwritten pages
1636 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001637 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001638 * For resync aborted because of local disk failure, we cannot do
1639 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001640 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001641 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001642 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1643 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1644 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001645 put_ldev(mdev);
1646 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001647
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001648	/* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001649 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001650 drbd_free_tl_hash(mdev);
1651
Philipp Reisnerb411b362009-09-25 16:07:19 -07001652 /* Upon network connection, we need to start the receiver */
1653 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1654 drbd_thread_start(&mdev->receiver);
1655
1656 /* Terminate worker thread if we are unconfigured - it will be
1657 restarted as needed... */
1658 if (ns.disk == D_DISKLESS &&
1659 ns.conn == C_STANDALONE &&
1660 ns.role == R_SECONDARY) {
1661 if (os.aftr_isp != ns.aftr_isp)
1662 resume_next_sg(mdev);
1663 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1664 if (test_bit(DEVICE_DYING, &mdev->flags))
1665 drbd_thread_stop_nowait(&mdev->worker);
1666 }
1667
1668 drbd_md_sync(mdev);
1669}
1670
1671
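/* Common kthread entry point for the receiver, worker and asender threads.
 * It runs thi->function; if the thread state was meanwhile set to Restarting
 * (e.g. a re-connect request came in while the receiver was exiting), the
 * function is simply invoked again.  On final exit it clears thi->task,
 * completes &thi->stop for any waiter in _drbd_thread_stop(), and drops the
 * module reference that drbd_thread_start() took. */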
1672static int drbd_thread_setup(void *arg)
1673{
1674 struct drbd_thread *thi = (struct drbd_thread *) arg;
1675 struct drbd_conf *mdev = thi->mdev;
1676 unsigned long flags;
1677 int retval;
1678
1679restart:
1680 retval = thi->function(thi);
1681
1682 spin_lock_irqsave(&thi->t_lock, flags);
1683
1684 /* if the receiver has been "Exiting", the last thing it did
1685 * was set the conn state to "StandAlone",
1686 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1687 * and receiver thread will be "started".
1688 * drbd_thread_start needs to set "Restarting" in that case.
1689 * t_state check and assignment needs to be within the same spinlock,
1690 * so either thread_start sees Exiting, and can remap to Restarting,
1691	 * or thread_start sees None, and can proceed as normal.
1692 */
1693
1694 if (thi->t_state == Restarting) {
1695 dev_info(DEV, "Restarting %s\n", current->comm);
1696 thi->t_state = Running;
1697 spin_unlock_irqrestore(&thi->t_lock, flags);
1698 goto restart;
1699 }
1700
1701 thi->task = NULL;
1702 thi->t_state = None;
1703 smp_mb();
1704 complete(&thi->stop);
1705 spin_unlock_irqrestore(&thi->t_lock, flags);
1706
1707 dev_info(DEV, "Terminating %s\n", current->comm);
1708
1709 /* Release mod reference taken when thread was started */
1710 module_put(THIS_MODULE);
1711 return retval;
1712}
1713
1714static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1715 int (*func) (struct drbd_thread *))
1716{
1717 spin_lock_init(&thi->t_lock);
1718 thi->task = NULL;
1719 thi->t_state = None;
1720 thi->function = func;
1721 thi->mdev = mdev;
1722}
1723
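/* Start (or restart) one of the per-device threads.  Starting from state
 * None takes a module reference and creates the kthread "drbd<minor>_<name>";
 * if the thread is currently Exiting, it is flagged Restarting instead, so
 * drbd_thread_setup() loops around without a new task being created.
 * Returns true on success (or if the thread was already running), false if
 * taking the module reference or creating the kthread failed. */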
1724int drbd_thread_start(struct drbd_thread *thi)
1725{
1726 struct drbd_conf *mdev = thi->mdev;
1727 struct task_struct *nt;
1728 unsigned long flags;
1729
1730 const char *me =
1731 thi == &mdev->receiver ? "receiver" :
1732 thi == &mdev->asender ? "asender" :
1733 thi == &mdev->worker ? "worker" : "NONSENSE";
1734
1735 /* is used from state engine doing drbd_thread_stop_nowait,
1736 * while holding the req lock irqsave */
1737 spin_lock_irqsave(&thi->t_lock, flags);
1738
1739 switch (thi->t_state) {
1740 case None:
1741 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1742 me, current->comm, current->pid);
1743
1744 /* Get ref on module for thread - this is released when thread exits */
1745 if (!try_module_get(THIS_MODULE)) {
1746 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1747 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001748 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001749 }
1750
1751 init_completion(&thi->stop);
1752 D_ASSERT(thi->task == NULL);
1753 thi->reset_cpu_mask = 1;
1754 thi->t_state = Running;
1755 spin_unlock_irqrestore(&thi->t_lock, flags);
1756		flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1757
1758 nt = kthread_create(drbd_thread_setup, (void *) thi,
1759 "drbd%d_%s", mdev_to_minor(mdev), me);
1760
1761 if (IS_ERR(nt)) {
1762 dev_err(DEV, "Couldn't start thread\n");
1763
1764 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001765 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001766 }
1767 spin_lock_irqsave(&thi->t_lock, flags);
1768 thi->task = nt;
1769 thi->t_state = Running;
1770 spin_unlock_irqrestore(&thi->t_lock, flags);
1771 wake_up_process(nt);
1772 break;
1773 case Exiting:
1774 thi->t_state = Restarting;
1775 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1776 me, current->comm, current->pid);
1777 /* fall through */
1778 case Running:
1779 case Restarting:
1780 default:
1781 spin_unlock_irqrestore(&thi->t_lock, flags);
1782 break;
1783 }
1784
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001785 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001786}
1787
1788
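/* Ask a thread to stop (Exiting) or to restart itself (Restarting).
 * If the thread is not running at all, optionally start it right away for
 * the restart case.  Otherwise, if it is not already in the requested state,
 * set the new target state and kick the task with DRBD_SIGKILL unless we are
 * that task ourselves; if @wait is set, block until drbd_thread_setup()
 * completes &thi->stop. */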
1789void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1790{
1791 unsigned long flags;
1792
1793 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1794
1795 /* may be called from state engine, holding the req lock irqsave */
1796 spin_lock_irqsave(&thi->t_lock, flags);
1797
1798 if (thi->t_state == None) {
1799 spin_unlock_irqrestore(&thi->t_lock, flags);
1800 if (restart)
1801 drbd_thread_start(thi);
1802 return;
1803 }
1804
1805 if (thi->t_state != ns) {
1806 if (thi->task == NULL) {
1807 spin_unlock_irqrestore(&thi->t_lock, flags);
1808 return;
1809 }
1810
1811 thi->t_state = ns;
1812 smp_mb();
1813 init_completion(&thi->stop);
1814 if (thi->task != current)
1815 force_sig(DRBD_SIGKILL, thi->task);
1816
1817 }
1818
1819 spin_unlock_irqrestore(&thi->t_lock, flags);
1820
1821 if (wait)
1822 wait_for_completion(&thi->stop);
1823}
1824
1825#ifdef CONFIG_SMP
1826/**
1827 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1828 * @mdev: DRBD device.
1829 *
1830 * Forces all threads of a device onto the same CPU. This is beneficial for
1831	 * DRBD's performance. May be overridden by the user's configuration.
1832 */
1833void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1834{
1835 int ord, cpu;
1836
1837 /* user override. */
1838 if (cpumask_weight(mdev->cpu_mask))
1839 return;
1840
1841 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1842 for_each_online_cpu(cpu) {
1843 if (ord-- == 0) {
1844 cpumask_set_cpu(cpu, mdev->cpu_mask);
1845 return;
1846 }
1847 }
1848 /* should not be reached */
1849 cpumask_setall(mdev->cpu_mask);
1850}
1851
1852/**
1853 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1854 * @mdev: DRBD device.
1855 *
1856 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1857 * prematurely.
1858 */
1859void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1860{
1861 struct task_struct *p = current;
1862 struct drbd_thread *thi =
1863 p == mdev->asender.task ? &mdev->asender :
1864 p == mdev->receiver.task ? &mdev->receiver :
1865 p == mdev->worker.task ? &mdev->worker :
1866 NULL;
1867 ERR_IF(thi == NULL)
1868 return;
1869 if (!thi->reset_cpu_mask)
1870 return;
1871 thi->reset_cpu_mask = 0;
1872 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1873}
1874#endif
1875
1876/* the appropriate socket mutex must be held already */
1877int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001878 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001879 size_t size, unsigned msg_flags)
1880{
1881 int sent, ok;
1882
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001883 ERR_IF(!h) return false;
1884 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001885
1886 h->magic = BE_DRBD_MAGIC;
1887 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001888 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001889
Philipp Reisnerb411b362009-09-25 16:07:19 -07001890 sent = drbd_send(mdev, sock, h, size, msg_flags);
1891
1892 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001893 if (!ok && !signal_pending(current))
1894 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001895 cmdname(cmd), (int)size, sent);
1896 return ok;
1897}
1898
1899/* don't pass the socket. we may only look at it
1900 * when we hold the appropriate socket mutex.
1901 */
1902int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001903 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001904{
1905 int ok = 0;
1906 struct socket *sock;
1907
1908 if (use_data_socket) {
1909 mutex_lock(&mdev->data.mutex);
1910 sock = mdev->data.socket;
1911 } else {
1912 mutex_lock(&mdev->meta.mutex);
1913 sock = mdev->meta.socket;
1914 }
1915
1916 /* drbd_disconnect() could have called drbd_free_sock()
1917 * while we were waiting in down()... */
1918 if (likely(sock != NULL))
1919 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1920
1921 if (use_data_socket)
1922 mutex_unlock(&mdev->data.mutex);
1923 else
1924 mutex_unlock(&mdev->meta.mutex);
1925 return ok;
1926}
1927
1928int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1929 size_t size)
1930{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001931 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001932 int ok;
1933
1934 h.magic = BE_DRBD_MAGIC;
1935 h.command = cpu_to_be16(cmd);
1936 h.length = cpu_to_be16(size);
1937
1938 if (!drbd_get_data_sock(mdev))
1939 return 0;
1940
Philipp Reisnerb411b362009-09-25 16:07:19 -07001941 ok = (sizeof(h) ==
1942 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1943 ok = ok && (size ==
1944 drbd_send(mdev, mdev->data.socket, data, size, 0));
1945
1946 drbd_put_data_sock(mdev);
1947
1948 return ok;
1949}
1950
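/* Send the syncer configuration to the peer.  The on-the-wire size depends
 * on the agreed protocol version: plain p_rs_param up to apv 87, p_rs_param
 * plus the verify_alg string for apv 88, p_rs_param_89 up to apv 94, and
 * p_rs_param_95 (which adds the dynamic resync controller settings) from
 * apv 95 on.  The pre-allocated send buffer is used under data.mutex to
 * avoid a kmalloc here. */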
1951int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1952{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001953 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001954 struct socket *sock;
1955 int size, rv;
1956 const int apv = mdev->agreed_pro_version;
1957
1958 size = apv <= 87 ? sizeof(struct p_rs_param)
1959 : apv == 88 ? sizeof(struct p_rs_param)
1960 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001961 : apv <= 94 ? sizeof(struct p_rs_param_89)
1962 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001963
1964 /* used from admin command context and receiver/worker context.
1965 * to avoid kmalloc, grab the socket right here,
1966 * then use the pre-allocated sbuf there */
1967 mutex_lock(&mdev->data.mutex);
1968 sock = mdev->data.socket;
1969
1970 if (likely(sock != NULL)) {
1971 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1972
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001973 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001974
1975 /* initialize verify_alg and csums_alg */
1976 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1977
1978 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001979 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1980 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1981 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1982 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001983
1984 if (apv >= 88)
1985 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1986 if (apv >= 89)
1987 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1988
1989 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1990 } else
1991 rv = 0; /* not ok */
1992
1993 mutex_unlock(&mdev->data.mutex);
1994
1995 return rv;
1996}
1997
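/* Tell the peer which wire protocol and after-split-brain policies we use.
 * The integrity_alg string is only appended for apv >= 87, and the DRY_RUN
 * connection flag is only understood by peers with apv >= 92; for older
 * peers a requested dry-run aborts with -1 instead. */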
1998int drbd_send_protocol(struct drbd_conf *mdev)
1999{
2000 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002001 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002002
2003 size = sizeof(struct p_protocol);
2004
2005 if (mdev->agreed_pro_version >= 87)
2006 size += strlen(mdev->net_conf->integrity_alg) + 1;
2007
2008 /* we must not recurse into our own queue,
2009 * as that is blocked during handshake */
2010 p = kmalloc(size, GFP_NOIO);
2011 if (p == NULL)
2012 return 0;
2013
2014 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2015 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2016 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2017 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002018 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2019
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002020 cf = 0;
2021 if (mdev->net_conf->want_lose)
2022 cf |= CF_WANT_LOSE;
2023 if (mdev->net_conf->dry_run) {
2024 if (mdev->agreed_pro_version >= 92)
2025 cf |= CF_DRY_RUN;
2026 else {
2027 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02002028 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002029 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002030 }
2031 }
2032 p->conn_flags = cpu_to_be32(cf);
2033
Philipp Reisnerb411b362009-09-25 16:07:19 -07002034 if (mdev->agreed_pro_version >= 87)
2035 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2036
2037 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002038 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002039 kfree(p);
2040 return rv;
2041}
2042
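/* Send our UUID set (current, bitmap, history) together with the current
 * weight of the bitmap and a few flag bits (want_lose, crashed primary,
 * disk will be inconsistent) to the peer.  Only does anything if the local
 * disk is at least D_NEGOTIATING. */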
2043int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2044{
2045 struct p_uuids p;
2046 int i;
2047
2048 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2049 return 1;
2050
2051 for (i = UI_CURRENT; i < UI_SIZE; i++)
2052 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2053
2054 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2055 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2056 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2057 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2058 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2059 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2060
2061 put_ldev(mdev);
2062
2063 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002064 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002065}
2066
2067int drbd_send_uuids(struct drbd_conf *mdev)
2068{
2069 return _drbd_send_uuids(mdev, 0);
2070}
2071
2072int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2073{
2074 return _drbd_send_uuids(mdev, 8);
2075}
2076
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002077void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2078{
2079 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2080 u64 *uuid = mdev->ldev->md.uuid;
2081 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2082 text,
2083 (unsigned long long)uuid[UI_CURRENT],
2084 (unsigned long long)uuid[UI_BITMAP],
2085 (unsigned long long)uuid[UI_HISTORY_START],
2086 (unsigned long long)uuid[UI_HISTORY_END]);
2087 put_ldev(mdev);
2088 } else {
2089 dev_info(DEV, "%s effective data uuid: %016llX\n",
2090 text,
2091 (unsigned long long)mdev->ed_uuid);
2092 }
2093}
2094
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002095int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002096{
2097 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002098 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002099
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002100 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2101
Philipp Reisner4a23f262011-01-11 17:42:17 +01002102 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002103 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002104 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002105 drbd_md_sync(mdev);
2106 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002107
2108 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002109 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002110}
2111
Philipp Reisnere89b5912010-03-24 17:11:33 +01002112int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002113{
2114 struct p_sizes p;
2115 sector_t d_size, u_size;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002116 int q_order_type, max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002117 int ok;
2118
2119 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2120 D_ASSERT(mdev->ldev->backing_bdev);
2121 d_size = drbd_get_max_capacity(mdev->ldev);
2122 u_size = mdev->ldev->dc.disk_size;
2123 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002124 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2125 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002126 put_ldev(mdev);
2127 } else {
2128 d_size = 0;
2129 u_size = 0;
2130 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002131 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002132 }
2133
Philipp Reisner68093842011-06-30 15:43:06 +02002134 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2135 if (mdev->agreed_pro_version <= 94)
2136 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2137
Philipp Reisnerb411b362009-09-25 16:07:19 -07002138 p.d_size = cpu_to_be64(d_size);
2139 p.u_size = cpu_to_be64(u_size);
2140 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002141 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002142 p.queue_order_type = cpu_to_be16(q_order_type);
2143 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002144
2145 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002146 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002147 return ok;
2148}
2149
2150/**
2151 * drbd_send_state() - Sends the drbd state to the peer
2152 * @mdev: DRBD device.
2153 */
2154int drbd_send_state(struct drbd_conf *mdev)
2155{
2156 struct socket *sock;
2157 struct p_state p;
2158 int ok = 0;
2159
2160	/* Grab state lock so we won't send state if we're in the middle
2161	 * of a cluster-wide state change on another thread */
2162 drbd_state_lock(mdev);
2163
2164 mutex_lock(&mdev->data.mutex);
2165
2166 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2167 sock = mdev->data.socket;
2168
2169 if (likely(sock != NULL)) {
2170 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002171 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002172 }
2173
2174 mutex_unlock(&mdev->data.mutex);
2175
2176 drbd_state_unlock(mdev);
2177 return ok;
2178}
2179
2180int drbd_send_state_req(struct drbd_conf *mdev,
2181 union drbd_state mask, union drbd_state val)
2182{
2183 struct p_req_state p;
2184
2185 p.mask = cpu_to_be32(mask.i);
2186 p.val = cpu_to_be32(val.i);
2187
2188 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002189 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002190}
2191
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002192int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002193{
2194 struct p_req_state_reply p;
2195
2196 p.retcode = cpu_to_be32(retcode);
2197
2198 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002199 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002200}
2201
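/* Try to compress a chunk of the bitmap into @p using run length encoding
 * plus variable length integer (VLI) encoding of the run lengths, alternating
 * between runs of set and unset bits.  The start value is stored in the
 * encoding byte; the bit offset is implicit.  Roughly: a chunk starting with
 * 5 set bits followed by 12 cleared bits is sent as start=1 plus the VLI
 * codes for 5 and 12.  Returns the number of code bytes on success, 0 if the
 * chunk did not compress (or RLE is disabled / the peer is too old), and -1
 * on an unexpected zero run length. */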
2202int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2203 struct p_compressed_bm *p,
2204 struct bm_xfer_ctx *c)
2205{
2206 struct bitstream bs;
2207 unsigned long plain_bits;
2208 unsigned long tmp;
2209 unsigned long rl;
2210 unsigned len;
2211 unsigned toggle;
2212 int bits;
2213
2214 /* may we use this feature? */
2215 if ((mdev->sync_conf.use_rle == 0) ||
2216 (mdev->agreed_pro_version < 90))
2217 return 0;
2218
2219 if (c->bit_offset >= c->bm_bits)
2220 return 0; /* nothing to do. */
2221
2222	/* use at most this many bytes */
2223 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2224 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2225 /* plain bits covered in this code string */
2226 plain_bits = 0;
2227
2228 /* p->encoding & 0x80 stores whether the first run length is set.
2229 * bit offset is implicit.
2230 * start with toggle == 2 to be able to tell the first iteration */
2231 toggle = 2;
2232
2233	/* see how many plain bits we can stuff into one packet
2234 * using RLE and VLI. */
2235 do {
2236 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2237 : _drbd_bm_find_next(mdev, c->bit_offset);
2238 if (tmp == -1UL)
2239 tmp = c->bm_bits;
2240 rl = tmp - c->bit_offset;
2241
2242 if (toggle == 2) { /* first iteration */
2243 if (rl == 0) {
2244 /* the first checked bit was set,
2245 * store start value, */
2246 DCBP_set_start(p, 1);
2247 /* but skip encoding of zero run length */
2248 toggle = !toggle;
2249 continue;
2250 }
2251 DCBP_set_start(p, 0);
2252 }
2253
2254 /* paranoia: catch zero runlength.
2255 * can only happen if bitmap is modified while we scan it. */
2256 if (rl == 0) {
2257 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2258 "t:%u bo:%lu\n", toggle, c->bit_offset);
2259 return -1;
2260 }
2261
2262 bits = vli_encode_bits(&bs, rl);
2263 if (bits == -ENOBUFS) /* buffer full */
2264 break;
2265 if (bits <= 0) {
2266 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2267 return 0;
2268 }
2269
2270 toggle = !toggle;
2271 plain_bits += rl;
2272 c->bit_offset = tmp;
2273 } while (c->bit_offset < c->bm_bits);
2274
2275 len = bs.cur.b - p->code + !!bs.cur.bit;
2276
2277 if (plain_bits < (len << 3)) {
2278 /* incompressible with this method.
2279 * we need to rewind both word and bit position. */
2280 c->bit_offset -= plain_bits;
2281 bm_xfer_ctx_bit_to_word_offset(c);
2282 c->bit_offset = c->word_offset * BITS_PER_LONG;
2283 return 0;
2284 }
2285
2286 /* RLE + VLI was able to compress it just fine.
2287 * update c->word_offset. */
2288 bm_xfer_ctx_bit_to_word_offset(c);
2289
2290 /* store pad_bits */
2291 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2292
2293 return len;
2294}
2295
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002296/**
2297 * send_bitmap_rle_or_plain
2298 *
2299 * Return 0 when done, 1 when another iteration is needed, and a negative error
2300 * code upon failure.
2301 */
2302static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002303send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002304 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002305{
2306 struct p_compressed_bm *p = (void*)h;
2307 unsigned long num_words;
2308 int len;
2309 int ok;
2310
2311 len = fill_bitmap_rle_bits(mdev, p, c);
2312
2313 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002314 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002315
2316 if (len) {
2317 DCBP_set_code(p, RLE_VLI_Bits);
2318 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2319 sizeof(*p) + len, 0);
2320
2321 c->packets[0]++;
2322 c->bytes[0] += sizeof(*p) + len;
2323
2324 if (c->bit_offset >= c->bm_bits)
2325 len = 0; /* DONE */
2326 } else {
2327 /* was not compressible.
2328 * send a buffer full of plain text bits instead. */
2329 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2330 len = num_words * sizeof(long);
2331 if (len)
2332 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2333 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002334 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002335 c->word_offset += num_words;
2336 c->bit_offset = c->word_offset * BITS_PER_LONG;
2337
2338 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002339 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002340
2341 if (c->bit_offset > c->bm_bits)
2342 c->bit_offset = c->bm_bits;
2343 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002344 if (ok) {
2345 if (len == 0) {
2346 INFO_bm_xfer_stats(mdev, "send", c);
2347 return 0;
2348 } else
2349 return 1;
2350 }
2351 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002352}
2353
2354/* See the comment at receive_bitmap() */
2355int _drbd_send_bitmap(struct drbd_conf *mdev)
2356{
2357 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002358 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002359 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002360
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002361 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002362
2363 /* maybe we should use some per thread scratch page,
2364 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002365 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002366 if (!p) {
2367 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002368 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002369 }
2370
2371 if (get_ldev(mdev)) {
2372 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2373 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2374 drbd_bm_set_all(mdev);
2375 if (drbd_bm_write(mdev)) {
2376			/* write_bm did fail! Leave full sync flag set in meta data
2377 * but otherwise process as per normal - need to tell other
2378 * side that a full resync is required! */
2379 dev_err(DEV, "Failed to write bitmap to disk!\n");
2380 } else {
2381 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2382 drbd_md_sync(mdev);
2383 }
2384 }
2385 put_ldev(mdev);
2386 }
2387
2388 c = (struct bm_xfer_ctx) {
2389 .bm_bits = drbd_bm_bits(mdev),
2390 .bm_words = drbd_bm_words(mdev),
2391 };
2392
2393 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002394 err = send_bitmap_rle_or_plain(mdev, p, &c);
2395 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002396
2397 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002398 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002399}
2400
2401int drbd_send_bitmap(struct drbd_conf *mdev)
2402{
2403 int err;
2404
2405 if (!drbd_get_data_sock(mdev))
2406 return -1;
2407 err = !_drbd_send_bitmap(mdev);
2408 drbd_put_data_sock(mdev);
2409 return err;
2410}
2411
2412int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2413{
2414 int ok;
2415 struct p_barrier_ack p;
2416
2417 p.barrier = barrier_nr;
2418 p.set_size = cpu_to_be32(set_size);
2419
2420 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002421 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002422 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002423 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002424 return ok;
2425}
2426
2427/**
2428 * _drbd_send_ack() - Sends an ack packet
2429 * @mdev: DRBD device.
2430 * @cmd: Packet command code.
2431 * @sector: sector, needs to be in big endian byte order
2432 * @blksize: size in byte, needs to be in big endian byte order
2433 * @block_id: Id, big endian byte order
2434 */
2435static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2436 u64 sector,
2437 u32 blksize,
2438 u64 block_id)
2439{
2440 int ok;
2441 struct p_block_ack p;
2442
2443 p.sector = sector;
2444 p.block_id = block_id;
2445 p.blksize = blksize;
2446 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2447
2448 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002449 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002450 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002451 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002452 return ok;
2453}
2454
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002455/* dp->sector and dp->block_id already/still in network byte order,
2456 * data_size is payload size according to dp->head,
2457 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002458int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002459 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002460{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002461 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2462 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002463 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2464 dp->block_id);
2465}
2466
2467int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2468 struct p_block_req *rp)
2469{
2470 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2471}
2472
2473/**
2474 * drbd_send_ack() - Sends an ack packet
2475 * @mdev: DRBD device.
2476 * @cmd: Packet command code.
2477 * @e: Epoch entry.
2478 */
2479int drbd_send_ack(struct drbd_conf *mdev,
2480 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2481{
2482 return _drbd_send_ack(mdev, cmd,
2483 cpu_to_be64(e->sector),
2484 cpu_to_be32(e->size),
2485 e->block_id);
2486}
2487
2488/* This function misuses the block_id field to signal if the blocks
2489 * are in sync or not. */
2490int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2491 sector_t sector, int blksize, u64 block_id)
2492{
2493 return _drbd_send_ack(mdev, cmd,
2494 cpu_to_be64(sector),
2495 cpu_to_be32(blksize),
2496 cpu_to_be64(block_id));
2497}
2498
2499int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2500 sector_t sector, int size, u64 block_id)
2501{
2502 int ok;
2503 struct p_block_req p;
2504
2505 p.sector = cpu_to_be64(sector);
2506 p.block_id = block_id;
2507 p.blksize = cpu_to_be32(size);
2508
2509 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002510 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002511 return ok;
2512}
2513
2514int drbd_send_drequest_csum(struct drbd_conf *mdev,
2515 sector_t sector, int size,
2516 void *digest, int digest_size,
2517 enum drbd_packets cmd)
2518{
2519 int ok;
2520 struct p_block_req p;
2521
2522 p.sector = cpu_to_be64(sector);
2523 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2524 p.blksize = cpu_to_be32(size);
2525
2526 p.head.magic = BE_DRBD_MAGIC;
2527 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002528 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002529
2530 mutex_lock(&mdev->data.mutex);
2531
2532 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2533 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2534
2535 mutex_unlock(&mdev->data.mutex);
2536
2537 return ok;
2538}
2539
2540int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2541{
2542 int ok;
2543 struct p_block_req p;
2544
2545 p.sector = cpu_to_be64(sector);
2546 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2547 p.blksize = cpu_to_be32(size);
2548
2549 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002550 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002551 return ok;
2552}
2553
2554/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002555 * returns false if we should retry,
2556 * true if we think the connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002557 */
2558static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2559{
2560 int drop_it;
2561 /* long elapsed = (long)(jiffies - mdev->last_received); */
2562
2563 drop_it = mdev->meta.socket == sock
2564 || !mdev->asender.task
2565 || get_t_state(&mdev->asender) != Running
2566 || mdev->state.conn < C_CONNECTED;
2567
2568 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002569 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002570
2571 drop_it = !--mdev->ko_count;
2572 if (!drop_it) {
2573 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2574 current->comm, current->pid, mdev->ko_count);
2575 request_ping(mdev);
2576 }
2577
2578 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2579}
2580
2581/* The idea of sendpage seems to be to put some kind of reference
2582 * to the page into the skb, and to hand it over to the NIC. In
2583 * this process get_page() gets called.
2584 *
2585 * As soon as the page was really sent over the network put_page()
2586 * gets called by some part of the network layer. [ NIC driver? ]
2587 *
2588 * [ get_page() / put_page() increment/decrement the count. If count
2589 * reaches 0 the page will be freed. ]
2590 *
2591 * This works nicely with pages from FSs.
2592 * But this means that in protocol A we might signal IO completion too early!
2593 *
2594 * In order not to corrupt data during a resync we must make sure
2595 * that we do not reuse our own buffer pages (EEs) too early, therefore
2596 * we have the net_ee list.
2597 *
2598 * XFS still seems to have problems: it submits pages with page_count == 0!
2599 * As a workaround, we disable sendpage on pages
2600 * with page_count == 0 or PageSlab.
2601 */
2602static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002603 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002604{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002605 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002606 kunmap(page);
2607 if (sent == size)
2608 mdev->send_cnt += size>>9;
2609 return sent == size;
2610}
2611
2612static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002613 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002614{
2615 mm_segment_t oldfs = get_fs();
2616 int sent, ok;
2617 int len = size;
2618
2619 /* e.g. XFS meta- & log-data is in slab pages, which have a
2620 * page_count of 0 and/or have PageSlab() set.
2621 * we cannot use send_page for those, as that does get_page();
2622 * put_page(); and would cause either a VM_BUG directly, or
2623 * __page_cache_release a page that would actually still be referenced
2624 * by someone, leading to some obscure delayed Oops somewhere else. */
2625 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002626 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002627
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002628 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002629 drbd_update_congested(mdev);
2630 set_fs(KERNEL_DS);
2631 do {
2632 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2633 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002634 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002635 if (sent == -EAGAIN) {
2636 if (we_should_drop_the_connection(mdev,
2637 mdev->data.socket))
2638 break;
2639 else
2640 continue;
2641 }
2642 if (sent <= 0) {
2643 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2644 __func__, (int)size, len, sent);
2645 break;
2646 }
2647 len -= sent;
2648 offset += sent;
2649 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2650 set_fs(oldfs);
2651 clear_bit(NET_CONGESTED, &mdev->flags);
2652
2653 ok = (len == 0);
2654 if (likely(ok))
2655 mdev->send_cnt += size>>9;
2656 return ok;
2657}
2658
2659static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2660{
2661 struct bio_vec *bvec;
2662 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002663 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002664 __bio_for_each_segment(bvec, bio, i, 0) {
2665 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002666 bvec->bv_offset, bvec->bv_len,
2667 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002668 return 0;
2669 }
2670 return 1;
2671}
2672
2673static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2674{
2675 struct bio_vec *bvec;
2676 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002677 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002678 __bio_for_each_segment(bvec, bio, i, 0) {
2679 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002680 bvec->bv_offset, bvec->bv_len,
2681 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002682 return 0;
2683 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002684 return 1;
2685}
2686
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002687static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2688{
2689 struct page *page = e->pages;
2690 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002691 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002692 page_chain_for_each(page) {
2693 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002694 if (!_drbd_send_page(mdev, page, 0, l,
2695 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002696 return 0;
2697 len -= l;
2698 }
2699 return 1;
2700}
2701
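/* Translate the REQ_* flags of a bio into the DP_* flags we put on the wire.
 * Peers with apv >= 95 understand FUA, FLUSH and DISCARD; for older peers
 * only the SYNC hint is forwarded. */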
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002702static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2703{
2704 if (mdev->agreed_pro_version >= 95)
2705 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002706 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2707 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2708 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2709 else
Jens Axboe721a9602011-03-09 11:56:30 +01002710 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002711}
2712
Philipp Reisnerb411b362009-09-25 16:07:19 -07002713/* Used to send write requests
2714 * R_PRIMARY -> Peer (P_DATA)
2715 */
2716int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2717{
2718 int ok = 1;
2719 struct p_data p;
2720 unsigned int dp_flags = 0;
2721 void *dgb;
2722 int dgs;
2723
2724 if (!drbd_get_data_sock(mdev))
2725 return 0;
2726
2727 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2728 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2729
Philipp Reisnerd5373382010-08-23 15:18:33 +02002730 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002731 p.head.h80.magic = BE_DRBD_MAGIC;
2732 p.head.h80.command = cpu_to_be16(P_DATA);
2733 p.head.h80.length =
2734 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2735 } else {
2736 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2737 p.head.h95.command = cpu_to_be16(P_DATA);
2738 p.head.h95.length =
2739 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2740 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002741
2742 p.sector = cpu_to_be64(req->sector);
2743 p.block_id = (unsigned long)req;
2744 p.seq_num = cpu_to_be32(req->seq_num =
2745 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002746
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002747 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2748
Philipp Reisnerb411b362009-09-25 16:07:19 -07002749 if (mdev->state.conn >= C_SYNC_SOURCE &&
2750 mdev->state.conn <= C_PAUSED_SYNC_T)
2751 dp_flags |= DP_MAY_SET_IN_SYNC;
2752
2753 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002754 set_bit(UNPLUG_REMOTE, &mdev->flags);
2755 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002756 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002757 if (ok && dgs) {
2758 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002759 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002760 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002761 }
2762 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002763 /* For protocol A, we have to memcpy the payload into
2764 * socket buffers, as we may complete right away
2765 * as soon as we handed it over to tcp, at which point the data
2766 * pages may become invalid.
2767 *
2768 * For data-integrity enabled, we copy it as well, so we can be
2769 * sure that even if the bio pages may still be modified, it
2770 * won't change the data on the wire, thus if the digest checks
2771 * out ok after sending on this side, but does not fit on the
2772 * receiving side, we sure have detected corruption elsewhere.
2773 */
2774 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002775 ok = _drbd_send_bio(mdev, req->master_bio);
2776 else
2777 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002778
2779 /* double check digest, sometimes buffers have been modified in flight. */
2780 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002781 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002782 * currently supported in kernel crypto. */
2783 unsigned char digest[64];
2784 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2785 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2786 dev_warn(DEV,
2787 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2788 (unsigned long long)req->sector, req->size);
2789 }
2790 } /* else if (dgs > 64) {
2791 ... Be noisy about digest too large ...
2792 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002793 }
2794
2795 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002796
Philipp Reisnerb411b362009-09-25 16:07:19 -07002797 return ok;
2798}
2799
2800/* answer packet, used to send data back for read requests:
2801 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2802 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2803 */
2804int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2805 struct drbd_epoch_entry *e)
2806{
2807 int ok;
2808 struct p_data p;
2809 void *dgb;
2810 int dgs;
2811
2812 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2813 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2814
Philipp Reisnerd5373382010-08-23 15:18:33 +02002815 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002816 p.head.h80.magic = BE_DRBD_MAGIC;
2817 p.head.h80.command = cpu_to_be16(cmd);
2818 p.head.h80.length =
2819 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2820 } else {
2821 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2822 p.head.h95.command = cpu_to_be16(cmd);
2823 p.head.h95.length =
2824 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2825 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002826
2827 p.sector = cpu_to_be64(e->sector);
2828 p.block_id = e->block_id;
2829 /* p.seq_num = 0; No sequence numbers here.. */
2830
2831 /* Only called by our kernel thread.
2832 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2833 * in response to admin command or module unload.
2834 */
2835 if (!drbd_get_data_sock(mdev))
2836 return 0;
2837
Philipp Reisner0b70a132010-08-20 13:36:10 +02002838 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002839 if (ok && dgs) {
2840 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002841 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002842 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002843 }
2844 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002845 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002846
2847 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002848
Philipp Reisnerb411b362009-09-25 16:07:19 -07002849 return ok;
2850}
2851
Philipp Reisner73a01a12010-10-27 14:33:00 +02002852int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2853{
2854 struct p_block_desc p;
2855
2856 p.sector = cpu_to_be64(req->sector);
2857 p.blksize = cpu_to_be32(req->size);
2858
2859 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2860}
2861
Philipp Reisnerb411b362009-09-25 16:07:19 -07002862/*
2863 drbd_send distinguishes two cases:
2864
2865 Packets sent via the data socket "sock"
2866 and packets sent via the meta data socket "msock"
2867
2868                     sock                      msock
2869  -----------------+-------------------------+------------------------------
2870  timeout           conf.timeout / 2          conf.timeout / 2
2871  timeout action    send a ping via msock     Abort communication
2872                                              and close all sockets
2873*/
2874
2875/*
2876 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2877 */
2878int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2879 void *buf, size_t size, unsigned msg_flags)
2880{
2881 struct kvec iov;
2882 struct msghdr msg;
2883 int rv, sent = 0;
2884
2885 if (!sock)
2886 return -1000;
2887
2888 /* THINK if (signal_pending) return ... ? */
2889
2890 iov.iov_base = buf;
2891 iov.iov_len = size;
2892
2893 msg.msg_name = NULL;
2894 msg.msg_namelen = 0;
2895 msg.msg_control = NULL;
2896 msg.msg_controllen = 0;
2897 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2898
2899 if (sock == mdev->data.socket) {
2900 mdev->ko_count = mdev->net_conf->ko_count;
2901 drbd_update_congested(mdev);
2902 }
2903 do {
2904 /* STRANGE
2905 * tcp_sendmsg does _not_ use its size parameter at all ?
2906 *
2907 * -EAGAIN on timeout, -EINTR on signal.
2908 */
2909/* THINK
2910 * do we need to block DRBD_SIG if sock == &meta.socket ??
2911 * otherwise wake_asender() might interrupt some send_*Ack !
2912 */
2913 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2914 if (rv == -EAGAIN) {
2915 if (we_should_drop_the_connection(mdev, sock))
2916 break;
2917 else
2918 continue;
2919 }
2920 D_ASSERT(rv != 0);
2921 if (rv == -EINTR) {
2922 flush_signals(current);
2923 rv = 0;
2924 }
2925 if (rv < 0)
2926 break;
2927 sent += rv;
2928 iov.iov_base += rv;
2929 iov.iov_len -= rv;
2930 } while (sent < size);
2931
2932 if (sock == mdev->data.socket)
2933 clear_bit(NET_CONGESTED, &mdev->flags);
2934
2935 if (rv <= 0) {
2936 if (rv != -EAGAIN) {
2937 dev_err(DEV, "%s_sendmsg returned %d\n",
2938 sock == mdev->meta.socket ? "msock" : "sock",
2939 rv);
2940 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2941 } else
2942 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2943 }
2944
2945 return sent;
2946}
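
/*
 * Example (illustrative sketch, not part of the driver): the typical calling
 * pattern for drbd_send() on the data socket, using the names from the sender
 * above -- take the data socket via drbd_get_data_sock() (which acquires
 * mdev->data.mutex), send the fixed-size part with MSG_MORE when a digest or
 * payload follows, then release the socket again:
 *
 *	if (!drbd_get_data_sock(mdev))
 *		return 0;
 *	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p),
 *				    dgs ? MSG_MORE : 0);
 *	if (ok && dgs)
 *		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
 *	drbd_put_data_sock(mdev);
 *
 * The timeout policy from the table above is handled inside drbd_send()
 * itself (we_should_drop_the_connection(), drbd_force_state() to C_TIMEOUT
 * or C_BROKEN_PIPE); callers only compare the returned byte count against
 * the size they asked for.
 */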
2947
2948static int drbd_open(struct block_device *bdev, fmode_t mode)
2949{
2950 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2951 unsigned long flags;
2952 int rv = 0;
2953
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002954 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002955 spin_lock_irqsave(&mdev->req_lock, flags);
2956 /* to have a stable mdev->state.role
2957 * and no race with updating open_cnt */
2958
2959 if (mdev->state.role != R_PRIMARY) {
2960 if (mode & FMODE_WRITE)
2961 rv = -EROFS;
2962 else if (!allow_oos)
2963 rv = -EMEDIUMTYPE;
2964 }
2965
2966 if (!rv)
2967 mdev->open_cnt++;
2968 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002969 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002970
2971 return rv;
2972}
2973
2974static int drbd_release(struct gendisk *gd, fmode_t mode)
2975{
2976 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002977 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002978 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002979 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002980 return 0;
2981}
2982
Philipp Reisnerb411b362009-09-25 16:07:19 -07002983static void drbd_set_defaults(struct drbd_conf *mdev)
2984{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002985 /* This way we get a compile error when sync_conf grows
2986 and we forget to initialize a new member here */
2987 mdev->sync_conf = (struct syncer_conf) {
2988 /* .rate = */ DRBD_RATE_DEF,
2989 /* .after = */ DRBD_AFTER_DEF,
2990 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002991 /* .verify_alg = */ {}, 0,
2992 /* .cpu_mask = */ {}, 0,
2993 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002994 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002995 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2996 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2997 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2998 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002999 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3000 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02003001 };
3002
3003 /* Have to do it this way, because the layout differs between
3004 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003005 mdev->state = (union drbd_state) {
3006 { .role = R_SECONDARY,
3007 .peer = R_UNKNOWN,
3008 .conn = C_STANDALONE,
3009 .disk = D_DISKLESS,
3010 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02003011 .susp = 0,
3012 .susp_nod = 0,
3013 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07003014 } };
3015}
3016
3017void drbd_init_set_defaults(struct drbd_conf *mdev)
3018{
3019 /* the memset(,0,) did most of this.
3020 * note: only assignments, no allocation in here */
3021
3022 drbd_set_defaults(mdev);
3023
Philipp Reisnerb411b362009-09-25 16:07:19 -07003024 atomic_set(&mdev->ap_bio_cnt, 0);
3025 atomic_set(&mdev->ap_pending_cnt, 0);
3026 atomic_set(&mdev->rs_pending_cnt, 0);
3027 atomic_set(&mdev->unacked_cnt, 0);
3028 atomic_set(&mdev->local_cnt, 0);
3029 atomic_set(&mdev->net_cnt, 0);
3030 atomic_set(&mdev->packet_seq, 0);
3031 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003032 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02003033 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003034 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02003035 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003036
3037 mutex_init(&mdev->md_io_mutex);
3038 mutex_init(&mdev->data.mutex);
3039 mutex_init(&mdev->meta.mutex);
3040 sema_init(&mdev->data.work.s, 0);
3041 sema_init(&mdev->meta.work.s, 0);
3042 mutex_init(&mdev->state_mutex);
3043
3044 spin_lock_init(&mdev->data.work.q_lock);
3045 spin_lock_init(&mdev->meta.work.q_lock);
3046
3047 spin_lock_init(&mdev->al_lock);
3048 spin_lock_init(&mdev->req_lock);
3049 spin_lock_init(&mdev->peer_seq_lock);
3050 spin_lock_init(&mdev->epoch_lock);
3051
3052 INIT_LIST_HEAD(&mdev->active_ee);
3053 INIT_LIST_HEAD(&mdev->sync_ee);
3054 INIT_LIST_HEAD(&mdev->done_ee);
3055 INIT_LIST_HEAD(&mdev->read_ee);
3056 INIT_LIST_HEAD(&mdev->net_ee);
3057 INIT_LIST_HEAD(&mdev->resync_reads);
3058 INIT_LIST_HEAD(&mdev->data.work.q);
3059 INIT_LIST_HEAD(&mdev->meta.work.q);
3060 INIT_LIST_HEAD(&mdev->resync_work.list);
3061 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003062 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003063 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003064 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003065 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003066
Philipp Reisner794abb72010-12-27 11:51:23 +01003067 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003068 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003069 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003070 mdev->md_sync_work.cb = w_md_sync;
3071 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003072 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003073 init_timer(&mdev->resync_timer);
3074 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003075 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003076 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003077 mdev->resync_timer.function = resync_timer_fn;
3078 mdev->resync_timer.data = (unsigned long) mdev;
3079 mdev->md_sync_timer.function = md_sync_timer_fn;
3080 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003081 mdev->start_resync_timer.function = start_resync_timer_fn;
3082 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003083 mdev->request_timer.function = request_timer_fn;
3084 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003085
3086 init_waitqueue_head(&mdev->misc_wait);
3087 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003088 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003089 init_waitqueue_head(&mdev->ee_wait);
3090 init_waitqueue_head(&mdev->al_wait);
3091 init_waitqueue_head(&mdev->seq_wait);
3092
3093 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3094 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3095 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3096
3097 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003098 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003099 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003100 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3101 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003102}
3103
3104void drbd_mdev_cleanup(struct drbd_conf *mdev)
3105{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003106 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003107 if (mdev->receiver.t_state != None)
3108 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3109 mdev->receiver.t_state);
3110
3111 /* no need to lock it, I'm the only thread alive */
3112 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3113 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3114 mdev->al_writ_cnt =
3115 mdev->bm_writ_cnt =
3116 mdev->read_cnt =
3117 mdev->recv_cnt =
3118 mdev->send_cnt =
3119 mdev->writ_cnt =
3120 mdev->p_size =
3121 mdev->rs_start =
3122 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003123 mdev->rs_failed = 0;
3124 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003125 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003126 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3127 mdev->rs_mark_left[i] = 0;
3128 mdev->rs_mark_time[i] = 0;
3129 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003130 D_ASSERT(mdev->net_conf == NULL);
3131
3132 drbd_set_my_capacity(mdev, 0);
3133 if (mdev->bitmap) {
3134 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003135 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003136 drbd_bm_cleanup(mdev);
3137 }
3138
3139 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003140 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003141
3142 /*
3143 * currently we call drbd_init_ee only on module load, so
3144 * we may call drbd_release_ee only on module unload!
3145 */
3146 D_ASSERT(list_empty(&mdev->active_ee));
3147 D_ASSERT(list_empty(&mdev->sync_ee));
3148 D_ASSERT(list_empty(&mdev->done_ee));
3149 D_ASSERT(list_empty(&mdev->read_ee));
3150 D_ASSERT(list_empty(&mdev->net_ee));
3151 D_ASSERT(list_empty(&mdev->resync_reads));
3152 D_ASSERT(list_empty(&mdev->data.work.q));
3153 D_ASSERT(list_empty(&mdev->meta.work.q));
3154 D_ASSERT(list_empty(&mdev->resync_work.list));
3155 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003156 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003157
3158 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003159}
3160
3161
3162static void drbd_destroy_mempools(void)
3163{
3164 struct page *page;
3165
3166 while (drbd_pp_pool) {
3167 page = drbd_pp_pool;
3168 drbd_pp_pool = (struct page *)page_private(page);
3169 __free_page(page);
3170 drbd_pp_vacant--;
3171 }
3172
3173 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3174
3175 if (drbd_ee_mempool)
3176 mempool_destroy(drbd_ee_mempool);
3177 if (drbd_request_mempool)
3178 mempool_destroy(drbd_request_mempool);
3179 if (drbd_ee_cache)
3180 kmem_cache_destroy(drbd_ee_cache);
3181 if (drbd_request_cache)
3182 kmem_cache_destroy(drbd_request_cache);
3183 if (drbd_bm_ext_cache)
3184 kmem_cache_destroy(drbd_bm_ext_cache);
3185 if (drbd_al_ext_cache)
3186 kmem_cache_destroy(drbd_al_ext_cache);
3187
3188 drbd_ee_mempool = NULL;
3189 drbd_request_mempool = NULL;
3190 drbd_ee_cache = NULL;
3191 drbd_request_cache = NULL;
3192 drbd_bm_ext_cache = NULL;
3193 drbd_al_ext_cache = NULL;
3194
3195 return;
3196}
3197
3198static int drbd_create_mempools(void)
3199{
3200 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003201 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003202 int i;
3203
3204 /* prepare our caches and mempools */
3205 drbd_request_mempool = NULL;
3206 drbd_ee_cache = NULL;
3207 drbd_request_cache = NULL;
3208 drbd_bm_ext_cache = NULL;
3209 drbd_al_ext_cache = NULL;
3210 drbd_pp_pool = NULL;
3211
3212 /* caches */
3213 drbd_request_cache = kmem_cache_create(
3214 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3215 if (drbd_request_cache == NULL)
3216 goto Enomem;
3217
3218 drbd_ee_cache = kmem_cache_create(
3219 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3220 if (drbd_ee_cache == NULL)
3221 goto Enomem;
3222
3223 drbd_bm_ext_cache = kmem_cache_create(
3224 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3225 if (drbd_bm_ext_cache == NULL)
3226 goto Enomem;
3227
3228 drbd_al_ext_cache = kmem_cache_create(
3229 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3230 if (drbd_al_ext_cache == NULL)
3231 goto Enomem;
3232
3233 /* mempools */
3234 drbd_request_mempool = mempool_create(number,
3235 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3236 if (drbd_request_mempool == NULL)
3237 goto Enomem;
3238
3239 drbd_ee_mempool = mempool_create(number,
3240 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003241 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003242 goto Enomem;
3243
3244 /* drbd's page pool */
3245 spin_lock_init(&drbd_pp_lock);
3246
3247 for (i = 0; i < number; i++) {
3248 page = alloc_page(GFP_HIGHUSER);
3249 if (!page)
3250 goto Enomem;
3251 set_page_private(page, (unsigned long)drbd_pp_pool);
3252 drbd_pp_pool = page;
3253 }
3254 drbd_pp_vacant = number;
3255
3256 return 0;
3257
3258Enomem:
3259 drbd_destroy_mempools(); /* in case we allocated some */
3260 return -ENOMEM;
3261}
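
/*
 * Note on the page pool set up above (illustrative sketch): drbd_pp_pool is a
 * singly linked list of pages threaded through each page's page_private
 * field, there is no separate list head structure.  Pushing and popping a
 * page thus looks like this (the pop mirrors drbd_destroy_mempools()):
 *
 *	push:
 *	set_page_private(page, (unsigned long)drbd_pp_pool);
 *	drbd_pp_pool = page;
 *	drbd_pp_vacant++;
 *
 *	pop:
 *	page = drbd_pp_pool;
 *	drbd_pp_pool = (struct page *)page_private(page);
 *	drbd_pp_vacant--;
 *
 * The actual allocation/free paths are assumed to additionally take
 * drbd_pp_lock (initialized above) and to fall back to alloc_page() when the
 * pool runs empty; that code is outside this part of the file.
 */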
3262
3263static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3264 void *unused)
3265{
3266 /* just so we have it. you never know what interesting things we
3267 * might want to do here some day...
3268 */
3269
3270 return NOTIFY_DONE;
3271}
3272
3273static struct notifier_block drbd_notifier = {
3274 .notifier_call = drbd_notify_sys,
3275};
3276
3277static void drbd_release_ee_lists(struct drbd_conf *mdev)
3278{
3279 int rr;
3280
3281 rr = drbd_release_ee(mdev, &mdev->active_ee);
3282 if (rr)
3283 dev_err(DEV, "%d EEs in active list found!\n", rr);
3284
3285 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3286 if (rr)
3287 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3288
3289 rr = drbd_release_ee(mdev, &mdev->read_ee);
3290 if (rr)
3291 dev_err(DEV, "%d EEs in read list found!\n", rr);
3292
3293 rr = drbd_release_ee(mdev, &mdev->done_ee);
3294 if (rr)
3295 dev_err(DEV, "%d EEs in done list found!\n", rr);
3296
3297 rr = drbd_release_ee(mdev, &mdev->net_ee);
3298 if (rr)
3299 dev_err(DEV, "%d EEs in net list found!\n", rr);
3300}
3301
3302/* caution. no locking.
3303 * currently only used from module cleanup code. */
3304static void drbd_delete_device(unsigned int minor)
3305{
3306 struct drbd_conf *mdev = minor_to_mdev(minor);
3307
3308 if (!mdev)
3309 return;
3310
3311 /* paranoia asserts */
3312 if (mdev->open_cnt != 0)
3313 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3314 __FILE__ , __LINE__);
3315
3316 ERR_IF (!list_empty(&mdev->data.work.q)) {
3317 struct list_head *lp;
3318 list_for_each(lp, &mdev->data.work.q) {
3319 dev_err(DEV, "lp = %p\n", lp);
3320 }
3321 };
3322 /* end paranoia asserts */
3323
3324 del_gendisk(mdev->vdisk);
3325
3326 /* cleanup stuff that may have been allocated during
3327 * device (re-)configuration or state changes */
3328
3329 if (mdev->this_bdev)
3330 bdput(mdev->this_bdev);
3331
3332 drbd_free_resources(mdev);
3333
3334 drbd_release_ee_lists(mdev);
3335
Bart Van Assche24c48302011-05-21 18:32:29 +02003336 /* should be freed on disconnect? */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003337 kfree(mdev->ee_hash);
3338 /*
3339 mdev->ee_hash_s = 0;
3340 mdev->ee_hash = NULL;
3341 */
3342
3343 lc_destroy(mdev->act_log);
3344 lc_destroy(mdev->resync);
3345
3346 kfree(mdev->p_uuid);
3347 /* mdev->p_uuid = NULL; */
3348
3349 kfree(mdev->int_dig_out);
3350 kfree(mdev->int_dig_in);
3351 kfree(mdev->int_dig_vv);
3352
3353 /* cleanup the rest that has been
3354 * allocated from drbd_new_device
3355 * and actually free the mdev itself */
3356 drbd_free_mdev(mdev);
3357}
3358
3359static void drbd_cleanup(void)
3360{
3361 unsigned int i;
3362
3363 unregister_reboot_notifier(&drbd_notifier);
3364
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003365 /* first remove proc,
3366 * drbdsetup uses its presence to detect
3367 * whether DRBD is loaded.
3368 * If we got stuck in proc removal,
3369 * but have netlink already deregistered,
3370 * some drbdsetup commands may wait forever
3371 * for an answer.
3372 */
3373 if (drbd_proc)
3374 remove_proc_entry("drbd", NULL);
3375
Philipp Reisnerb411b362009-09-25 16:07:19 -07003376 drbd_nl_cleanup();
3377
3378 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003379 i = minor_count;
3380 while (i--)
3381 drbd_delete_device(i);
3382 drbd_destroy_mempools();
3383 }
3384
3385 kfree(minor_table);
3386
3387 unregister_blkdev(DRBD_MAJOR, "drbd");
3388
3389 printk(KERN_INFO "drbd: module cleanup done.\n");
3390}
3391
3392/**
3393 * drbd_congested() - Callback for pdflush
3394 * @congested_data: User data
3395 * @bdi_bits: Bits pdflush is currently interested in
3396 *
3397 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3398 */
3399static int drbd_congested(void *congested_data, int bdi_bits)
3400{
3401 struct drbd_conf *mdev = congested_data;
3402 struct request_queue *q;
3403 char reason = '-';
3404 int r = 0;
3405
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003406 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003407 /* DRBD has frozen IO */
3408 r = bdi_bits;
3409 reason = 'd';
3410 goto out;
3411 }
3412
3413 if (get_ldev(mdev)) {
3414 q = bdev_get_queue(mdev->ldev->backing_bdev);
3415 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3416 put_ldev(mdev);
3417 if (r)
3418 reason = 'b';
3419 }
3420
3421 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3422 r |= (1 << BDI_async_congested);
3423 reason = reason == 'b' ? 'a' : 'n';
3424 }
3425
3426out:
3427 mdev->congestion_reason = reason;
3428 return r;
3429}
3430
3431struct drbd_conf *drbd_new_device(unsigned int minor)
3432{
3433 struct drbd_conf *mdev;
3434 struct gendisk *disk;
3435 struct request_queue *q;
3436
3437 /* GFP_KERNEL, we are outside of all write-out paths */
3438 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3439 if (!mdev)
3440 return NULL;
3441 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3442 goto out_no_cpumask;
3443
3444 mdev->minor = minor;
3445
3446 drbd_init_set_defaults(mdev);
3447
3448 q = blk_alloc_queue(GFP_KERNEL);
3449 if (!q)
3450 goto out_no_q;
3451 mdev->rq_queue = q;
3452 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003453
3454 disk = alloc_disk(1);
3455 if (!disk)
3456 goto out_no_disk;
3457 mdev->vdisk = disk;
3458
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003459 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003460
3461 disk->queue = q;
3462 disk->major = DRBD_MAJOR;
3463 disk->first_minor = minor;
3464 disk->fops = &drbd_ops;
3465 sprintf(disk->disk_name, "drbd%d", minor);
3466 disk->private_data = mdev;
3467
3468 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3469 /* we have no partitions. we contain only ourselves. */
3470 mdev->this_bdev->bd_contains = mdev->this_bdev;
3471
3472 q->backing_dev_info.congested_fn = drbd_congested;
3473 q->backing_dev_info.congested_data = mdev;
3474
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003475 blk_queue_make_request(q, drbd_make_request);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003476 /* Setting max_hw_sectors to the odd value of 8 KiB here
3477 triggers a max_bio_size message upon first attach or connect */
3478 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003479 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3480 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003481 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003482
3483 mdev->md_io_page = alloc_page(GFP_KERNEL);
3484 if (!mdev->md_io_page)
3485 goto out_no_io_page;
3486
3487 if (drbd_bm_init(mdev))
3488 goto out_no_bitmap;
3489 /* no need to lock access, we are still initializing this minor device. */
3490 if (!tl_init(mdev))
3491 goto out_no_tl;
3492
3493 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3494 if (!mdev->app_reads_hash)
3495 goto out_no_app_reads;
3496
3497 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3498 if (!mdev->current_epoch)
3499 goto out_no_epoch;
3500
3501 INIT_LIST_HEAD(&mdev->current_epoch->list);
3502 mdev->epochs = 1;
3503
3504 return mdev;
3505
3506/* out_whatever_else:
3507 kfree(mdev->current_epoch); */
3508out_no_epoch:
3509 kfree(mdev->app_reads_hash);
3510out_no_app_reads:
3511 tl_cleanup(mdev);
3512out_no_tl:
3513 drbd_bm_cleanup(mdev);
3514out_no_bitmap:
3515 __free_page(mdev->md_io_page);
3516out_no_io_page:
3517 put_disk(disk);
3518out_no_disk:
3519 blk_cleanup_queue(q);
3520out_no_q:
3521 free_cpumask_var(mdev->cpu_mask);
3522out_no_cpumask:
3523 kfree(mdev);
3524 return NULL;
3525}
3526
3527/* counterpart of drbd_new_device.
3528 * last part of drbd_delete_device. */
3529void drbd_free_mdev(struct drbd_conf *mdev)
3530{
3531 kfree(mdev->current_epoch);
3532 kfree(mdev->app_reads_hash);
3533 tl_cleanup(mdev);
3534 if (mdev->bitmap) /* should no longer be there. */
3535 drbd_bm_cleanup(mdev);
3536 __free_page(mdev->md_io_page);
3537 put_disk(mdev->vdisk);
3538 blk_cleanup_queue(mdev->rq_queue);
3539 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003540 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003541 kfree(mdev);
3542}
3543
3544
3545int __init drbd_init(void)
3546{
3547 int err;
3548
3549 if (sizeof(struct p_handshake) != 80) {
3550 printk(KERN_ERR
3551 "drbd: never change the size or layout "
3552 "of the HandShake packet.\n");
3553 return -EINVAL;
3554 }
3555
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003556 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003557 printk(KERN_ERR
3558 "drbd: invalid minor_count (%d)\n", minor_count);
3559#ifdef MODULE
3560 return -EINVAL;
3561#else
3562 minor_count = 8;
3563#endif
3564 }
3565
3566 err = drbd_nl_init();
3567 if (err)
3568 return err;
3569
3570 err = register_blkdev(DRBD_MAJOR, "drbd");
3571 if (err) {
3572 printk(KERN_ERR
3573 "drbd: unable to register block device major %d\n",
3574 DRBD_MAJOR);
3575 return err;
3576 }
3577
3578 register_reboot_notifier(&drbd_notifier);
3579
3580 /*
3581 * allocate all necessary structs
3582 */
3583 err = -ENOMEM;
3584
3585 init_waitqueue_head(&drbd_pp_wait);
3586
3587 drbd_proc = NULL; /* play safe for drbd_cleanup */
3588 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3589 GFP_KERNEL);
3590 if (!minor_table)
3591 goto Enomem;
3592
3593 err = drbd_create_mempools();
3594 if (err)
3595 goto Enomem;
3596
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003597 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003598 if (!drbd_proc) {
3599 printk(KERN_ERR "drbd: unable to register proc file\n");
3600 goto Enomem;
3601 }
3602
3603 rwlock_init(&global_state_lock);
3604
3605 printk(KERN_INFO "drbd: initialized. "
3606 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3607 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3608 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3609 printk(KERN_INFO "drbd: registered as block device major %d\n",
3610 DRBD_MAJOR);
3611 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3612
3613 return 0; /* Success! */
3614
3615Enomem:
3616 drbd_cleanup();
3617 if (err == -ENOMEM)
3618 /* currently always the case */
3619 printk(KERN_ERR "drbd: ran out of memory\n");
3620 else
3621 printk(KERN_ERR "drbd: initialization failure\n");
3622 return err;
3623}
3624
3625void drbd_free_bc(struct drbd_backing_dev *ldev)
3626{
3627 if (ldev == NULL)
3628 return;
3629
Tejun Heoe525fd82010-11-13 11:55:17 +01003630 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3631 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003632
3633 kfree(ldev);
3634}
3635
3636void drbd_free_sock(struct drbd_conf *mdev)
3637{
3638 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003639 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003640 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3641 sock_release(mdev->data.socket);
3642 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003643 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003644 }
3645 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003646 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003647 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3648 sock_release(mdev->meta.socket);
3649 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003650 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003651 }
3652}
3653
3654
3655void drbd_free_resources(struct drbd_conf *mdev)
3656{
3657 crypto_free_hash(mdev->csums_tfm);
3658 mdev->csums_tfm = NULL;
3659 crypto_free_hash(mdev->verify_tfm);
3660 mdev->verify_tfm = NULL;
3661 crypto_free_hash(mdev->cram_hmac_tfm);
3662 mdev->cram_hmac_tfm = NULL;
3663 crypto_free_hash(mdev->integrity_w_tfm);
3664 mdev->integrity_w_tfm = NULL;
3665 crypto_free_hash(mdev->integrity_r_tfm);
3666 mdev->integrity_r_tfm = NULL;
3667
3668 drbd_free_sock(mdev);
3669
3670 __no_warn(local,
3671 drbd_free_bc(mdev->ldev);
3672 mdev->ldev = NULL;);
3673}
3674
3675/* meta data management */
3676
3677struct meta_data_on_disk {
3678 u64 la_size; /* last agreed size. */
3679 u64 uuid[UI_SIZE]; /* UUIDs. */
3680 u64 device_uuid;
3681 u64 reserved_u64_1;
3682 u32 flags; /* MDF */
3683 u32 magic;
3684 u32 md_size_sect;
3685 u32 al_offset; /* offset to this block */
3686 u32 al_nr_extents; /* important for restoring the AL */
3687 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3688 u32 bm_offset; /* offset to the bitmap, from here */
3689 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003690 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3691 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003692
3693} __packed;
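
/*
 * The super block above is serialized into the first 512 bytes of md_io_page:
 * drbd_md_sync() below memsets a 512 byte buffer and fills in the fields, so
 * the structure must never outgrow that.  A compile time guard one could add
 * (illustrative sketch, not present in the code shown here):
 *
 *	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
 *
 * All multi-byte fields are stored big-endian (cpu_to_be*() on write,
 * be*_to_cpu() on read), so the on-disk format does not depend on the
 * host's endianness.
 */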
3694
3695/**
3696 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3697 * @mdev: DRBD device.
3698 */
3699void drbd_md_sync(struct drbd_conf *mdev)
3700{
3701 struct meta_data_on_disk *buffer;
3702 sector_t sector;
3703 int i;
3704
Lars Ellenbergee15b032010-09-03 10:00:09 +02003705 del_timer(&mdev->md_sync_timer);
3706 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003707 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3708 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003709
3710 /* We use D_FAILED here and not D_ATTACHING because we try to write
3711 * metadata even if we detach due to a disk failure! */
3712 if (!get_ldev_if_state(mdev, D_FAILED))
3713 return;
3714
Philipp Reisnerb411b362009-09-25 16:07:19 -07003715 mutex_lock(&mdev->md_io_mutex);
3716 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3717 memset(buffer, 0, 512);
3718
3719 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3720 for (i = UI_CURRENT; i < UI_SIZE; i++)
3721 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3722 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3723 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3724
3725 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3726 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3727 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3728 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3729 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3730
3731 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003732 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003733
3734 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3735 sector = mdev->ldev->md.md_offset;
3736
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003737 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003738 /* this was only a try anyway ... */
3739 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003740 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003741 }
3742
3743 /* Update mdev->ldev->md.la_size_sect,
3744 * since we just wrote it to the on-disk metadata. */
3745 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3746
3747 mutex_unlock(&mdev->md_io_mutex);
3748 put_ldev(mdev);
3749}
3750
3751/**
3752 * drbd_md_read() - Reads in the meta data super block
3753 * @mdev: DRBD device.
3754 * @bdev: Device from which the meta data should be read in.
3755 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003756 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003757 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3758 */
3759int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3760{
3761 struct meta_data_on_disk *buffer;
3762 int i, rv = NO_ERROR;
3763
3764 if (!get_ldev_if_state(mdev, D_ATTACHING))
3765 return ERR_IO_MD_DISK;
3766
Philipp Reisnerb411b362009-09-25 16:07:19 -07003767 mutex_lock(&mdev->md_io_mutex);
3768 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3769
3770 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003771 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003772 called BEFORE disk is attached */
3773 dev_err(DEV, "Error while reading metadata.\n");
3774 rv = ERR_IO_MD_DISK;
3775 goto err;
3776 }
3777
3778 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3779 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3780 rv = ERR_MD_INVALID;
3781 goto err;
3782 }
3783 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3784 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3785 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3786 rv = ERR_MD_INVALID;
3787 goto err;
3788 }
3789 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3790 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3791 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3792 rv = ERR_MD_INVALID;
3793 goto err;
3794 }
3795 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3796 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3797 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3798 rv = ERR_MD_INVALID;
3799 goto err;
3800 }
3801
3802 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3803 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3804 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3805 rv = ERR_MD_INVALID;
3806 goto err;
3807 }
3808
3809 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3810 for (i = UI_CURRENT; i < UI_SIZE; i++)
3811 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3812 bdev->md.flags = be32_to_cpu(buffer->flags);
3813 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3814 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3815
Philipp Reisner99432fc2011-05-20 16:39:13 +02003816 spin_lock_irq(&mdev->req_lock);
3817 if (mdev->state.conn < C_CONNECTED) {
3818 int peer;
3819 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3820 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3821 mdev->peer_max_bio_size = peer;
3822 }
3823 spin_unlock_irq(&mdev->req_lock);
3824
Philipp Reisnerb411b362009-09-25 16:07:19 -07003825 if (mdev->sync_conf.al_extents < 7)
3826 mdev->sync_conf.al_extents = 127;
3827
3828 err:
3829 mutex_unlock(&mdev->md_io_mutex);
3830 put_ldev(mdev);
3831
3832 return rv;
3833}
3834
3835/**
3836 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3837 * @mdev: DRBD device.
3838 *
3839 * Call this function if you change anything that should be written to
3840 * the meta-data super block. This function sets MD_DIRTY and arms a
3841 * timer that ensures drbd_md_sync() gets called within five seconds.
3842 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003843#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003844void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3845{
3846 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3847 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3848 mdev->last_md_mark_dirty.line = line;
3849 mdev->last_md_mark_dirty.func = func;
3850 }
3851}
3852#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003853void drbd_md_mark_dirty(struct drbd_conf *mdev)
3854{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003855 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003856 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003857}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003858#endif
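
/*
 * Typical usage (illustrative sketch): modify the in-core meta data, then
 * either mark it dirty -- which arms md_sync_timer so that the worker calls
 * drbd_md_sync() via w_md_sync() a few seconds later -- or write it out
 * right away:
 *
 *	mdev->ldev->md.flags |= MDF_FULL_SYNC;	(hypothetical change)
 *	drbd_md_mark_dirty(mdev);		(lazy: worker syncs it soon)
 *
 *	drbd_md_sync(mdev);			(eager: write it out now)
 *
 * _drbd_uuid_set() and drbd_md_set_flag() below use the lazy variant, while
 * drbd_uuid_new_current() calls drbd_md_sync() directly to get the new UUID
 * to stable storage immediately.
 */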
Philipp Reisnerb411b362009-09-25 16:07:19 -07003859
3860static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3861{
3862 int i;
3863
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003864 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003865 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003866}
3867
3868void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3869{
3870 if (idx == UI_CURRENT) {
3871 if (mdev->state.role == R_PRIMARY)
3872 val |= 1;
3873 else
3874 val &= ~((u64)1);
3875
3876 drbd_set_ed_uuid(mdev, val);
3877 }
3878
3879 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003880 drbd_md_mark_dirty(mdev);
3881}
3882
3883
3884void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3885{
3886 if (mdev->ldev->md.uuid[idx]) {
3887 drbd_uuid_move_history(mdev);
3888 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003889 }
3890 _drbd_uuid_set(mdev, idx, val);
3891}
3892
3893/**
3894 * drbd_uuid_new_current() - Creates a new current UUID
3895 * @mdev: DRBD device.
3896 *
3897 * Creates a new current UUID, and rotates the old current UUID into
3898 * the bitmap slot. Causes an incremental resync upon next connect.
3899 */
3900void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3901{
3902 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003903 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003904
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003905 if (bm_uuid)
3906 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3907
Philipp Reisnerb411b362009-09-25 16:07:19 -07003908 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003909
3910 get_random_bytes(&val, sizeof(u64));
3911 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003912 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003913 /* get it to stable storage _now_ */
3914 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003915}
3916
3917void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3918{
3919 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3920 return;
3921
3922 if (val == 0) {
3923 drbd_uuid_move_history(mdev);
3924 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3925 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003926 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003927 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3928 if (bm_uuid)
3929 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003930
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003931 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003932 }
3933 drbd_md_mark_dirty(mdev);
3934}
3935
3936/**
3937 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3938 * @mdev: DRBD device.
3939 *
3940 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3941 */
3942int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3943{
3944 int rv = -EIO;
3945
3946 if (get_ldev_if_state(mdev, D_ATTACHING)) {
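		/* Set MDF_FULL_SYNC and sync the meta data to disk *before*
		 * writing the bitmap, and clear it only after the bitmap
		 * write succeeded; presumably so that a crash in between
		 * leaves the flag set on disk and a full sync is still
		 * enforced later. */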
3947 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3948 drbd_md_sync(mdev);
3949 drbd_bm_set_all(mdev);
3950
3951 rv = drbd_bm_write(mdev);
3952
3953 if (!rv) {
3954 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3955 drbd_md_sync(mdev);
3956 }
3957
3958 put_ldev(mdev);
3959 }
3960
3961 return rv;
3962}
3963
3964/**
3965 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3966 * @mdev: DRBD device.
3967 *
3968 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3969 */
3970int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3971{
3972 int rv = -EIO;
3973
Philipp Reisner07782862010-08-31 12:00:50 +02003974 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003975 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3976 drbd_bm_clear_all(mdev);
3977 rv = drbd_bm_write(mdev);
3978 put_ldev(mdev);
3979 }
3980
3981 return rv;
3982}
3983
3984static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3985{
3986 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003987 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003988
3989 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3990
Lars Ellenberg02851e92010-12-16 14:47:39 +01003991 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003992 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003993 rv = work->io_fn(mdev);
3994 drbd_bm_unlock(mdev);
3995 put_ldev(mdev);
3996 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003997
3998 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003999 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07004000 wake_up(&mdev->misc_wait);
4001
4002 if (work->done)
4003 work->done(mdev, rv);
4004
4005 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4006 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004007 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004008
4009 return 1;
4010}
4011
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004012void drbd_ldev_destroy(struct drbd_conf *mdev)
4013{
4014 lc_destroy(mdev->resync);
4015 mdev->resync = NULL;
4016 lc_destroy(mdev->act_log);
4017 mdev->act_log = NULL;
4018 __no_warn(local,
4019 drbd_free_bc(mdev->ldev);
4020 mdev->ldev = NULL;);
4021
4022 if (mdev->md_io_tmpp) {
4023 __free_page(mdev->md_io_tmpp);
4024 mdev->md_io_tmpp = NULL;
4025 }
4026 clear_bit(GO_DISKLESS, &mdev->flags);
4027}
4028
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004029static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4030{
4031 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02004032 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4033 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004034 * the protected members anymore, though, so once local_cnt drops to zero
4035 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004036 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004037 return 1;
4038}
4039
4040void drbd_go_diskless(struct drbd_conf *mdev)
4041{
4042 D_ASSERT(mdev->state.disk == D_FAILED);
4043 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02004044 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004045}
4046
Philipp Reisnerb411b362009-09-25 16:07:19 -07004047/**
4048 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4049 * @mdev: DRBD device.
4050 * @io_fn: IO callback to be called when bitmap IO is possible
4051 * @done: callback to be called after the bitmap IO was performed
4052 * @why: Descriptive text of the reason for doing the IO
 * @flags: Bitmap locking flags (enum bm_flag), passed on to drbd_bm_lock()
4053 *
4054 * While IO on the bitmap happens we freeze application IO, thus ensuring
4055 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4056 * called from worker context. It MUST NOT be used while a previous such
4057 * work is still pending!
4058 */
4059void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4060 int (*io_fn)(struct drbd_conf *),
4061 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004062 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004063{
4064 D_ASSERT(current == mdev->worker.task);
4065
4066 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4067 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4068 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4069 if (mdev->bm_io_work.why)
4070 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4071 why, mdev->bm_io_work.why);
4072
4073 mdev->bm_io_work.io_fn = io_fn;
4074 mdev->bm_io_work.done = done;
4075 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004076 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004077
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004078 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004079 set_bit(BITMAP_IO, &mdev->flags);
4080 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004081 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004082 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004083 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004084 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004085}
4086
4087/**
4088 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4089 * @mdev: DRBD device.
4090 * @io_fn: IO callback to be called when bitmap IO is possible
4091 * @why: Descriptive text of the reason for doing the IO
 * @flags: Bitmap locking flags (enum bm_flag), passed on to drbd_bm_lock()
4092 *
4093 * Freezes application IO (unless @flags contains BM_LOCKED_SET_ALLOWED)
4094 * while the actual IO operation runs. This function MAY NOT be called
 * from worker context.
4095 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004096int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4097 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004098{
4099 int rv;
4100
4101 D_ASSERT(current != mdev->worker.task);
4102
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004103 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4104 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004105
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004106 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004107 rv = io_fn(mdev);
4108 drbd_bm_unlock(mdev);
4109
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004110 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4111 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004112
4113 return rv;
4114}
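
/*
 * Usage sketch (illustrative, not taken from a real call site): from the
 * worker thread the queued variant must be used, from any other context the
 * synchronous one.  With the io_fn helpers defined above this could look
 * like:
 *
 *	outside worker context -- run it synchronously:
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			"set_n_write from example", BM_LOCKED_SET_ALLOWED);
 *
 *	from worker context -- queue it, get notified through done():
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, example_done,
 *			"set_n_write from example", BM_LOCKED_SET_ALLOWED);
 *
 * "example_done" stands for some void (*)(struct drbd_conf *, int)
 * completion callback and is purely hypothetical; the flag is just one
 * member of enum bm_flag referenced in this file, real callers pick the
 * locking restrictions that match their io_fn.
 */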
4115
4116void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4117{
4118 if ((mdev->ldev->md.flags & flag) != flag) {
4119 drbd_md_mark_dirty(mdev);
4120 mdev->ldev->md.flags |= flag;
4121 }
4122}
4123
4124void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4125{
4126 if ((mdev->ldev->md.flags & flag) != 0) {
4127 drbd_md_mark_dirty(mdev);
4128 mdev->ldev->md.flags &= ~flag;
4129 }
4130}
4131int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4132{
4133 return (bdev->md.flags & flag) != 0;
4134}
4135
4136static void md_sync_timer_fn(unsigned long data)
4137{
4138 struct drbd_conf *mdev = (struct drbd_conf *) data;
4139
4140 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4141}
4142
4143static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4144{
4145 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004146#ifdef DEBUG
4147 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4148 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4149#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004150 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004151 return 1;
4152}
4153
4154#ifdef CONFIG_DRBD_FAULT_INJECTION
4155/* Fault insertion support including random number generator shamelessly
4156 * stolen from kernel/rcutorture.c */
4157struct fault_random_state {
4158 unsigned long state;
4159 unsigned long count;
4160};
4161
4162#define FAULT_RANDOM_MULT 39916801 /* prime */
4163#define FAULT_RANDOM_ADD 479001701 /* prime */
4164#define FAULT_RANDOM_REFRESH 10000
4165
4166/*
4167 * Crude but fast random-number generator. Uses a linear congruential
4168 * generator, with occasional help from get_random_bytes().
4169 */
4170static unsigned long
4171_drbd_fault_random(struct fault_random_state *rsp)
4172{
4173 long refresh;
4174
Roel Kluin49829ea2009-12-15 22:55:44 +01004175 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004176 get_random_bytes(&refresh, sizeof(refresh));
4177 rsp->state += refresh;
4178 rsp->count = FAULT_RANDOM_REFRESH;
4179 }
4180 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4181 return swahw32(rsp->state);
4182}
4183
4184static char *
4185_drbd_fault_str(unsigned int type) {
4186 static char *_faults[] = {
4187 [DRBD_FAULT_MD_WR] = "Meta-data write",
4188 [DRBD_FAULT_MD_RD] = "Meta-data read",
4189 [DRBD_FAULT_RS_WR] = "Resync write",
4190 [DRBD_FAULT_RS_RD] = "Resync read",
4191 [DRBD_FAULT_DT_WR] = "Data write",
4192 [DRBD_FAULT_DT_RD] = "Data read",
4193 [DRBD_FAULT_DT_RA] = "Data read ahead",
4194 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004195 [DRBD_FAULT_AL_EE] = "EE allocation",
4196 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004197 };
4198
4199 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4200}
4201
4202unsigned int
4203_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4204{
4205 static struct fault_random_state rrs = {0, 0};
4206
4207 unsigned int ret = (
4208 (fault_devs == 0 ||
4209 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4210 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4211
4212 if (ret) {
4213 fault_count++;
4214
Lars Ellenberg73835062010-05-27 11:51:56 +02004215 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004216 dev_warn(DEV, "***Simulating %s failure\n",
4217 _drbd_fault_str(type));
4218 }
4219
4220 return ret;
4221}
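
/*
 * Worked example (sketch): with fault_rate = 10, the expression above lets
 * each eligible operation fail with a probability of roughly 10%, since
 * (random % 100) + 1 is (approximately) uniform in 1..100 and compared
 * against fault_rate.  fault_devs == 0 means "all devices"; otherwise it is
 * a bitmask of minor numbers, e.g. fault_devs = 1 << 2 restricts fault
 * injection to minor 2.  The type argument only selects which description
 * from _drbd_fault_str() gets printed; the call sites decide where faults
 * can actually be injected.
 */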
4222#endif
4223
4224const char *drbd_buildtag(void)
4225{
4226 /* When DRBD is built from external sources, this holds a reference to
4227 the git hash of that source code. */
4228
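	/* The initial "\0uilt-in" is deliberate: the leading NUL marks the
	 * buffer as not yet initialized.  On first use it is either
	 * overwritten with the module's srcversion, or the first byte is
	 * set to 'b', which turns the string into "built-in". */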
4229 static char buildtag[38] = "\0uilt-in";
4230
4231 if (buildtag[0] == 0) {
4232#ifdef CONFIG_MODULES
4233 if (THIS_MODULE != NULL)
4234 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4235 else
4236#endif
4237 buildtag[0] = 'b';
4238 }
4239
4240 return buildtag;
4241}
4242
4243module_init(drbd_init)
4244module_exit(drbd_cleanup)
4245
Philipp Reisnerb411b362009-09-25 16:07:19 -07004246EXPORT_SYMBOL(drbd_conn_str);
4247EXPORT_SYMBOL(drbd_role_str);
4248EXPORT_SYMBOL(drbd_disk_str);
4249EXPORT_SYMBOL(drbd_set_st_err_str);