/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
        struct drbd_work w;
        union drbd_state os;
        union drbd_state ns;
        enum chg_state_flags flags;
        struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                           union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
              "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = 32;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

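/*
 * For example, when drbd is built as a module these can be set at load
 * time:
 *     modprobe drbd minor_count=64 usermode_helper=/sbin/drbdadm
 * When built into the kernel, the same knob becomes the boot parameter
 * drbd.minor_count=64, as noted in the comment above module_param(minor_count).
 */
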
/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;       /* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
         member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
        .owner =   THIS_MODULE,
        .open =    drbd_open,
        .release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
        int io_allowed;

        atomic_inc(&mdev->local_cnt);
        io_allowed = (mdev->state.disk >= mins);
        if (!io_allowed) {
                if (atomic_dec_and_test(&mdev->local_cnt))
                        wake_up(&mdev->misc_wait);
        }
        return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular, doubly linked list of requests
 * attached.
 */
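/*
 * Illustration of the layout described above: following the ->next
 * pointers,
 *
 *   mdev->oldest_tle -> [epoch] -> [epoch] -> [epoch] == mdev->newest_tle
 *                                                        (->next is NULL)
 *
 * and each epoch object carries its own list of struct drbd_request on
 * its ->requests list head.
 */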
static int tl_init(struct drbd_conf *mdev)
{
        struct drbd_tl_epoch *b;

        /* during device minor initialization, we may well use GFP_KERNEL */
        b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
        if (!b)
                return 0;
        INIT_LIST_HEAD(&b->requests);
        INIT_LIST_HEAD(&b->w.list);
        b->next = NULL;
        b->br_number = 4711;
        b->n_writes = 0;
        b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

        mdev->oldest_tle = b;
        mdev->newest_tle = b;
        INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

        mdev->tl_hash = NULL;
        mdev->tl_hash_s = 0;

        return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
        D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
        D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
        kfree(mdev->oldest_tle);
        mdev->oldest_tle = NULL;
        kfree(mdev->unused_spare_tle);
        mdev->unused_spare_tle = NULL;
        kfree(mdev->tl_hash);
        mdev->tl_hash = NULL;
        mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:       DRBD device.
 * @new:        Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
        struct drbd_tl_epoch *newest_before;

        INIT_LIST_HEAD(&new->requests);
        INIT_LIST_HEAD(&new->w.list);
        new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
        new->next = NULL;
        new->n_writes = 0;

        newest_before = mdev->newest_tle;
        /* never send a barrier number == 0, because that is special-cased
         * when using TCQ for our write ordering code */
        new->br_number = (newest_before->br_number+1) ?: 1;
        if (mdev->newest_tle != new) {
                mdev->newest_tle->next = new;
                mdev->newest_tle = new;
        }
}
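/*
 * Worked example for the br_number assignment above: with the initial
 * value from tl_init(), 4711 + 1 simply yields 4712.  Should the counter
 * ever wrap so that br_number + 1 == 0, the GNU "?:" short-hand
 * substitutes 1, so a barrier number of 0 is never handed out.
 */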

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:       DRBD device.
 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
 * @set_size:   Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch object, this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
                unsigned int set_size)
{
        struct drbd_tl_epoch *b, *nob; /* next old barrier */
        struct list_head *le, *tle;
        struct drbd_request *r;

        spin_lock_irq(&mdev->req_lock);

        b = mdev->oldest_tle;

        /* first some paranoia code */
        if (b == NULL) {
                dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
                        barrier_nr);
                goto bail;
        }
        if (b->br_number != barrier_nr) {
                dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
                        barrier_nr, b->br_number);
                goto bail;
        }
        if (b->n_writes != set_size) {
                dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
                        barrier_nr, set_size, b->n_writes);
                goto bail;
        }

        /* Clean up list of requests processed during current epoch */
        list_for_each_safe(le, tle, &b->requests) {
                r = list_entry(le, struct drbd_request, tl_requests);
                _req_mod(r, barrier_acked);
        }
        /* There could be requests on the list waiting for completion
           of the write to the local disk. To avoid corruptions of
           slab's data structures we have to remove the list's head.

           Also there could have been a barrier ack out of sequence, overtaking
           the write acks - which would be a bug and violating write ordering.
           To not deadlock in case we lose connection while such requests are
           still pending, we need some way to find them for the
           _req_mod(connection_lost_while_pending).

           These have been list_move'd to the out_of_sequence_requests list in
           _req_mod(, barrier_acked) above.
           */
        list_del_init(&b->requests);

        nob = b->next;
        if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
                _tl_add_barrier(mdev, b);
                if (nob)
                        mdev->oldest_tle = nob;
                /* if nob == NULL b was the only barrier, and becomes the new
                   barrier. Therefore mdev->oldest_tle points already to b */
        } else {
                D_ASSERT(nob != NULL);
                mdev->oldest_tle = nob;
                kfree(b);
        }

        spin_unlock_irq(&mdev->req_lock);
        dec_ap_pending(mdev);

        return;

bail:
        spin_unlock_irq(&mdev->req_lock);
        drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
337
Philipp Reisner11b58e72010-05-12 17:08:26 +0200338/**
339 * _tl_restart() - Walks the transfer log, and applies an action to all requests
340 * @mdev: DRBD device.
341 * @what: The action/event to perform with all request objects
342 *
343 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
344 * restart_frozen_disk_io.
345 */
346static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
347{
348 struct drbd_tl_epoch *b, *tmp, **pn;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200349 struct list_head *le, *tle, carry_reads;
Philipp Reisner11b58e72010-05-12 17:08:26 +0200350 struct drbd_request *req;
351 int rv, n_writes, n_reads;
352
353 b = mdev->oldest_tle;
354 pn = &mdev->oldest_tle;
355 while (b) {
356 n_writes = 0;
357 n_reads = 0;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200358 INIT_LIST_HEAD(&carry_reads);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200359 list_for_each_safe(le, tle, &b->requests) {
360 req = list_entry(le, struct drbd_request, tl_requests);
361 rv = _req_mod(req, what);
362
363 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
364 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
365 }
366 tmp = b->next;
367
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200368 if (n_writes) {
Philipp Reisner11b58e72010-05-12 17:08:26 +0200369 if (what == resend) {
370 b->n_writes = n_writes;
371 if (b->w.cb == NULL) {
372 b->w.cb = w_send_barrier;
373 inc_ap_pending(mdev);
374 set_bit(CREATE_BARRIER, &mdev->flags);
375 }
376
377 drbd_queue_work(&mdev->data.work, &b->w);
378 }
379 pn = &b->next;
380 } else {
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200381 if (n_reads)
382 list_add(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200383 /* there could still be requests on that ring list,
384 * in case local io is still pending */
385 list_del(&b->requests);
386
387 /* dec_ap_pending corresponding to queue_barrier.
388 * the newest barrier may not have been queued yet,
389 * in which case w.cb is still NULL. */
390 if (b->w.cb != NULL)
391 dec_ap_pending(mdev);
392
393 if (b == mdev->newest_tle) {
394 /* recycle, but reinit! */
395 D_ASSERT(tmp == NULL);
396 INIT_LIST_HEAD(&b->requests);
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200397 list_splice(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200398 INIT_LIST_HEAD(&b->w.list);
399 b->w.cb = NULL;
400 b->br_number = net_random();
401 b->n_writes = 0;
402
403 *pn = b;
404 break;
405 }
406 *pn = tmp;
407 kfree(b);
408 }
409 b = tmp;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200410 list_splice(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200411 }
412}
413
Philipp Reisnerb411b362009-09-25 16:07:19 -0700414
415/**
416 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
417 * @mdev: DRBD device.
418 *
419 * This is called after the connection to the peer was lost. The storage covered
420 * by the requests on the transfer gets marked as our of sync. Called from the
421 * receiver thread and the worker thread.
422 */
423void tl_clear(struct drbd_conf *mdev)
424{
Philipp Reisnerb411b362009-09-25 16:07:19 -0700425 struct list_head *le, *tle;
426 struct drbd_request *r;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700427
428 spin_lock_irq(&mdev->req_lock);
429
Philipp Reisner11b58e72010-05-12 17:08:26 +0200430 _tl_restart(mdev, connection_lost_while_pending);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700431
432 /* we expect this list to be empty. */
433 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
434
435 /* but just in case, clean it up anyways! */
436 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
437 r = list_entry(le, struct drbd_request, tl_requests);
438 /* It would be nice to complete outside of spinlock.
439 * But this is easier for now. */
440 _req_mod(r, connection_lost_while_pending);
441 }
442
443 /* ensure bit indicating barrier is required is clear */
444 clear_bit(CREATE_BARRIER, &mdev->flags);
445
Philipp Reisner288f4222010-05-27 15:07:43 +0200446 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
447
Philipp Reisnerb411b362009-09-25 16:07:19 -0700448 spin_unlock_irq(&mdev->req_lock);
449}
450
Philipp Reisner11b58e72010-05-12 17:08:26 +0200451void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
452{
453 spin_lock_irq(&mdev->req_lock);
454 _tl_restart(mdev, what);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700455 spin_unlock_irq(&mdev->req_lock);
456}
457
458/**
459 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
460 * @mdev: DRBD device.
461 * @os: old (current) state.
462 * @ns: new (wanted) state.
463 */
464static int cl_wide_st_chg(struct drbd_conf *mdev,
465 union drbd_state os, union drbd_state ns)
466{
467 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
468 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
469 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
470 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
471 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
472 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
473 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
474}
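/*
 * For example, promoting to R_PRIMARY while connected (os.conn and
 * ns.conn both >= C_CONNECTED) is cluster wide and needs agreement from
 * the peer, whereas changing the local disk state while C_STANDALONE
 * matches none of the clauses above and stays a purely local change.
 */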

int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
                      union drbd_state mask, union drbd_state val)
{
        unsigned long flags;
        union drbd_state os, ns;
        int rv;

        spin_lock_irqsave(&mdev->req_lock, flags);
        os = mdev->state;
        ns.i = (os.i & ~mask.i) | val.i;
        rv = _drbd_set_state(mdev, ns, f, NULL);
        ns = mdev->state;
        spin_unlock_irqrestore(&mdev->req_lock, flags);

        return rv;
}
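/*
 * The mask/val pair works per bit field of union drbd_state: a caller
 * passing e.g. NS(conn, C_DISCONNECTING) (as done elsewhere in this file)
 * supplies a mask covering only the conn field plus the new value, so
 *     ns.i = (os.i & ~mask.i) | val.i;
 * rewrites just that field and leaves role, disk, pdsk etc. untouched.
 */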

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:       DRBD device.
 * @mask:       mask of state bits to change.
 * @val:        value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
        union drbd_state mask, union drbd_state val)
{
        drbd_change_state(mdev, CS_HARD, mask, val);
}

static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *,
                                     union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
                                       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
                        union drbd_state, union drbd_state);

static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
                                              union drbd_state mask, union drbd_state val)
{
        union drbd_state os, ns;
        unsigned long flags;
        int rv;

        if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
                return SS_CW_SUCCESS;

        if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
                return SS_CW_FAILED_BY_PEER;

        rv = 0;
        spin_lock_irqsave(&mdev->req_lock, flags);
        os = mdev->state;
        ns.i = (os.i & ~mask.i) | val.i;
        ns = sanitize_state(mdev, os, ns, NULL);

        if (!cl_wide_st_chg(mdev, os, ns))
                rv = SS_CW_NO_NEED;
        if (!rv) {
                rv = is_valid_state(mdev, ns);
                if (rv == SS_SUCCESS) {
                        rv = is_valid_state_transition(mdev, ns, os);
                        if (rv == SS_SUCCESS)
                                rv = 0; /* cont waiting, otherwise fail. */
                }
        }
        spin_unlock_irqrestore(&mdev->req_lock, flags);

        return rv;
}

/**
 * drbd_req_state() - Perform a possibly cluster wide state change
 * @mdev:       DRBD device.
 * @mask:       mask of state bits to change.
 * @val:        value of new state bits.
 * @f:          flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static int drbd_req_state(struct drbd_conf *mdev,
                          union drbd_state mask, union drbd_state val,
                          enum chg_state_flags f)
{
        struct completion done;
        unsigned long flags;
        union drbd_state os, ns;
        int rv;

        init_completion(&done);

        if (f & CS_SERIALIZE)
                mutex_lock(&mdev->state_mutex);

        spin_lock_irqsave(&mdev->req_lock, flags);
        os = mdev->state;
        ns.i = (os.i & ~mask.i) | val.i;
        ns = sanitize_state(mdev, os, ns, NULL);

        if (cl_wide_st_chg(mdev, os, ns)) {
                rv = is_valid_state(mdev, ns);
                if (rv == SS_SUCCESS)
                        rv = is_valid_state_transition(mdev, ns, os);
                spin_unlock_irqrestore(&mdev->req_lock, flags);

                if (rv < SS_SUCCESS) {
                        if (f & CS_VERBOSE)
                                print_st_err(mdev, os, ns, rv);
                        goto abort;
                }

                drbd_state_lock(mdev);
                if (!drbd_send_state_req(mdev, mask, val)) {
                        drbd_state_unlock(mdev);
                        rv = SS_CW_FAILED_BY_PEER;
                        if (f & CS_VERBOSE)
                                print_st_err(mdev, os, ns, rv);
                        goto abort;
                }

                wait_event(mdev->state_wait,
                        (rv = _req_st_cond(mdev, mask, val)));

                if (rv < SS_SUCCESS) {
                        drbd_state_unlock(mdev);
                        if (f & CS_VERBOSE)
                                print_st_err(mdev, os, ns, rv);
                        goto abort;
                }
                spin_lock_irqsave(&mdev->req_lock, flags);
                os = mdev->state;
                ns.i = (os.i & ~mask.i) | val.i;
                rv = _drbd_set_state(mdev, ns, f, &done);
                drbd_state_unlock(mdev);
        } else {
                rv = _drbd_set_state(mdev, ns, f, &done);
        }

        spin_unlock_irqrestore(&mdev->req_lock, flags);

        if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
                D_ASSERT(current != mdev->worker.task);
                wait_for_completion(&done);
        }

abort:
        if (f & CS_SERIALIZE)
                mutex_unlock(&mdev->state_mutex);

        return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:       DRBD device.
 * @mask:       mask of state bits to change.
 * @val:        value of new state bits.
 * @f:          flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
                        union drbd_state val, enum chg_state_flags f)
{
        int rv;

        wait_event(mdev->state_wait,
                   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

        return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
        dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
            name,
            drbd_conn_str(ns.conn),
            drbd_role_str(ns.role),
            drbd_role_str(ns.peer),
            drbd_disk_str(ns.disk),
            drbd_disk_str(ns.pdsk),
            is_susp(ns) ? 's' : 'r',
            ns.aftr_isp ? 'a' : '-',
            ns.peer_isp ? 'p' : '-',
            ns.user_isp ? 'u' : '-'
            );
}

void print_st_err(struct drbd_conf *mdev,
        union drbd_state os, union drbd_state ns, int err)
{
        if (err == SS_IN_TRANSIENT_STATE)
                return;
        dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
        print_st(mdev, " state", os);
        print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:       DRBD device.
 * @ns:         State to consider.
 */
static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
        /* See drbd_state_sw_errors in drbd_strings.c */

        enum drbd_fencing_p fp;
        int rv = SS_SUCCESS;

        fp = FP_DONT_CARE;
        if (get_ldev(mdev)) {
                fp = mdev->ldev->dc.fencing;
                put_ldev(mdev);
        }

        if (get_net_conf(mdev)) {
                if (!mdev->net_conf->two_primaries &&
                    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
                        rv = SS_TWO_PRIMARIES;
                put_net_conf(mdev);
        }

        if (rv <= 0)
                /* already found a reason to abort */;
        else if (ns.role == R_SECONDARY && mdev->open_cnt)
                rv = SS_DEVICE_IN_USE;

        else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
                rv = SS_NO_UP_TO_DATE_DISK;

        else if (fp >= FP_RESOURCE &&
                 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
                rv = SS_PRIMARY_NOP;

        else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
                rv = SS_NO_UP_TO_DATE_DISK;

        else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
                rv = SS_NO_LOCAL_DISK;

        else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
                rv = SS_NO_REMOTE_DISK;

        else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
                rv = SS_NO_UP_TO_DATE_DISK;

        else if ((ns.conn == C_CONNECTED ||
                  ns.conn == C_WF_BITMAP_S ||
                  ns.conn == C_SYNC_SOURCE ||
                  ns.conn == C_PAUSED_SYNC_S) &&
                  ns.disk == D_OUTDATED)
                rv = SS_CONNECTED_OUTDATES;

        else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
                 (mdev->sync_conf.verify_alg[0] == 0))
                rv = SS_NO_VERIFY_ALG;

        else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
                  mdev->agreed_pro_version < 88)
                rv = SS_NOT_SUPPORTED;

        return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:       DRBD device.
 * @ns:         new state.
 * @os:         old state.
 */
static int is_valid_state_transition(struct drbd_conf *mdev,
                                     union drbd_state ns, union drbd_state os)
{
        int rv = SS_SUCCESS;

        if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
            os.conn > C_CONNECTED)
                rv = SS_RESYNC_RUNNING;

        if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
                rv = SS_ALREADY_STANDALONE;

        if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
                rv = SS_IS_DISKLESS;

        if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
                rv = SS_NO_NET_CONFIG;

        if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
                rv = SS_LOWER_THAN_OUTDATED;

        if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
                rv = SS_IN_TRANSIENT_STATE;

        if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
                rv = SS_IN_TRANSIENT_STATE;

        if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
                rv = SS_NEED_CONNECTION;

        if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
            ns.conn != os.conn && os.conn > C_CONNECTED)
                rv = SS_RESYNC_RUNNING;

        if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
            os.conn < C_CONNECTED)
                rv = SS_NEED_CONNECTION;

        if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
            && os.conn < C_WF_REPORT_PARAMS)
                rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

        return rv;
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:       DRBD device.
 * @os:         old state.
 * @ns:         new state.
 * @warn_sync_abort:
 *
 * When we lose the connection, we have to set the state of the peer's disk
 * (pdsk) to D_UNKNOWN. This rule and many more along those lines are in this
 * function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
                                       union drbd_state ns, const char **warn_sync_abort)
{
        enum drbd_fencing_p fp;
        enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

        fp = FP_DONT_CARE;
        if (get_ldev(mdev)) {
                fp = mdev->ldev->dc.fencing;
                put_ldev(mdev);
        }

        /* Do not allow network errors to configure a device's network part */
        if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
            os.conn <= C_DISCONNECTING)
                ns.conn = os.conn;

        /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
         * If you try to go into some Sync* state, that shall fail (elsewhere). */
        if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
            ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
                ns.conn = os.conn;

        /* we cannot fail (again) if we already detached */
        if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
                ns.disk = D_DISKLESS;

        /* if we are only D_ATTACHING yet,
         * we can (and should) go directly to D_DISKLESS. */
        if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
                ns.disk = D_DISKLESS;

        /* After C_DISCONNECTING only C_STANDALONE may follow */
        if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
                ns.conn = os.conn;

        if (ns.conn < C_CONNECTED) {
                ns.peer_isp = 0;
                ns.peer = R_UNKNOWN;
                if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
                        ns.pdsk = D_UNKNOWN;
        }

        /* Clear the aftr_isp when becoming unconfigured */
        if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
                ns.aftr_isp = 0;

        /* Abort resync if a disk fails/detaches */
        if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
            (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
                if (warn_sync_abort)
                        *warn_sync_abort =
                                os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
                                "Online-verify" : "Resync";
                ns.conn = C_CONNECTED;
        }

        /* Connection breaks down before we finished "Negotiating" */
        if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
            get_ldev_if_state(mdev, D_NEGOTIATING)) {
                if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
                        ns.disk = mdev->new_state_tmp.disk;
                        ns.pdsk = mdev->new_state_tmp.pdsk;
                } else {
                        dev_alert(DEV, "Connection lost while negotiating, no data!\n");
                        ns.disk = D_DISKLESS;
                        ns.pdsk = D_UNKNOWN;
                }
                put_ldev(mdev);
        }

        /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
        if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
                if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
                        ns.disk = D_UP_TO_DATE;
                if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
                        ns.pdsk = D_UP_TO_DATE;
        }

        /* Implications of the connection state on the disk states */
        disk_min = D_DISKLESS;
        disk_max = D_UP_TO_DATE;
        pdsk_min = D_INCONSISTENT;
        pdsk_max = D_UNKNOWN;
        switch ((enum drbd_conns)ns.conn) {
        case C_WF_BITMAP_T:
        case C_PAUSED_SYNC_T:
        case C_STARTING_SYNC_T:
        case C_WF_SYNC_UUID:
        case C_BEHIND:
                disk_min = D_INCONSISTENT;
                disk_max = D_OUTDATED;
                pdsk_min = D_UP_TO_DATE;
                pdsk_max = D_UP_TO_DATE;
                break;
        case C_VERIFY_S:
        case C_VERIFY_T:
                disk_min = D_UP_TO_DATE;
                disk_max = D_UP_TO_DATE;
                pdsk_min = D_UP_TO_DATE;
                pdsk_max = D_UP_TO_DATE;
                break;
        case C_CONNECTED:
                disk_min = D_DISKLESS;
                disk_max = D_UP_TO_DATE;
                pdsk_min = D_DISKLESS;
                pdsk_max = D_UP_TO_DATE;
                break;
        case C_WF_BITMAP_S:
        case C_PAUSED_SYNC_S:
        case C_STARTING_SYNC_S:
        case C_AHEAD:
                disk_min = D_UP_TO_DATE;
                disk_max = D_UP_TO_DATE;
                pdsk_min = D_INCONSISTENT;
                pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary */
                break;
        case C_SYNC_TARGET:
                disk_min = D_INCONSISTENT;
                disk_max = D_INCONSISTENT;
                pdsk_min = D_UP_TO_DATE;
                pdsk_max = D_UP_TO_DATE;
                break;
        case C_SYNC_SOURCE:
                disk_min = D_UP_TO_DATE;
                disk_max = D_UP_TO_DATE;
                pdsk_min = D_INCONSISTENT;
                pdsk_max = D_INCONSISTENT;
                break;
        case C_STANDALONE:
        case C_DISCONNECTING:
        case C_UNCONNECTED:
        case C_TIMEOUT:
        case C_BROKEN_PIPE:
        case C_NETWORK_FAILURE:
        case C_PROTOCOL_ERROR:
        case C_TEAR_DOWN:
        case C_WF_CONNECTION:
        case C_WF_REPORT_PARAMS:
        case C_MASK:
                break;
        }
        if (ns.disk > disk_max)
                ns.disk = disk_max;

        if (ns.disk < disk_min) {
                dev_warn(DEV, "Implicitly set disk from %s to %s\n",
                         drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
                ns.disk = disk_min;
        }
        if (ns.pdsk > pdsk_max)
                ns.pdsk = pdsk_max;

        if (ns.pdsk < pdsk_min) {
                dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
                         drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
                ns.pdsk = pdsk_min;
        }

        if (fp == FP_STONITH &&
            (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
            !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
                ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

        if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
            (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
            !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
                ns.susp_nod = 1; /* Suspend IO while no data is accessible */

        if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
                if (ns.conn == C_SYNC_SOURCE)
                        ns.conn = C_PAUSED_SYNC_S;
                if (ns.conn == C_SYNC_TARGET)
                        ns.conn = C_PAUSED_SYNC_T;
        } else {
                if (ns.conn == C_PAUSED_SYNC_S)
                        ns.conn = C_SYNC_SOURCE;
                if (ns.conn == C_PAUSED_SYNC_T)
                        ns.conn = C_SYNC_TARGET;
        }

        return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
        if (mdev->agreed_pro_version < 90)
                mdev->ov_start_sector = 0;
        mdev->rs_total = drbd_bm_bits(mdev);
        mdev->ov_position = 0;
        if (cs == C_VERIFY_T) {
                /* starting online verify from an arbitrary position
                 * does not fit well into the existing protocol.
                 * on C_VERIFY_T, we initialize ov_left and friends
                 * implicitly in receive_DataRequest once the
                 * first P_OV_REQUEST is received */
                mdev->ov_start_sector = ~(sector_t)0;
        } else {
                unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
                if (bit >= mdev->rs_total) {
                        mdev->ov_start_sector =
                                BM_BIT_TO_SECT(mdev->rs_total - 1);
                        mdev->rs_total = 1;
                } else
                        mdev->rs_total -= bit;
                mdev->ov_position = mdev->ov_start_sector;
        }
        mdev->ov_left = mdev->rs_total;
}
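/*
 * Worked example, assuming DRBD's usual 4 KiB bitmap granularity
 * (8 sectors per bit): an ov_start_sector of 80 maps to bit 10, so
 * rs_total is reduced by the 10 bits that are skipped, ov_position
 * resumes at sector 80, and ov_left is set to the remaining rs_total.
 */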

static void drbd_resume_al(struct drbd_conf *mdev)
{
        if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
                dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:       DRBD device.
 * @ns:         new state.
 * @flags:      Flags
 * @done:       Optional completion, that will be completed after after_state_ch() has finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
int __drbd_set_state(struct drbd_conf *mdev,
                     union drbd_state ns, enum chg_state_flags flags,
                     struct completion *done)
{
        union drbd_state os;
        int rv = SS_SUCCESS;
        const char *warn_sync_abort = NULL;
        struct after_state_chg_work *ascw;

        os = mdev->state;

        ns = sanitize_state(mdev, os, ns, &warn_sync_abort);

        if (ns.i == os.i)
                return SS_NOTHING_TO_DO;

        if (!(flags & CS_HARD)) {
                /*  pre-state-change checks ; only look at ns  */
                /* See drbd_state_sw_errors in drbd_strings.c */

                rv = is_valid_state(mdev, ns);
                if (rv < SS_SUCCESS) {
                        /* If the old state was illegal as well, then let
                           this happen...*/

                        if (is_valid_state(mdev, os) == rv)
                                rv = is_valid_state_transition(mdev, ns, os);
                } else
                        rv = is_valid_state_transition(mdev, ns, os);
        }

        if (rv < SS_SUCCESS) {
                if (flags & CS_VERBOSE)
                        print_st_err(mdev, os, ns, rv);
                return rv;
        }

        if (warn_sync_abort)
                dev_warn(DEV, "%s aborted.\n", warn_sync_abort);

        {
        char *pbp, pb[300];
        pbp = pb;
        *pbp = 0;
        if (ns.role != os.role)
                pbp += sprintf(pbp, "role( %s -> %s ) ",
                               drbd_role_str(os.role),
                               drbd_role_str(ns.role));
        if (ns.peer != os.peer)
                pbp += sprintf(pbp, "peer( %s -> %s ) ",
                               drbd_role_str(os.peer),
                               drbd_role_str(ns.peer));
        if (ns.conn != os.conn)
                pbp += sprintf(pbp, "conn( %s -> %s ) ",
                               drbd_conn_str(os.conn),
                               drbd_conn_str(ns.conn));
        if (ns.disk != os.disk)
                pbp += sprintf(pbp, "disk( %s -> %s ) ",
                               drbd_disk_str(os.disk),
                               drbd_disk_str(ns.disk));
        if (ns.pdsk != os.pdsk)
                pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
                               drbd_disk_str(os.pdsk),
                               drbd_disk_str(ns.pdsk));
        if (is_susp(ns) != is_susp(os))
                pbp += sprintf(pbp, "susp( %d -> %d ) ",
                               is_susp(os),
                               is_susp(ns));
        if (ns.aftr_isp != os.aftr_isp)
                pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
                               os.aftr_isp,
                               ns.aftr_isp);
        if (ns.peer_isp != os.peer_isp)
                pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
                               os.peer_isp,
                               ns.peer_isp);
        if (ns.user_isp != os.user_isp)
                pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
                               os.user_isp,
                               ns.user_isp);
        dev_info(DEV, "%s\n", pb);
        }

        /* solve the race between becoming unconfigured,
         * worker doing the cleanup, and
         * admin reconfiguring us:
         * on (re)configure, first set CONFIG_PENDING,
         * then wait for a potentially exiting worker,
         * start the worker, and schedule one no_op.
         * then proceed with configuration.
         */
        if (ns.disk == D_DISKLESS &&
            ns.conn == C_STANDALONE &&
            ns.role == R_SECONDARY &&
            !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
                set_bit(DEVICE_DYING, &mdev->flags);

        /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
         * on the ldev here, to be sure the transition -> D_DISKLESS resp.
         * drbd_ldev_destroy() won't happen before our corresponding
         * after_state_ch works run, where we put_ldev again. */
        if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
            (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
                atomic_inc(&mdev->local_cnt);

        mdev->state = ns;
        wake_up(&mdev->misc_wait);
        wake_up(&mdev->state_wait);

        /* aborted verify run. log the last position */
        if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
            ns.conn < C_CONNECTED) {
                mdev->ov_start_sector =
                        BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
                dev_info(DEV, "Online Verify reached sector %llu\n",
                        (unsigned long long)mdev->ov_start_sector);
        }

        if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
            (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
                dev_info(DEV, "Syncer continues.\n");
                mdev->rs_paused += (long)jiffies
                                  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
                if (ns.conn == C_SYNC_TARGET)
                        mod_timer(&mdev->resync_timer, jiffies);
        }

        if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
            (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
                dev_info(DEV, "Resync suspended\n");
                mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
        }

        if (os.conn == C_CONNECTED &&
            (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
                unsigned long now = jiffies;
                int i;

                set_ov_position(mdev, ns.conn);
                mdev->rs_start = now;
                mdev->rs_last_events = 0;
                mdev->rs_last_sect_ev = 0;
                mdev->ov_last_oos_size = 0;
                mdev->ov_last_oos_start = 0;

                for (i = 0; i < DRBD_SYNC_MARKS; i++) {
                        mdev->rs_mark_left[i] = mdev->ov_left;
                        mdev->rs_mark_time[i] = now;
                }

                drbd_rs_controller_reset(mdev);

                if (ns.conn == C_VERIFY_S) {
                        dev_info(DEV, "Starting Online Verify from sector %llu\n",
                                        (unsigned long long)mdev->ov_position);
                        mod_timer(&mdev->resync_timer, jiffies);
                }
        }

        if (get_ldev(mdev)) {
                u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
                                                 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
                                                 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

                if (test_bit(CRASHED_PRIMARY, &mdev->flags))
                        mdf |= MDF_CRASHED_PRIMARY;
                if (mdev->state.role == R_PRIMARY ||
                    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
                        mdf |= MDF_PRIMARY_IND;
                if (mdev->state.conn > C_WF_REPORT_PARAMS)
                        mdf |= MDF_CONNECTED_IND;
                if (mdev->state.disk > D_INCONSISTENT)
                        mdf |= MDF_CONSISTENT;
                if (mdev->state.disk > D_OUTDATED)
                        mdf |= MDF_WAS_UP_TO_DATE;
                if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
                        mdf |= MDF_PEER_OUT_DATED;
                if (mdf != mdev->ldev->md.flags) {
                        mdev->ldev->md.flags = mdf;
                        drbd_md_mark_dirty(mdev);
                }
                if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
                        drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
                put_ldev(mdev);
        }

        /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
        if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
            os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
                set_bit(CONSIDER_RESYNC, &mdev->flags);

        /* Receiver should clean up itself */
        if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
                drbd_thread_stop_nowait(&mdev->receiver);

        /* Now the receiver finished cleaning up itself, it should die */
        if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
                drbd_thread_stop_nowait(&mdev->receiver);

        /* Upon network failure, we need to restart the receiver. */
        if (os.conn > C_TEAR_DOWN &&
            ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
                drbd_thread_restart_nowait(&mdev->receiver);

        /* Resume AL writing if we get a connection */
        if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
                drbd_resume_al(mdev);

        ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
        if (ascw) {
                ascw->os = os;
                ascw->ns = ns;
                ascw->flags = flags;
                ascw->w.cb = w_after_state_ch;
                ascw->done = done;
                drbd_queue_work(&mdev->data.work, &ascw->w);
        } else {
                dev_warn(DEV, "Could not kmalloc an ascw\n");
        }

        return rv;
}

static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
        struct after_state_chg_work *ascw =
                container_of(w, struct after_state_chg_work, w);
        after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
        if (ascw->flags & CS_WAIT_COMPLETE) {
                D_ASSERT(ascw->done != NULL);
                complete(ascw->done);
        }
        kfree(ascw);

        return 1;
}

static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
        if (rv) {
                dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
                _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
                return;
        }

        switch (mdev->state.conn) {
        case C_STARTING_SYNC_T:
                _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
                break;
        case C_STARTING_SYNC_S:
                drbd_start_resync(mdev, C_SYNC_SOURCE);
                break;
        }
}
1285
1286/**
1287 * after_state_ch() - Perform after state change actions that may sleep
1288 * @mdev: DRBD device.
1289 * @os: old state.
1290 * @ns: new state.
1291 * @flags: Flags
1292 */
1293static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1294 union drbd_state ns, enum chg_state_flags flags)
1295{
1296 enum drbd_fencing_p fp;
Philipp Reisner67098932010-06-24 16:24:25 +02001297 enum drbd_req_event what = nothing;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001298 union drbd_state nsm = (union drbd_state){ .i = -1 };
Philipp Reisnerb411b362009-09-25 16:07:19 -07001299
1300 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1301 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1302 if (mdev->p_uuid)
1303 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1304 }
1305
1306 fp = FP_DONT_CARE;
1307 if (get_ldev(mdev)) {
1308 fp = mdev->ldev->dc.fencing;
1309 put_ldev(mdev);
1310 }
1311
1312 /* Inform userspace about the change... */
1313 drbd_bcast_state(mdev, ns);
1314
1315 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1316 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1317 drbd_khelper(mdev, "pri-on-incon-degr");
1318
1319 /* Here we have the actions that are performed after a
1320 state change. This function might sleep */
1321
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001322 nsm.i = -1;
1323 if (ns.susp_nod) {
Philipp Reisner265be2d2010-05-31 10:14:17 +02001324 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
Philipp Reisner67098932010-06-24 16:24:25 +02001325 if (ns.conn == C_CONNECTED)
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001326 what = resend, nsm.susp_nod = 0;
Philipp Reisner67098932010-06-24 16:24:25 +02001327 else /* ns.conn > C_CONNECTED */
Philipp Reisner265be2d2010-05-31 10:14:17 +02001328 dev_err(DEV, "Unexpected Resynd going on!\n");
1329 }
1330
Philipp Reisner67098932010-06-24 16:24:25 +02001331 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001332 what = restart_frozen_disk_io, nsm.susp_nod = 0;
1333
Philipp Reisner265be2d2010-05-31 10:14:17 +02001334 }
1335
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001336 if (ns.susp_fen) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001337 /* case1: The outdate peer handler is successful: */
1338 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001339 tl_clear(mdev);
Philipp Reisner43a51822010-06-11 11:26:34 +02001340 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1341 drbd_uuid_new_current(mdev);
1342 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner43a51822010-06-11 11:26:34 +02001343 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001344 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001345 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001346 spin_unlock_irq(&mdev->req_lock);
1347 }
Philipp Reisner43a51822010-06-11 11:26:34 +02001348 /* case2: The connection was established again: */
1349 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1350 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner67098932010-06-24 16:24:25 +02001351 what = resend;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001352 nsm.susp_fen = 0;
Philipp Reisner43a51822010-06-11 11:26:34 +02001353 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001354 }
Philipp Reisner67098932010-06-24 16:24:25 +02001355
1356 if (what != nothing) {
1357 spin_lock_irq(&mdev->req_lock);
1358 _tl_restart(mdev, what);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001359 nsm.i &= mdev->state.i;
1360 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
Philipp Reisner67098932010-06-24 16:24:25 +02001361 spin_unlock_irq(&mdev->req_lock);
1362 }
1363
Philipp Reisnerb411b362009-09-25 16:07:19 -07001364 /* Do not change the order of the if above and the two below... */
1365 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1366 drbd_send_uuids(mdev);
1367 drbd_send_state(mdev);
1368 }
1369 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1370 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1371
1372 /* Lost contact to peer's copy of the data */
1373 if ((os.pdsk >= D_INCONSISTENT &&
1374 os.pdsk != D_UNKNOWN &&
1375 os.pdsk != D_OUTDATED)
1376 && (ns.pdsk < D_INCONSISTENT ||
1377 ns.pdsk == D_UNKNOWN ||
1378 ns.pdsk == D_OUTDATED)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001379 if (get_ldev(mdev)) {
1380 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001381 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001382 if (is_susp(mdev->state)) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001383 set_bit(NEW_CUR_UUID, &mdev->flags);
1384 } else {
1385 drbd_uuid_new_current(mdev);
1386 drbd_send_uuids(mdev);
1387 }
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001388 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001389 put_ldev(mdev);
1390 }
1391 }
1392
1393 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001394 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001395 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001396 drbd_send_uuids(mdev);
1397 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001398
1399 /* D_DISKLESS Peer becomes secondary */
1400 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1401 drbd_al_to_on_disk_bm(mdev);
1402 put_ldev(mdev);
1403 }
1404
1405 /* Last part of the attaching process ... */
1406 if (ns.conn >= C_CONNECTED &&
1407 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001408 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001409 drbd_send_uuids(mdev);
1410 drbd_send_state(mdev);
1411 }
1412
1413 /* We want to pause/continue resync, tell peer. */
1414 if (ns.conn >= C_CONNECTED &&
1415 ((os.aftr_isp != ns.aftr_isp) ||
1416 (os.user_isp != ns.user_isp)))
1417 drbd_send_state(mdev);
1418
1419 /* In case one of the isp bits got set, suspend other devices. */
1420 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1421 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1422 suspend_other_sg(mdev);
1423
1424 /* Make sure the peer gets informed about eventual state
1425 changes (ISP bits) while we were in WFReportParams. */
1426 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1427 drbd_send_state(mdev);
1428
Philipp Reisner67531712010-10-27 12:21:30 +02001429 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1430 drbd_send_state(mdev);
1431
Philipp Reisnerb411b362009-09-25 16:07:19 -07001432 /* We are in the progress to start a full sync... */
1433 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1434 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1435 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1436
1437 /* We are invalidating our self... */
1438 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1439 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1440 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1441
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001442 /* first half of local IO error, failure to attach,
1443 * or administrative detach */
1444 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1445 enum drbd_io_error_p eh;
1446 int was_io_error;
1447 /* corresponding get_ldev was in __drbd_set_state, to serialize
1448 * our cleanup here with the transition to D_DISKLESS,
1449 * so it is safe to dreference ldev here. */
1450 eh = mdev->ldev->dc.on_io_error;
1451 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1452
1453 /* current state still has to be D_FAILED,
1454 * there is only one way out: to D_DISKLESS,
1455 * and that may only happen after our put_ldev below. */
1456 if (mdev->state.disk != D_FAILED)
1457 dev_err(DEV,
1458 "ASSERT FAILED: disk is %s during detach\n",
1459 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001460
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001461 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001462 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001463 else
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001464 dev_err(DEV, "Sending state for detaching disk failed\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001465
1466 drbd_rs_cancel_all(mdev);
1467
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001468 /* In case we want to get something to stable storage still,
1469 * this may be the last chance.
1470 * Following put_ldev may transition to D_DISKLESS. */
1471 drbd_md_sync(mdev);
1472 put_ldev(mdev);
1473
1474 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001475 drbd_khelper(mdev, "local-io-error");
1476 }
1477
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001478 /* second half of local IO error, failure to attach,
1479 * or administrative detach,
1480 * after local_cnt references have reached zero again */
1481 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1482 /* We must still be diskless,
1483 * re-attach has to be serialized with this! */
1484 if (mdev->state.disk != D_DISKLESS)
1485 dev_err(DEV,
1486 "ASSERT FAILED: disk is %s while going diskless\n",
1487 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001488
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001489 mdev->rs_total = 0;
1490 mdev->rs_failed = 0;
1491 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001492
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001493 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001494 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001495 else
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001496 dev_err(DEV, "Sending state for being diskless failed\n");
1497 /* corresponding get_ldev in __drbd_set_state
1498 * this may finaly trigger drbd_ldev_destroy. */
1499 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001500 }
1501
1502 /* Disks got bigger while they were detached */
1503 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1504 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1505 if (ns.conn == C_CONNECTED)
1506 resync_after_online_grow(mdev);
1507 }
1508
1509 /* A resync finished or aborted, wake paused devices... */
1510 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1511 (os.peer_isp && !ns.peer_isp) ||
1512 (os.user_isp && !ns.user_isp))
1513 resume_next_sg(mdev);
1514
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001515 /* sync target done with resync. Explicitly notify peer, even though
1516	 * it should (at least for non-empty resyncs) already know this itself. */
1517 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1518 drbd_send_state(mdev);
1519
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001520	/* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001521 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001522 drbd_free_tl_hash(mdev);
1523
Philipp Reisnerb411b362009-09-25 16:07:19 -07001524 /* Upon network connection, we need to start the receiver */
1525 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1526 drbd_thread_start(&mdev->receiver);
1527
1528 /* Terminate worker thread if we are unconfigured - it will be
1529 restarted as needed... */
1530 if (ns.disk == D_DISKLESS &&
1531 ns.conn == C_STANDALONE &&
1532 ns.role == R_SECONDARY) {
1533 if (os.aftr_isp != ns.aftr_isp)
1534 resume_next_sg(mdev);
1535 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1536 if (test_bit(DEVICE_DYING, &mdev->flags))
1537 drbd_thread_stop_nowait(&mdev->worker);
1538 }
1539
1540 drbd_md_sync(mdev);
1541}
1542
1543
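/* Common kthread entry point for the receiver, worker and asender threads:
 * runs thi->function() in a loop so that a concurrent "Restarting" request
 * re-enters it, and on final exit completes &thi->stop and drops the module
 * reference taken in drbd_thread_start(). */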
1544static int drbd_thread_setup(void *arg)
1545{
1546 struct drbd_thread *thi = (struct drbd_thread *) arg;
1547 struct drbd_conf *mdev = thi->mdev;
1548 unsigned long flags;
1549 int retval;
1550
1551restart:
1552 retval = thi->function(thi);
1553
1554 spin_lock_irqsave(&thi->t_lock, flags);
1555
1556 /* if the receiver has been "Exiting", the last thing it did
1557 * was set the conn state to "StandAlone",
1558 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1559 * and receiver thread will be "started".
1560 * drbd_thread_start needs to set "Restarting" in that case.
1561	 * The t_state check and assignment need to happen within the same spinlock,
1562	 * so either thread_start sees Exiting, and can remap to Restarting,
1563	 * or thread_start sees None, and can proceed as normal.
1564 */
1565
1566 if (thi->t_state == Restarting) {
1567 dev_info(DEV, "Restarting %s\n", current->comm);
1568 thi->t_state = Running;
1569 spin_unlock_irqrestore(&thi->t_lock, flags);
1570 goto restart;
1571 }
1572
1573 thi->task = NULL;
1574 thi->t_state = None;
1575 smp_mb();
1576 complete(&thi->stop);
1577 spin_unlock_irqrestore(&thi->t_lock, flags);
1578
1579 dev_info(DEV, "Terminating %s\n", current->comm);
1580
1581 /* Release mod reference taken when thread was started */
1582 module_put(THIS_MODULE);
1583 return retval;
1584}
1585
1586static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1587 int (*func) (struct drbd_thread *))
1588{
1589 spin_lock_init(&thi->t_lock);
1590 thi->task = NULL;
1591 thi->t_state = None;
1592 thi->function = func;
1593 thi->mdev = mdev;
1594}
1595
1596int drbd_thread_start(struct drbd_thread *thi)
1597{
1598 struct drbd_conf *mdev = thi->mdev;
1599 struct task_struct *nt;
1600 unsigned long flags;
1601
1602 const char *me =
1603 thi == &mdev->receiver ? "receiver" :
1604 thi == &mdev->asender ? "asender" :
1605 thi == &mdev->worker ? "worker" : "NONSENSE";
1606
1607 /* is used from state engine doing drbd_thread_stop_nowait,
1608 * while holding the req lock irqsave */
1609 spin_lock_irqsave(&thi->t_lock, flags);
1610
1611 switch (thi->t_state) {
1612 case None:
1613 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1614 me, current->comm, current->pid);
1615
1616 /* Get ref on module for thread - this is released when thread exits */
1617 if (!try_module_get(THIS_MODULE)) {
1618 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1619 spin_unlock_irqrestore(&thi->t_lock, flags);
1620 return FALSE;
1621 }
1622
1623 init_completion(&thi->stop);
1624 D_ASSERT(thi->task == NULL);
1625 thi->reset_cpu_mask = 1;
1626 thi->t_state = Running;
1627 spin_unlock_irqrestore(&thi->t_lock, flags);
1628		flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1629
1630 nt = kthread_create(drbd_thread_setup, (void *) thi,
1631 "drbd%d_%s", mdev_to_minor(mdev), me);
1632
1633 if (IS_ERR(nt)) {
1634 dev_err(DEV, "Couldn't start thread\n");
1635
1636 module_put(THIS_MODULE);
1637 return FALSE;
1638 }
1639 spin_lock_irqsave(&thi->t_lock, flags);
1640 thi->task = nt;
1641 thi->t_state = Running;
1642 spin_unlock_irqrestore(&thi->t_lock, flags);
1643 wake_up_process(nt);
1644 break;
1645 case Exiting:
1646 thi->t_state = Restarting;
1647 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1648 me, current->comm, current->pid);
1649 /* fall through */
1650 case Running:
1651 case Restarting:
1652 default:
1653 spin_unlock_irqrestore(&thi->t_lock, flags);
1654 break;
1655 }
1656
1657 return TRUE;
1658}
1659
1660
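/* Ask a DRBD thread to stop (restart == 0) or to restart itself (restart != 0).
 * The thread is kicked out of blocking calls with DRBD_SIGKILL unless it is
 * the caller itself; with wait != 0 we block until the thread has signalled
 * &thi->stop. */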
1661void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1662{
1663 unsigned long flags;
1664
1665 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1666
1667 /* may be called from state engine, holding the req lock irqsave */
1668 spin_lock_irqsave(&thi->t_lock, flags);
1669
1670 if (thi->t_state == None) {
1671 spin_unlock_irqrestore(&thi->t_lock, flags);
1672 if (restart)
1673 drbd_thread_start(thi);
1674 return;
1675 }
1676
1677 if (thi->t_state != ns) {
1678 if (thi->task == NULL) {
1679 spin_unlock_irqrestore(&thi->t_lock, flags);
1680 return;
1681 }
1682
1683 thi->t_state = ns;
1684 smp_mb();
1685 init_completion(&thi->stop);
1686 if (thi->task != current)
1687 force_sig(DRBD_SIGKILL, thi->task);
1688
1689 }
1690
1691 spin_unlock_irqrestore(&thi->t_lock, flags);
1692
1693 if (wait)
1694 wait_for_completion(&thi->stop);
1695}
1696
1697#ifdef CONFIG_SMP
1698/**
1699 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1700 * @mdev: DRBD device.
1701 *
1702 * Forces all threads of a device onto the same CPU. This is beneficial for
1703 * DRBD's performance. May be overridden by the user's configuration.
1704 */
1705void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1706{
1707 int ord, cpu;
1708
1709 /* user override. */
1710 if (cpumask_weight(mdev->cpu_mask))
1711 return;
1712
1713 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1714 for_each_online_cpu(cpu) {
1715 if (ord-- == 0) {
1716 cpumask_set_cpu(cpu, mdev->cpu_mask);
1717 return;
1718 }
1719 }
1720 /* should not be reached */
1721 cpumask_setall(mdev->cpu_mask);
1722}
1723
1724/**
1725 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1726 * @mdev: DRBD device.
1727 *
1728 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1729 * prematurely.
1730 */
1731void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1732{
1733 struct task_struct *p = current;
1734 struct drbd_thread *thi =
1735 p == mdev->asender.task ? &mdev->asender :
1736 p == mdev->receiver.task ? &mdev->receiver :
1737 p == mdev->worker.task ? &mdev->worker :
1738 NULL;
1739 ERR_IF(thi == NULL)
1740 return;
1741 if (!thi->reset_cpu_mask)
1742 return;
1743 thi->reset_cpu_mask = 0;
1744 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1745}
1746#endif
1747
1748/* the appropriate socket mutex must be held already */
1749int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001750 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001751 size_t size, unsigned msg_flags)
1752{
1753 int sent, ok;
1754
1755 ERR_IF(!h) return FALSE;
1756 ERR_IF(!size) return FALSE;
1757
1758 h->magic = BE_DRBD_MAGIC;
1759 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001760 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001761
Philipp Reisnerb411b362009-09-25 16:07:19 -07001762 sent = drbd_send(mdev, sock, h, size, msg_flags);
1763
1764 ok = (sent == size);
1765 if (!ok)
1766 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1767 cmdname(cmd), (int)size, sent);
1768 return ok;
1769}
1770
1771/* don't pass the socket. we may only look at it
1772 * when we hold the appropriate socket mutex.
1773 */
1774int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001775 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001776{
1777 int ok = 0;
1778 struct socket *sock;
1779
1780 if (use_data_socket) {
1781 mutex_lock(&mdev->data.mutex);
1782 sock = mdev->data.socket;
1783 } else {
1784 mutex_lock(&mdev->meta.mutex);
1785 sock = mdev->meta.socket;
1786 }
1787
1788 /* drbd_disconnect() could have called drbd_free_sock()
1789 * while we were waiting in down()... */
1790 if (likely(sock != NULL))
1791 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1792
1793 if (use_data_socket)
1794 mutex_unlock(&mdev->data.mutex);
1795 else
1796 mutex_unlock(&mdev->meta.mutex);
1797 return ok;
1798}
1799
1800int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1801 size_t size)
1802{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001803 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001804 int ok;
1805
1806 h.magic = BE_DRBD_MAGIC;
1807 h.command = cpu_to_be16(cmd);
1808 h.length = cpu_to_be16(size);
1809
1810 if (!drbd_get_data_sock(mdev))
1811 return 0;
1812
Philipp Reisnerb411b362009-09-25 16:07:19 -07001813 ok = (sizeof(h) ==
1814 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1815 ok = ok && (size ==
1816 drbd_send(mdev, mdev->data.socket, data, size, 0));
1817
1818 drbd_put_data_sock(mdev);
1819
1820 return ok;
1821}
1822
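/* Send our syncer configuration to the peer. The packet layout, and thus its
 * size, depends on the agreed protocol version: p_rs_param, p_rs_param_89 or
 * p_rs_param_95. */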
1823int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1824{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001825 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001826 struct socket *sock;
1827 int size, rv;
1828 const int apv = mdev->agreed_pro_version;
1829
1830 size = apv <= 87 ? sizeof(struct p_rs_param)
1831 : apv == 88 ? sizeof(struct p_rs_param)
1832 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001833 : apv <= 94 ? sizeof(struct p_rs_param_89)
1834 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001835
1836 /* used from admin command context and receiver/worker context.
1837 * to avoid kmalloc, grab the socket right here,
1838 * then use the pre-allocated sbuf there */
1839 mutex_lock(&mdev->data.mutex);
1840 sock = mdev->data.socket;
1841
1842 if (likely(sock != NULL)) {
1843 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1844
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001845 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001846
1847 /* initialize verify_alg and csums_alg */
1848 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1849
1850 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001851 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1852 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1853 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1854 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001855
1856 if (apv >= 88)
1857 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1858 if (apv >= 89)
1859 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1860
1861 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1862 } else
1863 rv = 0; /* not ok */
1864
1865 mutex_unlock(&mdev->data.mutex);
1866
1867 return rv;
1868}
1869
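/* Send our net_conf settings (wire protocol, after-split-brain policies,
 * two-primaries flag, integrity algorithm, ...) to the peer during the
 * handshake. */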
1870int drbd_send_protocol(struct drbd_conf *mdev)
1871{
1872 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001873 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001874
1875 size = sizeof(struct p_protocol);
1876
1877 if (mdev->agreed_pro_version >= 87)
1878 size += strlen(mdev->net_conf->integrity_alg) + 1;
1879
1880 /* we must not recurse into our own queue,
1881 * as that is blocked during handshake */
1882 p = kmalloc(size, GFP_NOIO);
1883 if (p == NULL)
1884 return 0;
1885
1886 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1887 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1888 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1889 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001890 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1891
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001892 cf = 0;
1893 if (mdev->net_conf->want_lose)
1894 cf |= CF_WANT_LOSE;
1895 if (mdev->net_conf->dry_run) {
1896 if (mdev->agreed_pro_version >= 92)
1897 cf |= CF_DRY_RUN;
1898 else {
1899			dev_err(DEV, "--dry-run is not supported by peer\n");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001900 kfree(p);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001901 return 0;
1902 }
1903 }
1904 p->conn_flags = cpu_to_be32(cf);
1905
Philipp Reisnerb411b362009-09-25 16:07:19 -07001906 if (mdev->agreed_pro_version >= 87)
1907 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1908
1909 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001910 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001911 kfree(p);
1912 return rv;
1913}
1914
1915int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1916{
1917 struct p_uuids p;
1918 int i;
1919
1920 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1921 return 1;
1922
1923 for (i = UI_CURRENT; i < UI_SIZE; i++)
1924 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1925
1926 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1927 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1928 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1929 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1930 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1931 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1932
1933 put_ldev(mdev);
1934
1935 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001936 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001937}
1938
1939int drbd_send_uuids(struct drbd_conf *mdev)
1940{
1941 return _drbd_send_uuids(mdev, 0);
1942}
1943
1944int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1945{
1946 return _drbd_send_uuids(mdev, 8);
1947}
1948
1949
1950int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1951{
1952 struct p_rs_uuid p;
1953
1954 p.uuid = cpu_to_be64(val);
1955
1956 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001957 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001958}
1959
Philipp Reisnere89b5912010-03-24 17:11:33 +01001960int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001961{
1962 struct p_sizes p;
1963 sector_t d_size, u_size;
1964 int q_order_type;
1965 int ok;
1966
1967 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1968 D_ASSERT(mdev->ldev->backing_bdev);
1969 d_size = drbd_get_max_capacity(mdev->ldev);
1970 u_size = mdev->ldev->dc.disk_size;
1971 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001972 put_ldev(mdev);
1973 } else {
1974 d_size = 0;
1975 u_size = 0;
1976 q_order_type = QUEUE_ORDERED_NONE;
1977 }
1978
1979 p.d_size = cpu_to_be64(d_size);
1980 p.u_size = cpu_to_be64(u_size);
1981 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01001982 p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
Philipp Reisnere89b5912010-03-24 17:11:33 +01001983 p.queue_order_type = cpu_to_be16(q_order_type);
1984 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001985
1986 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001987 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001988 return ok;
1989}
1990
1991/**
1992 * drbd_send_state() - Sends the drbd state to the peer
1993 * @mdev: DRBD device.
1994 */
1995int drbd_send_state(struct drbd_conf *mdev)
1996{
1997 struct socket *sock;
1998 struct p_state p;
1999 int ok = 0;
2000
2001	/* Grab the state lock so we won't send state while we're in the middle
2002	 * of a cluster-wide state change on another thread */
2003 drbd_state_lock(mdev);
2004
2005 mutex_lock(&mdev->data.mutex);
2006
2007 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2008 sock = mdev->data.socket;
2009
2010 if (likely(sock != NULL)) {
2011 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002012 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002013 }
2014
2015 mutex_unlock(&mdev->data.mutex);
2016
2017 drbd_state_unlock(mdev);
2018 return ok;
2019}
2020
2021int drbd_send_state_req(struct drbd_conf *mdev,
2022 union drbd_state mask, union drbd_state val)
2023{
2024 struct p_req_state p;
2025
2026 p.mask = cpu_to_be32(mask.i);
2027 p.val = cpu_to_be32(val.i);
2028
2029 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002030 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002031}
2032
2033int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
2034{
2035 struct p_req_state_reply p;
2036
2037 p.retcode = cpu_to_be32(retcode);
2038
2039 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002040 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002041}
2042
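/* Run-length encode a chunk of the bitmap into p->code using VLI codes.
 * p->encoding & 0x80 records whether the first run consists of set bits;
 * after that, run lengths of clear and set bits simply alternate.  E.g. the
 * bits 0000 1111 0 1 go out as start=0 and the runs 4, 4, 1, 1.
 * Returns the number of code bytes used; 0 if RLE is not usable, on encoding
 * errors, or if the result would not be smaller than plain text; -1 if the
 * bitmap was modified while we were scanning it. */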
2043int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2044 struct p_compressed_bm *p,
2045 struct bm_xfer_ctx *c)
2046{
2047 struct bitstream bs;
2048 unsigned long plain_bits;
2049 unsigned long tmp;
2050 unsigned long rl;
2051 unsigned len;
2052 unsigned toggle;
2053 int bits;
2054
2055 /* may we use this feature? */
2056 if ((mdev->sync_conf.use_rle == 0) ||
2057 (mdev->agreed_pro_version < 90))
2058 return 0;
2059
2060 if (c->bit_offset >= c->bm_bits)
2061 return 0; /* nothing to do. */
2062
2063 /* use at most thus many bytes */
2064 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2065 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2066 /* plain bits covered in this code string */
2067 plain_bits = 0;
2068
2069 /* p->encoding & 0x80 stores whether the first run length is set.
2070 * bit offset is implicit.
2071 * start with toggle == 2 to be able to tell the first iteration */
2072 toggle = 2;
2073
2074 /* see how much plain bits we can stuff into one packet
2075 * using RLE and VLI. */
2076 do {
2077 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2078 : _drbd_bm_find_next(mdev, c->bit_offset);
2079 if (tmp == -1UL)
2080 tmp = c->bm_bits;
2081 rl = tmp - c->bit_offset;
2082
2083 if (toggle == 2) { /* first iteration */
2084 if (rl == 0) {
2085 /* the first checked bit was set,
2086 * store start value, */
2087 DCBP_set_start(p, 1);
2088 /* but skip encoding of zero run length */
2089 toggle = !toggle;
2090 continue;
2091 }
2092 DCBP_set_start(p, 0);
2093 }
2094
2095 /* paranoia: catch zero runlength.
2096 * can only happen if bitmap is modified while we scan it. */
2097 if (rl == 0) {
2098 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2099 "t:%u bo:%lu\n", toggle, c->bit_offset);
2100 return -1;
2101 }
2102
2103 bits = vli_encode_bits(&bs, rl);
2104 if (bits == -ENOBUFS) /* buffer full */
2105 break;
2106 if (bits <= 0) {
2107 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2108 return 0;
2109 }
2110
2111 toggle = !toggle;
2112 plain_bits += rl;
2113 c->bit_offset = tmp;
2114 } while (c->bit_offset < c->bm_bits);
2115
2116 len = bs.cur.b - p->code + !!bs.cur.bit;
2117
2118 if (plain_bits < (len << 3)) {
2119 /* incompressible with this method.
2120 * we need to rewind both word and bit position. */
2121 c->bit_offset -= plain_bits;
2122 bm_xfer_ctx_bit_to_word_offset(c);
2123 c->bit_offset = c->word_offset * BITS_PER_LONG;
2124 return 0;
2125 }
2126
2127 /* RLE + VLI was able to compress it just fine.
2128 * update c->word_offset. */
2129 bm_xfer_ctx_bit_to_word_offset(c);
2130
2131 /* store pad_bits */
2132 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2133
2134 return len;
2135}
2136
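/* Send the next chunk of the bitmap: RLE/VLI compressed if that is smaller,
 * otherwise as a plain buffer of up to BM_PACKET_WORDS longs.  Returns OK
 * while there is more to send, DONE once the whole bitmap went out, and
 * FAILED on errors. */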
2137enum { OK, FAILED, DONE }
2138send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002139 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002140{
2141 struct p_compressed_bm *p = (void*)h;
2142 unsigned long num_words;
2143 int len;
2144 int ok;
2145
2146 len = fill_bitmap_rle_bits(mdev, p, c);
2147
2148 if (len < 0)
2149 return FAILED;
2150
2151 if (len) {
2152 DCBP_set_code(p, RLE_VLI_Bits);
2153 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2154 sizeof(*p) + len, 0);
2155
2156 c->packets[0]++;
2157 c->bytes[0] += sizeof(*p) + len;
2158
2159 if (c->bit_offset >= c->bm_bits)
2160 len = 0; /* DONE */
2161 } else {
2162 /* was not compressible.
2163 * send a buffer full of plain text bits instead. */
2164 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2165 len = num_words * sizeof(long);
2166 if (len)
2167 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2168 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002169 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002170 c->word_offset += num_words;
2171 c->bit_offset = c->word_offset * BITS_PER_LONG;
2172
2173 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002174 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002175
2176 if (c->bit_offset > c->bm_bits)
2177 c->bit_offset = c->bm_bits;
2178 }
2179 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2180
2181 if (ok == DONE)
2182 INFO_bm_xfer_stats(mdev, "send", c);
2183 return ok;
2184}
2185
2186/* See the comment at receive_bitmap() */
2187int _drbd_send_bitmap(struct drbd_conf *mdev)
2188{
2189 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002190 struct p_header80 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002191 int ret;
2192
2193 ERR_IF(!mdev->bitmap) return FALSE;
2194
2195 /* maybe we should use some per thread scratch page,
2196 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002197 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002198 if (!p) {
2199 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2200 return FALSE;
2201 }
2202
2203 if (get_ldev(mdev)) {
2204 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2205 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2206 drbd_bm_set_all(mdev);
2207 if (drbd_bm_write(mdev)) {
2208 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2209 * but otherwise process as per normal - need to tell other
2210 * side that a full resync is required! */
2211 dev_err(DEV, "Failed to write bitmap to disk!\n");
2212 } else {
2213 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2214 drbd_md_sync(mdev);
2215 }
2216 }
2217 put_ldev(mdev);
2218 }
2219
2220 c = (struct bm_xfer_ctx) {
2221 .bm_bits = drbd_bm_bits(mdev),
2222 .bm_words = drbd_bm_words(mdev),
2223 };
2224
2225 do {
2226 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2227 } while (ret == OK);
2228
2229 free_page((unsigned long) p);
2230 return (ret == DONE);
2231}
2232
2233int drbd_send_bitmap(struct drbd_conf *mdev)
2234{
2235 int err;
2236
2237 if (!drbd_get_data_sock(mdev))
2238 return -1;
2239 err = !_drbd_send_bitmap(mdev);
2240 drbd_put_data_sock(mdev);
2241 return err;
2242}
2243
2244int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2245{
2246 int ok;
2247 struct p_barrier_ack p;
2248
2249 p.barrier = barrier_nr;
2250 p.set_size = cpu_to_be32(set_size);
2251
2252 if (mdev->state.conn < C_CONNECTED)
2253 return FALSE;
2254 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002255 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002256 return ok;
2257}
2258
2259/**
2260 * _drbd_send_ack() - Sends an ack packet
2261 * @mdev: DRBD device.
2262 * @cmd: Packet command code.
2263 * @sector: sector, needs to be in big endian byte order
2264 * @blksize: size in byte, needs to be in big endian byte order
2265 * @block_id: Id, big endian byte order
2266 */
2267static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2268 u64 sector,
2269 u32 blksize,
2270 u64 block_id)
2271{
2272 int ok;
2273 struct p_block_ack p;
2274
2275 p.sector = sector;
2276 p.block_id = block_id;
2277 p.blksize = blksize;
2278 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2279
2280 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2281 return FALSE;
2282 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002283 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002284 return ok;
2285}
2286
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002287/* dp->sector and dp->block_id already/still in network byte order,
2288 * data_size is payload size according to dp->head,
2289 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002290int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002291 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002292{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002293 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2294 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002295 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2296 dp->block_id);
2297}
2298
2299int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2300 struct p_block_req *rp)
2301{
2302 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2303}
2304
2305/**
2306 * drbd_send_ack() - Sends an ack packet
2307 * @mdev: DRBD device.
2308 * @cmd: Packet command code.
2309 * @e: Epoch entry.
2310 */
2311int drbd_send_ack(struct drbd_conf *mdev,
2312 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2313{
2314 return _drbd_send_ack(mdev, cmd,
2315 cpu_to_be64(e->sector),
2316 cpu_to_be32(e->size),
2317 e->block_id);
2318}
2319
2320/* This function misuses the block_id field to signal if the blocks
2321 * are in sync or not. */
2322int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2323 sector_t sector, int blksize, u64 block_id)
2324{
2325 return _drbd_send_ack(mdev, cmd,
2326 cpu_to_be64(sector),
2327 cpu_to_be32(blksize),
2328 cpu_to_be64(block_id));
2329}
2330
2331int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2332 sector_t sector, int size, u64 block_id)
2333{
2334 int ok;
2335 struct p_block_req p;
2336
2337 p.sector = cpu_to_be64(sector);
2338 p.block_id = block_id;
2339 p.blksize = cpu_to_be32(size);
2340
2341 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002342 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002343 return ok;
2344}
2345
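/* Like drbd_send_drequest(), but append a digest of our local data for the
 * requested block directly after the request header. */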
2346int drbd_send_drequest_csum(struct drbd_conf *mdev,
2347 sector_t sector, int size,
2348 void *digest, int digest_size,
2349 enum drbd_packets cmd)
2350{
2351 int ok;
2352 struct p_block_req p;
2353
2354 p.sector = cpu_to_be64(sector);
2355 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2356 p.blksize = cpu_to_be32(size);
2357
2358 p.head.magic = BE_DRBD_MAGIC;
2359 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002360 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002361
2362 mutex_lock(&mdev->data.mutex);
2363
2364 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2365 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2366
2367 mutex_unlock(&mdev->data.mutex);
2368
2369 return ok;
2370}
2371
2372int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2373{
2374 int ok;
2375 struct p_block_req p;
2376
2377 p.sector = cpu_to_be64(sector);
2378 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2379 p.blksize = cpu_to_be32(size);
2380
2381 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002382 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002383 return ok;
2384}
2385
2386/* called on sndtimeo
2387 * returns FALSE if we should retry,
2388 * TRUE if we think connection is dead
2389 */
2390static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2391{
2392 int drop_it;
2393 /* long elapsed = (long)(jiffies - mdev->last_received); */
2394
2395 drop_it = mdev->meta.socket == sock
2396 || !mdev->asender.task
2397 || get_t_state(&mdev->asender) != Running
2398 || mdev->state.conn < C_CONNECTED;
2399
2400 if (drop_it)
2401 return TRUE;
2402
2403 drop_it = !--mdev->ko_count;
2404 if (!drop_it) {
2405 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2406 current->comm, current->pid, mdev->ko_count);
2407 request_ping(mdev);
2408 }
2409
2410 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2411}
2412
2413/* The idea of sendpage seems to be to put some kind of reference
2414 * to the page into the skb, and to hand it over to the NIC. In
2415 * this process get_page() gets called.
2416 *
2417 * As soon as the page was really sent over the network put_page()
2418 * gets called by some part of the network layer. [ NIC driver? ]
2419 *
2420 * [ get_page() / put_page() increment/decrement the count. If count
2421 * reaches 0 the page will be freed. ]
2422 *
2423 * This works nicely with pages from FSs.
2424 * But this means that in protocol A we might signal IO completion too early!
2425 *
2426 * In order not to corrupt data during a resync we must make sure
2427 * that we do not reuse our own buffer pages (EEs) too early, therefore
2428 * we have the net_ee list.
2429 *
2430 * XFS seems to have problems, still, it submits pages with page_count == 0!
2431 * As a workaround, we disable sendpage on pages
2432 * with page_count == 0 or PageSlab.
2433 */
2434static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002435 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002436{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002437 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002438 kunmap(page);
2439 if (sent == size)
2440 mdev->send_cnt += size>>9;
2441 return sent == size;
2442}
2443
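/* Zero-copy variant: hand the page to the socket via its ->sendpage()
 * operation, retrying on -EAGAIN as long as the connection still looks
 * healthy.  Falls back to _drbd_no_send_page() when sendpage is disabled,
 * and for slab pages or pages with a page_count of 0 (see the XFS note
 * above). */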
2444static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002445 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002446{
2447 mm_segment_t oldfs = get_fs();
2448 int sent, ok;
2449 int len = size;
2450
2451 /* e.g. XFS meta- & log-data is in slab pages, which have a
2452 * page_count of 0 and/or have PageSlab() set.
2453 * we cannot use send_page for those, as that does get_page();
2454 * put_page(); and would cause either a VM_BUG directly, or
2455 * __page_cache_release a page that would actually still be referenced
2456 * by someone, leading to some obscure delayed Oops somewhere else. */
2457 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002458 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002459
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002460 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002461 drbd_update_congested(mdev);
2462 set_fs(KERNEL_DS);
2463 do {
2464 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2465 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002466 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002467 if (sent == -EAGAIN) {
2468 if (we_should_drop_the_connection(mdev,
2469 mdev->data.socket))
2470 break;
2471 else
2472 continue;
2473 }
2474 if (sent <= 0) {
2475 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2476 __func__, (int)size, len, sent);
2477 break;
2478 }
2479 len -= sent;
2480 offset += sent;
2481 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2482 set_fs(oldfs);
2483 clear_bit(NET_CONGESTED, &mdev->flags);
2484
2485 ok = (len == 0);
2486 if (likely(ok))
2487 mdev->send_cnt += size>>9;
2488 return ok;
2489}
2490
2491static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2492{
2493 struct bio_vec *bvec;
2494 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002495 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002496 __bio_for_each_segment(bvec, bio, i, 0) {
2497 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002498 bvec->bv_offset, bvec->bv_len,
2499 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002500 return 0;
2501 }
2502 return 1;
2503}
2504
2505static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2506{
2507 struct bio_vec *bvec;
2508 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002509 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002510 __bio_for_each_segment(bvec, bio, i, 0) {
2511 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002512 bvec->bv_offset, bvec->bv_len,
2513 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002514 return 0;
2515 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002516 return 1;
2517}
2518
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002519static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2520{
2521 struct page *page = e->pages;
2522 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002523 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002524 page_chain_for_each(page) {
2525 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002526 if (!_drbd_send_page(mdev, page, 0, l,
2527 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002528 return 0;
2529 len -= l;
2530 }
2531 return 1;
2532}
2533
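/* Translate the REQ_* flags of the local bio into the DP_* flags we put on
 * the wire; peers with an agreed protocol version < 95 only understand the
 * sync hint. */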
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002534static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2535{
2536 if (mdev->agreed_pro_version >= 95)
2537 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002538 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2539 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2540 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2541 else
Jens Axboe721a9602011-03-09 11:56:30 +01002542 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002543}
2544
Philipp Reisnerb411b362009-09-25 16:07:19 -07002545/* Used to send write requests
2546 * R_PRIMARY -> Peer (P_DATA)
2547 */
2548int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2549{
2550 int ok = 1;
2551 struct p_data p;
2552 unsigned int dp_flags = 0;
2553 void *dgb;
2554 int dgs;
2555
2556 if (!drbd_get_data_sock(mdev))
2557 return 0;
2558
2559 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2560 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2561
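	/* Requests whose payload still fits the 16 bit length field use the
	 * old h80 header; larger ones need the h95 header with its 32 bit
	 * length field. */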
Philipp Reisnerd5373382010-08-23 15:18:33 +02002562 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002563 p.head.h80.magic = BE_DRBD_MAGIC;
2564 p.head.h80.command = cpu_to_be16(P_DATA);
2565 p.head.h80.length =
2566 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2567 } else {
2568 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2569 p.head.h95.command = cpu_to_be16(P_DATA);
2570 p.head.h95.length =
2571 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2572 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002573
2574 p.sector = cpu_to_be64(req->sector);
2575 p.block_id = (unsigned long)req;
2576 p.seq_num = cpu_to_be32(req->seq_num =
2577 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002578
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002579 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2580
Philipp Reisnerb411b362009-09-25 16:07:19 -07002581 if (mdev->state.conn >= C_SYNC_SOURCE &&
2582 mdev->state.conn <= C_PAUSED_SYNC_T)
2583 dp_flags |= DP_MAY_SET_IN_SYNC;
2584
2585 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002586 set_bit(UNPLUG_REMOTE, &mdev->flags);
2587 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002588 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002589 if (ok && dgs) {
2590 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002591 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002592 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002593 }
2594 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002595 /* For protocol A, we have to memcpy the payload into
2596 * socket buffers, as we may complete right away
2597 * as soon as we handed it over to tcp, at which point the data
2598 * pages may become invalid.
2599 *
2600 * For data-integrity enabled, we copy it as well, so we can be
2601 * sure that even if the bio pages may still be modified, it
2602 * won't change the data on the wire, thus if the digest checks
2603 * out ok after sending on this side, but does not fit on the
2604 * receiving side, we sure have detected corruption elsewhere.
2605 */
2606 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002607 ok = _drbd_send_bio(mdev, req->master_bio);
2608 else
2609 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002610
2611 /* double check digest, sometimes buffers have been modified in flight. */
2612 if (dgs > 0 && dgs <= 64) {
2613			/* 64 byte, 512 bit, is the largest digest size
2614 * currently supported in kernel crypto. */
2615 unsigned char digest[64];
2616 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2617 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2618 dev_warn(DEV,
2619 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2620 (unsigned long long)req->sector, req->size);
2621 }
2622 } /* else if (dgs > 64) {
2623 ... Be noisy about digest too large ...
2624 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002625 }
2626
2627 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002628
Philipp Reisnerb411b362009-09-25 16:07:19 -07002629 return ok;
2630}
2631
2632/* answer packet, used to send data back for read requests:
2633 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2634 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2635 */
2636int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2637 struct drbd_epoch_entry *e)
2638{
2639 int ok;
2640 struct p_data p;
2641 void *dgb;
2642 int dgs;
2643
2644 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2645 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2646
Philipp Reisnerd5373382010-08-23 15:18:33 +02002647 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002648 p.head.h80.magic = BE_DRBD_MAGIC;
2649 p.head.h80.command = cpu_to_be16(cmd);
2650 p.head.h80.length =
2651 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2652 } else {
2653 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2654 p.head.h95.command = cpu_to_be16(cmd);
2655 p.head.h95.length =
2656 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2657 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002658
2659 p.sector = cpu_to_be64(e->sector);
2660 p.block_id = e->block_id;
2661 /* p.seq_num = 0; No sequence numbers here.. */
2662
2663 /* Only called by our kernel thread.
2664 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2665 * in response to admin command or module unload.
2666 */
2667 if (!drbd_get_data_sock(mdev))
2668 return 0;
2669
Philipp Reisner0b70a132010-08-20 13:36:10 +02002670 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002671 if (ok && dgs) {
2672 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002673 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002674 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002675 }
2676 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002677 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002678
2679 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002680
Philipp Reisnerb411b362009-09-25 16:07:19 -07002681 return ok;
2682}
2683
Philipp Reisner73a01a12010-10-27 14:33:00 +02002684int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2685{
2686 struct p_block_desc p;
2687
2688 p.sector = cpu_to_be64(req->sector);
2689 p.blksize = cpu_to_be32(req->size);
2690
2691 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2692}
2693
Philipp Reisnerb411b362009-09-25 16:07:19 -07002694/*
2695 drbd_send distinguishes two cases:
2696
2697 Packets sent via the data socket "sock"
2698 and packets sent via the meta data socket "msock"
2699
2700 sock msock
2701 -----------------+-------------------------+------------------------------
2702 timeout conf.timeout / 2 conf.timeout / 2
2703 timeout action send a ping via msock Abort communication
2704 and close all sockets
2705*/
2706
2707/*
2708 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2709 */
2710int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2711 void *buf, size_t size, unsigned msg_flags)
2712{
2713 struct kvec iov;
2714 struct msghdr msg;
2715 int rv, sent = 0;
2716
2717 if (!sock)
2718 return -1000;
2719
2720 /* THINK if (signal_pending) return ... ? */
2721
2722 iov.iov_base = buf;
2723 iov.iov_len = size;
2724
2725 msg.msg_name = NULL;
2726 msg.msg_namelen = 0;
2727 msg.msg_control = NULL;
2728 msg.msg_controllen = 0;
2729 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2730
2731 if (sock == mdev->data.socket) {
2732 mdev->ko_count = mdev->net_conf->ko_count;
2733 drbd_update_congested(mdev);
2734 }
2735 do {
2736 /* STRANGE
2737 * tcp_sendmsg does _not_ use its size parameter at all ?
2738 *
2739 * -EAGAIN on timeout, -EINTR on signal.
2740 */
2741/* THINK
2742 * do we need to block DRBD_SIG if sock == &meta.socket ??
2743 * otherwise wake_asender() might interrupt some send_*Ack !
2744 */
2745 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2746 if (rv == -EAGAIN) {
2747 if (we_should_drop_the_connection(mdev, sock))
2748 break;
2749 else
2750 continue;
2751 }
2752 D_ASSERT(rv != 0);
2753 if (rv == -EINTR) {
2754 flush_signals(current);
2755 rv = 0;
2756 }
2757 if (rv < 0)
2758 break;
2759 sent += rv;
2760 iov.iov_base += rv;
2761 iov.iov_len -= rv;
2762 } while (sent < size);
2763
2764 if (sock == mdev->data.socket)
2765 clear_bit(NET_CONGESTED, &mdev->flags);
2766
2767 if (rv <= 0) {
2768 if (rv != -EAGAIN) {
2769 dev_err(DEV, "%s_sendmsg returned %d\n",
2770 sock == mdev->meta.socket ? "msock" : "sock",
2771 rv);
2772 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2773 } else
2774 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2775 }
2776
2777 return sent;
2778}
2779
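/* block_device_operations open: while we are not Primary, writable opens are
 * refused with -EROFS, and read-only opens are only allowed if allow_oos is
 * set. */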
2780static int drbd_open(struct block_device *bdev, fmode_t mode)
2781{
2782 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2783 unsigned long flags;
2784 int rv = 0;
2785
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002786 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002787 spin_lock_irqsave(&mdev->req_lock, flags);
2788 /* to have a stable mdev->state.role
2789 * and no race with updating open_cnt */
2790
2791 if (mdev->state.role != R_PRIMARY) {
2792 if (mode & FMODE_WRITE)
2793 rv = -EROFS;
2794 else if (!allow_oos)
2795 rv = -EMEDIUMTYPE;
2796 }
2797
2798 if (!rv)
2799 mdev->open_cnt++;
2800 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002801 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002802
2803 return rv;
2804}
2805
2806static int drbd_release(struct gendisk *gd, fmode_t mode)
2807{
2808 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002809 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002810 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002811 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002812 return 0;
2813}
2814
Philipp Reisnerb411b362009-09-25 16:07:19 -07002815static void drbd_set_defaults(struct drbd_conf *mdev)
2816{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002817 /* This way we get a compile error when sync_conf grows,
2818	   and we forget to initialize it here */
2819 mdev->sync_conf = (struct syncer_conf) {
2820 /* .rate = */ DRBD_RATE_DEF,
2821 /* .after = */ DRBD_AFTER_DEF,
2822 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002823 /* .verify_alg = */ {}, 0,
2824 /* .cpu_mask = */ {}, 0,
2825 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002826 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002827 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2828 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2829 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2830 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002831 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2832 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002833 };
2834
2835	/* Have to do it this way, because the layout differs between
2836	   big-endian and little-endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002837 mdev->state = (union drbd_state) {
2838 { .role = R_SECONDARY,
2839 .peer = R_UNKNOWN,
2840 .conn = C_STANDALONE,
2841 .disk = D_DISKLESS,
2842 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002843 .susp = 0,
2844 .susp_nod = 0,
2845 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002846 } };
2847}
2848
2849void drbd_init_set_defaults(struct drbd_conf *mdev)
2850{
2851 /* the memset(,0,) did most of this.
2852 * note: only assignments, no allocation in here */
2853
2854 drbd_set_defaults(mdev);
2855
Philipp Reisnerb411b362009-09-25 16:07:19 -07002856 atomic_set(&mdev->ap_bio_cnt, 0);
2857 atomic_set(&mdev->ap_pending_cnt, 0);
2858 atomic_set(&mdev->rs_pending_cnt, 0);
2859 atomic_set(&mdev->unacked_cnt, 0);
2860 atomic_set(&mdev->local_cnt, 0);
2861 atomic_set(&mdev->net_cnt, 0);
2862 atomic_set(&mdev->packet_seq, 0);
2863 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02002864 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02002865 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002866 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02002867 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002868
2869 mutex_init(&mdev->md_io_mutex);
2870 mutex_init(&mdev->data.mutex);
2871 mutex_init(&mdev->meta.mutex);
2872 sema_init(&mdev->data.work.s, 0);
2873 sema_init(&mdev->meta.work.s, 0);
2874 mutex_init(&mdev->state_mutex);
2875
2876 spin_lock_init(&mdev->data.work.q_lock);
2877 spin_lock_init(&mdev->meta.work.q_lock);
2878
2879 spin_lock_init(&mdev->al_lock);
2880 spin_lock_init(&mdev->req_lock);
2881 spin_lock_init(&mdev->peer_seq_lock);
2882 spin_lock_init(&mdev->epoch_lock);
2883
2884 INIT_LIST_HEAD(&mdev->active_ee);
2885 INIT_LIST_HEAD(&mdev->sync_ee);
2886 INIT_LIST_HEAD(&mdev->done_ee);
2887 INIT_LIST_HEAD(&mdev->read_ee);
2888 INIT_LIST_HEAD(&mdev->net_ee);
2889 INIT_LIST_HEAD(&mdev->resync_reads);
2890 INIT_LIST_HEAD(&mdev->data.work.q);
2891 INIT_LIST_HEAD(&mdev->meta.work.q);
2892 INIT_LIST_HEAD(&mdev->resync_work.list);
2893 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002894 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002895 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02002896 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002897 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02002898
Philipp Reisnerb411b362009-09-25 16:07:19 -07002899 mdev->resync_work.cb = w_resync_inactive;
2900 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002901 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002902 mdev->md_sync_work.cb = w_md_sync;
2903 mdev->bm_io_work.w.cb = w_bitmap_io;
2904 init_timer(&mdev->resync_timer);
2905 init_timer(&mdev->md_sync_timer);
2906 mdev->resync_timer.function = resync_timer_fn;
2907 mdev->resync_timer.data = (unsigned long) mdev;
2908 mdev->md_sync_timer.function = md_sync_timer_fn;
2909 mdev->md_sync_timer.data = (unsigned long) mdev;
2910
2911 init_waitqueue_head(&mdev->misc_wait);
2912 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02002913 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002914 init_waitqueue_head(&mdev->ee_wait);
2915 init_waitqueue_head(&mdev->al_wait);
2916 init_waitqueue_head(&mdev->seq_wait);
2917
2918 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2919 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2920 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2921
2922 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02002923 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002924 mdev->resync_wenr = LC_FREE;
2925}
2926
2927void drbd_mdev_cleanup(struct drbd_conf *mdev)
2928{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002929 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002930 if (mdev->receiver.t_state != None)
2931 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2932 mdev->receiver.t_state);
2933
2934 /* no need to lock it, I'm the only thread alive */
2935 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2936 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2937 mdev->al_writ_cnt =
2938 mdev->bm_writ_cnt =
2939 mdev->read_cnt =
2940 mdev->recv_cnt =
2941 mdev->send_cnt =
2942 mdev->writ_cnt =
2943 mdev->p_size =
2944 mdev->rs_start =
2945 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002946 mdev->rs_failed = 0;
2947 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002948 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002949 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2950 mdev->rs_mark_left[i] = 0;
2951 mdev->rs_mark_time[i] = 0;
2952 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002953 D_ASSERT(mdev->net_conf == NULL);
2954
2955 drbd_set_my_capacity(mdev, 0);
2956 if (mdev->bitmap) {
2957 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01002958 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002959 drbd_bm_cleanup(mdev);
2960 }
2961
2962 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02002963 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002964
2965 /*
2966 * currently we drbd_init_ee only on module load, so
2967 * we may do drbd_release_ee only on module unload!
2968 */
2969 D_ASSERT(list_empty(&mdev->active_ee));
2970 D_ASSERT(list_empty(&mdev->sync_ee));
2971 D_ASSERT(list_empty(&mdev->done_ee));
2972 D_ASSERT(list_empty(&mdev->read_ee));
2973 D_ASSERT(list_empty(&mdev->net_ee));
2974 D_ASSERT(list_empty(&mdev->resync_reads));
2975 D_ASSERT(list_empty(&mdev->data.work.q));
2976 D_ASSERT(list_empty(&mdev->meta.work.q));
2977 D_ASSERT(list_empty(&mdev->resync_work.list));
2978 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002979 D_ASSERT(list_empty(&mdev->go_diskless.list));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002980}
2981
2982
2983static void drbd_destroy_mempools(void)
2984{
2985 struct page *page;
2986
2987 while (drbd_pp_pool) {
2988 page = drbd_pp_pool;
2989 drbd_pp_pool = (struct page *)page_private(page);
2990 __free_page(page);
2991 drbd_pp_vacant--;
2992 }
2993
2994 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2995
2996 if (drbd_ee_mempool)
2997 mempool_destroy(drbd_ee_mempool);
2998 if (drbd_request_mempool)
2999 mempool_destroy(drbd_request_mempool);
3000 if (drbd_ee_cache)
3001 kmem_cache_destroy(drbd_ee_cache);
3002 if (drbd_request_cache)
3003 kmem_cache_destroy(drbd_request_cache);
3004 if (drbd_bm_ext_cache)
3005 kmem_cache_destroy(drbd_bm_ext_cache);
3006 if (drbd_al_ext_cache)
3007 kmem_cache_destroy(drbd_al_ext_cache);
3008
3009 drbd_ee_mempool = NULL;
3010 drbd_request_mempool = NULL;
3011 drbd_ee_cache = NULL;
3012 drbd_request_cache = NULL;
3013 drbd_bm_ext_cache = NULL;
3014 drbd_al_ext_cache = NULL;
3015
3016 return;
3017}
3018
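/* Allocate the global slab caches, mempools and the private page pool.
 * All pools are sized from (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count,
 * i.e. one maximally sized request's worth of pages per configured minor. */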
3019static int drbd_create_mempools(void)
3020{
3021 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003022 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003023 int i;
3024
3025 /* prepare our caches and mempools */
3026 drbd_request_mempool = NULL;
3027 drbd_ee_cache = NULL;
3028 drbd_request_cache = NULL;
3029 drbd_bm_ext_cache = NULL;
3030 drbd_al_ext_cache = NULL;
3031 drbd_pp_pool = NULL;
3032
3033 /* caches */
3034 drbd_request_cache = kmem_cache_create(
3035 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3036 if (drbd_request_cache == NULL)
3037 goto Enomem;
3038
3039 drbd_ee_cache = kmem_cache_create(
3040 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3041 if (drbd_ee_cache == NULL)
3042 goto Enomem;
3043
3044 drbd_bm_ext_cache = kmem_cache_create(
3045 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3046 if (drbd_bm_ext_cache == NULL)
3047 goto Enomem;
3048
3049 drbd_al_ext_cache = kmem_cache_create(
3050 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3051 if (drbd_al_ext_cache == NULL)
3052 goto Enomem;
3053
3054 /* mempools */
3055 drbd_request_mempool = mempool_create(number,
3056 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3057 if (drbd_request_mempool == NULL)
3058 goto Enomem;
3059
3060 drbd_ee_mempool = mempool_create(number,
3061 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003062 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003063 goto Enomem;
3064
3065 /* drbd's page pool */
3066 spin_lock_init(&drbd_pp_lock);
3067
3068 for (i = 0; i < number; i++) {
3069 page = alloc_page(GFP_HIGHUSER);
3070 if (!page)
3071 goto Enomem;
3072 set_page_private(page, (unsigned long)drbd_pp_pool);
3073 drbd_pp_pool = page;
3074 }
3075 drbd_pp_vacant = number;
3076
3077 return 0;
3078
3079Enomem:
3080 drbd_destroy_mempools(); /* in case we allocated some */
3081 return -ENOMEM;
3082}
3083
3084static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3085 void *unused)
3086{
3087 /* just so we have it. you never know what interesting things we
3088 * might want to do here some day...
3089 */
3090
3091 return NOTIFY_DONE;
3092}
3093
3094static struct notifier_block drbd_notifier = {
3095 .notifier_call = drbd_notify_sys,
3096};
3097
3098static void drbd_release_ee_lists(struct drbd_conf *mdev)
3099{
3100 int rr;
3101
3102 rr = drbd_release_ee(mdev, &mdev->active_ee);
3103 if (rr)
3104 dev_err(DEV, "%d EEs in active list found!\n", rr);
3105
3106 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3107 if (rr)
3108 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3109
3110 rr = drbd_release_ee(mdev, &mdev->read_ee);
3111 if (rr)
3112 dev_err(DEV, "%d EEs in read list found!\n", rr);
3113
3114 rr = drbd_release_ee(mdev, &mdev->done_ee);
3115 if (rr)
3116 dev_err(DEV, "%d EEs in done list found!\n", rr);
3117
3118 rr = drbd_release_ee(mdev, &mdev->net_ee);
3119 if (rr)
3120 dev_err(DEV, "%d EEs in net list found!\n", rr);
3121}
3122
3123/* caution. no locking.
3124 * currently only used from module cleanup code. */
3125static void drbd_delete_device(unsigned int minor)
3126{
3127 struct drbd_conf *mdev = minor_to_mdev(minor);
3128
3129 if (!mdev)
3130 return;
3131
3132 /* paranoia asserts */
3133 if (mdev->open_cnt != 0)
3134 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3135 __FILE__ , __LINE__);
3136
3137 ERR_IF (!list_empty(&mdev->data.work.q)) {
3138 struct list_head *lp;
3139 list_for_each(lp, &mdev->data.work.q) {
3140 dev_err(DEV, "lp = %p\n", lp);
3141 }
3142 };
3143 /* end paranoia asserts */
3144
3145 del_gendisk(mdev->vdisk);
3146
3147 /* cleanup stuff that may have been allocated during
3148 * device (re-)configuration or state changes */
3149
3150 if (mdev->this_bdev)
3151 bdput(mdev->this_bdev);
3152
3153 drbd_free_resources(mdev);
3154
3155 drbd_release_ee_lists(mdev);
3156
3157 /* should be free'd on disconnect? */
3158 kfree(mdev->ee_hash);
3159 /*
3160 mdev->ee_hash_s = 0;
3161 mdev->ee_hash = NULL;
3162 */
3163
3164 lc_destroy(mdev->act_log);
3165 lc_destroy(mdev->resync);
3166
3167 kfree(mdev->p_uuid);
3168 /* mdev->p_uuid = NULL; */
3169
3170 kfree(mdev->int_dig_out);
3171 kfree(mdev->int_dig_in);
3172 kfree(mdev->int_dig_vv);
3173
3174 /* cleanup the rest that has been
3175 * allocated from drbd_new_device
3176 * and actually free the mdev itself */
3177 drbd_free_mdev(mdev);
3178}
3179
3180static void drbd_cleanup(void)
3181{
3182 unsigned int i;
3183
3184 unregister_reboot_notifier(&drbd_notifier);
3185
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003186 /* first remove proc,
3187 * drbdsetup uses its presence to detect
3188 * whether DRBD is loaded.
3189 * If we get stuck in proc removal,
3190 * but have netlink already deregistered,
3191 * some drbdsetup commands may wait forever
3192 * for an answer.
3193 */
3194 if (drbd_proc)
3195 remove_proc_entry("drbd", NULL);
3196
Philipp Reisnerb411b362009-09-25 16:07:19 -07003197 drbd_nl_cleanup();
3198
3199 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003200 i = minor_count;
3201 while (i--)
3202 drbd_delete_device(i);
3203 drbd_destroy_mempools();
3204 }
3205
3206 kfree(minor_table);
3207
3208 unregister_blkdev(DRBD_MAJOR, "drbd");
3209
3210 printk(KERN_INFO "drbd: module cleanup done.\n");
3211}
3212
3213/**
3214 * drbd_congested() - Callback for pdflush
3215 * @congested_data: User data
3216 * @bdi_bits: Bits pdflush is currently interested in
3217 *
3218 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3219 */
3220static int drbd_congested(void *congested_data, int bdi_bits)
3221{
3222 struct drbd_conf *mdev = congested_data;
3223 struct request_queue *q;
3224 char reason = '-';
3225 int r = 0;
3226
3227 if (!__inc_ap_bio_cond(mdev)) {
3228 /* DRBD has frozen IO */
3229 r = bdi_bits;
3230 reason = 'd';
3231 goto out;
3232 }
3233
3234 if (get_ldev(mdev)) {
3235 q = bdev_get_queue(mdev->ldev->backing_bdev);
3236 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3237 put_ldev(mdev);
3238 if (r)
3239 reason = 'b';
3240 }
3241
3242 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3243 r |= (1 << BDI_async_congested);
3244 reason = reason == 'b' ? 'a' : 'n';
3245 }
3246
3247out:
3248 mdev->congestion_reason = reason;
3249 return r;
3250}
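
/*
 * Illustrative only: the writeback code reaches drbd_congested() through the
 * queue's backing_dev_info (wired up in drbd_new_device() below), roughly as
 *
 *	if (bdi_congested(&mdev->rq_queue->backing_dev_info,
 *			  1 << BDI_async_congested))
 *		back_off_async_writeout_for_this_device();
 *
 * The returned mask is then tested against BDI_async_congested and/or
 * BDI_sync_congested by the caller; the helper shown above is made up.
 */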
3251
3252struct drbd_conf *drbd_new_device(unsigned int minor)
3253{
3254 struct drbd_conf *mdev;
3255 struct gendisk *disk;
3256 struct request_queue *q;
3257
3258 /* GFP_KERNEL, we are outside of all write-out paths */
3259 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3260 if (!mdev)
3261 return NULL;
3262 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3263 goto out_no_cpumask;
3264
3265 mdev->minor = minor;
3266
3267 drbd_init_set_defaults(mdev);
3268
3269 q = blk_alloc_queue(GFP_KERNEL);
3270 if (!q)
3271 goto out_no_q;
3272 mdev->rq_queue = q;
3273 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003274
3275 disk = alloc_disk(1);
3276 if (!disk)
3277 goto out_no_disk;
3278 mdev->vdisk = disk;
3279
3280 set_disk_ro(disk, TRUE);
3281
3282 disk->queue = q;
3283 disk->major = DRBD_MAJOR;
3284 disk->first_minor = minor;
3285 disk->fops = &drbd_ops;
3286 sprintf(disk->disk_name, "drbd%d", minor);
3287 disk->private_data = mdev;
3288
3289 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3290 /* we have no partitions. we contain only ourselves. */
3291 mdev->this_bdev->bd_contains = mdev->this_bdev;
3292
3293 q->backing_dev_info.congested_fn = drbd_congested;
3294 q->backing_dev_info.congested_data = mdev;
3295
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003296 blk_queue_make_request(q, drbd_make_request);
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003297 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003298 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3299 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003300 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003301
3302 mdev->md_io_page = alloc_page(GFP_KERNEL);
3303 if (!mdev->md_io_page)
3304 goto out_no_io_page;
3305
3306 if (drbd_bm_init(mdev))
3307 goto out_no_bitmap;
3308 /* no need to lock access, we are still initializing this minor device. */
3309 if (!tl_init(mdev))
3310 goto out_no_tl;
3311
3312 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3313 if (!mdev->app_reads_hash)
3314 goto out_no_app_reads;
3315
3316 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3317 if (!mdev->current_epoch)
3318 goto out_no_epoch;
3319
3320 INIT_LIST_HEAD(&mdev->current_epoch->list);
3321 mdev->epochs = 1;
3322
3323 return mdev;
3324
3325/* out_whatever_else:
3326 kfree(mdev->current_epoch); */
3327out_no_epoch:
3328 kfree(mdev->app_reads_hash);
3329out_no_app_reads:
3330 tl_cleanup(mdev);
3331out_no_tl:
3332 drbd_bm_cleanup(mdev);
3333out_no_bitmap:
3334 __free_page(mdev->md_io_page);
3335out_no_io_page:
3336 put_disk(disk);
3337out_no_disk:
3338 blk_cleanup_queue(q);
3339out_no_q:
3340 free_cpumask_var(mdev->cpu_mask);
3341out_no_cpumask:
3342 kfree(mdev);
3343 return NULL;
3344}
3345
3346/* counterpart of drbd_new_device.
3347 * last part of drbd_delete_device. */
3348void drbd_free_mdev(struct drbd_conf *mdev)
3349{
3350 kfree(mdev->current_epoch);
3351 kfree(mdev->app_reads_hash);
3352 tl_cleanup(mdev);
3353 if (mdev->bitmap) /* should no longer be there. */
3354 drbd_bm_cleanup(mdev);
3355 __free_page(mdev->md_io_page);
3356 put_disk(mdev->vdisk);
3357 blk_cleanup_queue(mdev->rq_queue);
3358 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003359 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003360 kfree(mdev);
3361}
3362
3363
3364int __init drbd_init(void)
3365{
3366 int err;
3367
3368 if (sizeof(struct p_handshake) != 80) {
3369 printk(KERN_ERR
3370 "drbd: never change the size or layout "
3371 "of the HandShake packet.\n");
3372 return -EINVAL;
3373 }
3374
3375 if (1 > minor_count || minor_count > 255) {
3376 printk(KERN_ERR
3377 "drbd: invalid minor_count (%d)\n", minor_count);
3378#ifdef MODULE
3379 return -EINVAL;
3380#else
3381 minor_count = 8;
3382#endif
3383 }
3384
3385 err = drbd_nl_init();
3386 if (err)
3387 return err;
3388
3389 err = register_blkdev(DRBD_MAJOR, "drbd");
3390 if (err) {
3391 printk(KERN_ERR
3392 "drbd: unable to register block device major %d\n",
3393 DRBD_MAJOR);
3394 return err;
3395 }
3396
3397 register_reboot_notifier(&drbd_notifier);
3398
3399 /*
3400 * allocate all necessary structs
3401 */
3402 err = -ENOMEM;
3403
3404 init_waitqueue_head(&drbd_pp_wait);
3405
3406 drbd_proc = NULL; /* play safe for drbd_cleanup */
3407 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3408 GFP_KERNEL);
3409 if (!minor_table)
3410 goto Enomem;
3411
3412 err = drbd_create_mempools();
3413 if (err)
3414 goto Enomem;
3415
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003416 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003417 if (!drbd_proc) {
3418 printk(KERN_ERR "drbd: unable to register proc file\n");
3419 goto Enomem;
3420 }
3421
3422 rwlock_init(&global_state_lock);
3423
3424 printk(KERN_INFO "drbd: initialized. "
3425 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3426 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3427 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3428 printk(KERN_INFO "drbd: registered as block device major %d\n",
3429 DRBD_MAJOR);
3430 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3431
3432 return 0; /* Success! */
3433
3434Enomem:
3435 drbd_cleanup();
3436 if (err == -ENOMEM)
3437 /* currently always the case */
3438 printk(KERN_ERR "drbd: ran out of memory\n");
3439 else
3440 printk(KERN_ERR "drbd: initialization failure\n");
3441 return err;
3442}
3443
3444void drbd_free_bc(struct drbd_backing_dev *ldev)
3445{
3446 if (ldev == NULL)
3447 return;
3448
Tejun Heoe525fd82010-11-13 11:55:17 +01003449 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3450 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003451
3452 kfree(ldev);
3453}
3454
3455void drbd_free_sock(struct drbd_conf *mdev)
3456{
3457 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003458 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003459 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3460 sock_release(mdev->data.socket);
3461 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003462 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003463 }
3464 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003465 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003466 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3467 sock_release(mdev->meta.socket);
3468 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003469 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003470 }
3471}
3472
3473
3474void drbd_free_resources(struct drbd_conf *mdev)
3475{
3476 crypto_free_hash(mdev->csums_tfm);
3477 mdev->csums_tfm = NULL;
3478 crypto_free_hash(mdev->verify_tfm);
3479 mdev->verify_tfm = NULL;
3480 crypto_free_hash(mdev->cram_hmac_tfm);
3481 mdev->cram_hmac_tfm = NULL;
3482 crypto_free_hash(mdev->integrity_w_tfm);
3483 mdev->integrity_w_tfm = NULL;
3484 crypto_free_hash(mdev->integrity_r_tfm);
3485 mdev->integrity_r_tfm = NULL;
3486
3487 drbd_free_sock(mdev);
3488
3489 __no_warn(local,
3490 drbd_free_bc(mdev->ldev);
3491 mdev->ldev = NULL;);
3492}
3493
3494/* meta data management */
3495
3496struct meta_data_on_disk {
3497 u64 la_size; /* last agreed size. */
3498 u64 uuid[UI_SIZE]; /* UUIDs. */
3499 u64 device_uuid;
3500 u64 reserved_u64_1;
3501 u32 flags; /* MDF */
3502 u32 magic;
3503 u32 md_size_sect;
3504 u32 al_offset; /* offset to this block */
3505 u32 al_nr_extents; /* important for restoring the AL */
3506 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3507 u32 bm_offset; /* offset to the bitmap, from here */
3508 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3509 u32 reserved_u32[4];
3510
3511} __packed;
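
/*
 * Layout sketch: all multi-byte fields are stored big-endian; drbd_md_sync()
 * converts with cpu_to_be32()/cpu_to_be64() and drbd_md_read() converts back.
 * Assuming UI_SIZE == 4 UUID slots, the packed struct covers
 *
 *	8 + 4*8 + 8 + 8 + 7*4 + 4*4 = 100 bytes
 *
 * of the 512 byte super block sector; the remainder is zeroed by the
 * memset() in drbd_md_sync().
 */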
3512
3513/**
3514 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3515 * @mdev: DRBD device.
3516 */
3517void drbd_md_sync(struct drbd_conf *mdev)
3518{
3519 struct meta_data_on_disk *buffer;
3520 sector_t sector;
3521 int i;
3522
Lars Ellenbergee15b032010-09-03 10:00:09 +02003523 del_timer(&mdev->md_sync_timer);
3524 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003525 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3526 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003527
3528	/* We use D_FAILED here and not D_ATTACHING because we try to write
3529 * metadata even if we detach due to a disk failure! */
3530 if (!get_ldev_if_state(mdev, D_FAILED))
3531 return;
3532
Philipp Reisnerb411b362009-09-25 16:07:19 -07003533 mutex_lock(&mdev->md_io_mutex);
3534 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3535 memset(buffer, 0, 512);
3536
3537 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3538 for (i = UI_CURRENT; i < UI_SIZE; i++)
3539 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3540 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3541 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3542
3543 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3544 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3545 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3546 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3547 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3548
3549 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3550
3551 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3552 sector = mdev->ldev->md.md_offset;
3553
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003554 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003555 /* this was a try anyways ... */
3556 dev_err(DEV, "meta data update failed!\n");
Philipp Reisnerb411b362009-09-25 16:07:19 -07003557 drbd_chk_io_error(mdev, 1, TRUE);
3558 }
3559
3560 /* Update mdev->ldev->md.la_size_sect,
3561 * since we updated it on metadata. */
3562 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3563
3564 mutex_unlock(&mdev->md_io_mutex);
3565 put_ldev(mdev);
3566}
3567
3568/**
3569 * drbd_md_read() - Reads in the meta data super block
3570 * @mdev: DRBD device.
3571 * @bdev: Device from which the meta data should be read in.
3572 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003573 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003574 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3575 */
3576int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3577{
3578 struct meta_data_on_disk *buffer;
3579 int i, rv = NO_ERROR;
3580
3581 if (!get_ldev_if_state(mdev, D_ATTACHING))
3582 return ERR_IO_MD_DISK;
3583
Philipp Reisnerb411b362009-09-25 16:07:19 -07003584 mutex_lock(&mdev->md_io_mutex);
3585 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3586
3587 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3588	/* NOTE: can't do normal error processing here as this is
3589	   called BEFORE the disk is attached */
3590 dev_err(DEV, "Error while reading metadata.\n");
3591 rv = ERR_IO_MD_DISK;
3592 goto err;
3593 }
3594
3595 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3596 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3597 rv = ERR_MD_INVALID;
3598 goto err;
3599 }
3600 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3601 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3602 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3603 rv = ERR_MD_INVALID;
3604 goto err;
3605 }
3606 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3607 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3608 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3609 rv = ERR_MD_INVALID;
3610 goto err;
3611 }
3612 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3613 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3614 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3615 rv = ERR_MD_INVALID;
3616 goto err;
3617 }
3618
3619 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3620 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3621 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3622 rv = ERR_MD_INVALID;
3623 goto err;
3624 }
3625
3626 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3627 for (i = UI_CURRENT; i < UI_SIZE; i++)
3628 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3629 bdev->md.flags = be32_to_cpu(buffer->flags);
3630 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3631 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3632
3633 if (mdev->sync_conf.al_extents < 7)
3634 mdev->sync_conf.al_extents = 127;
3635
3636 err:
3637 mutex_unlock(&mdev->md_io_mutex);
3638 put_ldev(mdev);
3639
3640 return rv;
3641}
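
/*
 * Illustrative caller sketch (not the actual attach code; "nbc" and the error
 * label are made up).  The return value is NO_ERROR, ERR_IO_MD_DISK or
 * ERR_MD_INVALID:
 *
 *	retcode = drbd_md_read(mdev, nbc);
 *	if (retcode != NO_ERROR)
 *		goto release_bdev_fail;
 */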
3642
Lars Ellenbergac724122010-10-07 15:18:08 +02003643static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3644{
3645 static char *uuid_str[UI_EXTENDED_SIZE] = {
3646 [UI_CURRENT] = "CURRENT",
3647 [UI_BITMAP] = "BITMAP",
3648 [UI_HISTORY_START] = "HISTORY_START",
3649 [UI_HISTORY_END] = "HISTORY_END",
3650 [UI_SIZE] = "SIZE",
3651 [UI_FLAGS] = "FLAGS",
3652 };
3653
3654 if (index >= UI_EXTENDED_SIZE) {
3655 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3656 return;
3657 }
3658
3659 dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3660 uuid_str[index],
3661 (unsigned long long)mdev->ldev->md.uuid[index]);
3662}
3663
3664
Philipp Reisnerb411b362009-09-25 16:07:19 -07003665/**
3666 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3667 * @mdev: DRBD device.
3668 *
3669 * Call this function if you change anything that should be written to
3670 * the meta-data super block. This function sets MD_DIRTY, and starts a
3671 * timer that ensures that drbd_md_sync() gets called within five seconds.
3672 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003673#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003674void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3675{
3676 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3677 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3678 mdev->last_md_mark_dirty.line = line;
3679 mdev->last_md_mark_dirty.func = func;
3680 }
3681}
3682#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003683void drbd_md_mark_dirty(struct drbd_conf *mdev)
3684{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003685 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003686 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003687}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003688#endif
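
/*
 * Typical flow (a sketch; see e.g. drbd_md_set_flag() and _drbd_uuid_set()
 * below for real users): change something that lives in the super block,
 * mark the meta data dirty, then either rely on the timer or flush explicitly:
 *
 *	drbd_md_mark_dirty(mdev);
 *	mdev->ldev->md.flags |= MDF_FULL_SYNC;
 *	...
 *	drbd_md_sync(mdev);	writes the super block iff MD_DIRTY is set
 */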
Philipp Reisnerb411b362009-09-25 16:07:19 -07003689
3690static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3691{
3692 int i;
3693
Lars Ellenbergac724122010-10-07 15:18:08 +02003694 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003695 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Lars Ellenbergac724122010-10-07 15:18:08 +02003696 debug_drbd_uuid(mdev, i+1);
3697 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003698}
3699
3700void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3701{
3702 if (idx == UI_CURRENT) {
3703 if (mdev->state.role == R_PRIMARY)
3704 val |= 1;
3705 else
3706 val &= ~((u64)1);
3707
3708 drbd_set_ed_uuid(mdev, val);
3709 }
3710
3711 mdev->ldev->md.uuid[idx] = val;
Lars Ellenbergac724122010-10-07 15:18:08 +02003712 debug_drbd_uuid(mdev, idx);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003713 drbd_md_mark_dirty(mdev);
3714}
3715
3716
3717void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3718{
3719 if (mdev->ldev->md.uuid[idx]) {
3720 drbd_uuid_move_history(mdev);
3721 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Lars Ellenbergac724122010-10-07 15:18:08 +02003722 debug_drbd_uuid(mdev, UI_HISTORY_START);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003723 }
3724 _drbd_uuid_set(mdev, idx, val);
3725}
3726
3727/**
3728 * drbd_uuid_new_current() - Creates a new current UUID
3729 * @mdev: DRBD device.
3730 *
3731 * Creates a new current UUID, and rotates the old current UUID into
3732 * the bitmap slot. Causes an incremental resync upon next connect.
3733 */
3734void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3735{
3736 u64 val;
3737
3738 dev_info(DEV, "Creating new current UUID\n");
3739 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3740 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Lars Ellenbergac724122010-10-07 15:18:08 +02003741 debug_drbd_uuid(mdev, UI_BITMAP);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003742
3743 get_random_bytes(&val, sizeof(u64));
3744 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003745 /* get it to stable storage _now_ */
3746 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003747}
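
/*
 * Effect on the UUID slots (illustrative):
 *
 *	before:  UI_CURRENT == C,                 UI_BITMAP == 0
 *	after:   UI_CURRENT == new random value,  UI_BITMAP == C
 *
 * The drbd_md_sync() above makes sure the new UUID reaches stable storage
 * right away.
 */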
3748
3749void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3750{
3751 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3752 return;
3753
3754 if (val == 0) {
3755 drbd_uuid_move_history(mdev);
3756 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3757 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Lars Ellenbergac724122010-10-07 15:18:08 +02003758 debug_drbd_uuid(mdev, UI_HISTORY_START);
3759 debug_drbd_uuid(mdev, UI_BITMAP);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003760 } else {
3761 if (mdev->ldev->md.uuid[UI_BITMAP])
3762 dev_warn(DEV, "bm UUID already set");
3763
3764 mdev->ldev->md.uuid[UI_BITMAP] = val;
3765 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3766
Lars Ellenbergac724122010-10-07 15:18:08 +02003767 debug_drbd_uuid(mdev, UI_BITMAP);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003768 }
3769 drbd_md_mark_dirty(mdev);
3770}
3771
3772/**
3773 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3774 * @mdev: DRBD device.
3775 *
3776 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3777 */
3778int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3779{
3780 int rv = -EIO;
3781
3782 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3783 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3784 drbd_md_sync(mdev);
3785 drbd_bm_set_all(mdev);
3786
3787 rv = drbd_bm_write(mdev);
3788
3789 if (!rv) {
3790 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3791 drbd_md_sync(mdev);
3792 }
3793
3794 put_ldev(mdev);
3795 }
3796
3797 return rv;
3798}
3799
3800/**
3801 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3802 * @mdev: DRBD device.
3803 *
3804 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3805 */
3806int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3807{
3808 int rv = -EIO;
3809
Philipp Reisner07782862010-08-31 12:00:50 +02003810 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003811 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3812 drbd_bm_clear_all(mdev);
3813 rv = drbd_bm_write(mdev);
3814 put_ldev(mdev);
3815 }
3816
3817 return rv;
3818}
3819
3820static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3821{
3822 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3823 int rv;
3824
3825 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3826
3827 drbd_bm_lock(mdev, work->why);
3828 rv = work->io_fn(mdev);
3829 drbd_bm_unlock(mdev);
3830
3831 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003832 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07003833 wake_up(&mdev->misc_wait);
3834
3835 if (work->done)
3836 work->done(mdev, rv);
3837
3838 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3839 work->why = NULL;
3840
3841 return 1;
3842}
3843
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003844void drbd_ldev_destroy(struct drbd_conf *mdev)
3845{
3846 lc_destroy(mdev->resync);
3847 mdev->resync = NULL;
3848 lc_destroy(mdev->act_log);
3849 mdev->act_log = NULL;
3850 __no_warn(local,
3851 drbd_free_bc(mdev->ldev);
3852 mdev->ldev = NULL;);
3853
3854 if (mdev->md_io_tmpp) {
3855 __free_page(mdev->md_io_tmpp);
3856 mdev->md_io_tmpp = NULL;
3857 }
3858 clear_bit(GO_DISKLESS, &mdev->flags);
3859}
3860
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003861static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3862{
3863 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02003864 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3865 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003866	 * the protected members anymore, though, so once local_cnt (dropped via
	3867	 * put_ldev) reaches zero again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003868 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003869 return 1;
3870}
3871
3872void drbd_go_diskless(struct drbd_conf *mdev)
3873{
3874 D_ASSERT(mdev->state.disk == D_FAILED);
3875 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02003876 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003877}
3878
Philipp Reisnerb411b362009-09-25 16:07:19 -07003879/**
3880 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3881 * @mdev: DRBD device.
3882 * @io_fn: IO callback to be called when bitmap IO is possible
3883 * @done: callback to be called after the bitmap IO was performed
3884 * @why: Descriptive text of the reason for doing the IO
3885 *
3886 * While IO on the bitmap happens we freeze application IO, thus ensuring
3887 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3888 * called from worker context. It MUST NOT be used while a previous such
3889 * work is still pending!
3890 */
3891void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3892 int (*io_fn)(struct drbd_conf *),
3893 void (*done)(struct drbd_conf *, int),
3894 char *why)
3895{
3896 D_ASSERT(current == mdev->worker.task);
3897
3898 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3899 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3900 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3901 if (mdev->bm_io_work.why)
3902 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3903 why, mdev->bm_io_work.why);
3904
3905 mdev->bm_io_work.io_fn = io_fn;
3906 mdev->bm_io_work.done = done;
3907 mdev->bm_io_work.why = why;
3908
Philipp Reisner22afd7e2010-11-16 15:30:44 +01003909 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003910 set_bit(BITMAP_IO, &mdev->flags);
3911 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01003912 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07003913 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003914 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01003915 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003916}
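
/*
 * Illustrative use from worker context (a sketch; "my_done_fn" is made up,
 * drbd_bmio_set_n_write() above already has the required io_fn signature,
 * and @done may be NULL):
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     my_done_fn, "illustration only");
 */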
3917
3918/**
3919 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3920 * @mdev: DRBD device.
3921 * @io_fn: IO callback to be called when bitmap IO is possible
3922 * @why: Descriptive text of the reason for doing the IO
3923 *
3924 * Freezes application IO while the actual IO operation runs. This
3925 * function MAY NOT be called from worker context.
3926 */
3927int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3928{
3929 int rv;
3930
3931 D_ASSERT(current != mdev->worker.task);
3932
3933 drbd_suspend_io(mdev);
3934
3935 drbd_bm_lock(mdev, why);
3936 rv = io_fn(mdev);
3937 drbd_bm_unlock(mdev);
3938
3939 drbd_resume_io(mdev);
3940
3941 return rv;
3942}
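
/*
 * Illustrative synchronous use from a non-worker context (a sketch):
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "illustration only");
 *
 * Application IO stays suspended for the duration of io_fn().
 */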
3943
3944void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3945{
3946 if ((mdev->ldev->md.flags & flag) != flag) {
3947 drbd_md_mark_dirty(mdev);
3948 mdev->ldev->md.flags |= flag;
3949 }
3950}
3951
3952void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3953{
3954 if ((mdev->ldev->md.flags & flag) != 0) {
3955 drbd_md_mark_dirty(mdev);
3956 mdev->ldev->md.flags &= ~flag;
3957 }
3958}
3959int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3960{
3961 return (bdev->md.flags & flag) != 0;
3962}
3963
3964static void md_sync_timer_fn(unsigned long data)
3965{
3966 struct drbd_conf *mdev = (struct drbd_conf *) data;
3967
3968 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3969}
3970
3971static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3972{
3973 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02003974#ifdef DEBUG
3975 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3976 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3977#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07003978 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003979 return 1;
3980}
3981
3982#ifdef CONFIG_DRBD_FAULT_INJECTION
3983/* Fault insertion support including random number generator shamelessly
3984 * stolen from kernel/rcutorture.c */
3985struct fault_random_state {
3986 unsigned long state;
3987 unsigned long count;
3988};
3989
3990#define FAULT_RANDOM_MULT 39916801 /* prime */
3991#define FAULT_RANDOM_ADD 479001701 /* prime */
3992#define FAULT_RANDOM_REFRESH 10000
3993
3994/*
3995 * Crude but fast random-number generator. Uses a linear congruential
3996 * generator, with occasional help from get_random_bytes().
3997 */
3998static unsigned long
3999_drbd_fault_random(struct fault_random_state *rsp)
4000{
4001 long refresh;
4002
Roel Kluin49829ea2009-12-15 22:55:44 +01004003 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004004 get_random_bytes(&refresh, sizeof(refresh));
4005 rsp->state += refresh;
4006 rsp->count = FAULT_RANDOM_REFRESH;
4007 }
4008 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4009 return swahw32(rsp->state);
4010}
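
/*
 * In other words: the state is advanced as state * 39916801 + 479001701
 * (modulo the unsigned long range) and the low 32 bits are returned with
 * their 16 bit halves swapped.  Ignoring the periodic get_random_bytes()
 * reseed, a state of 0 would advance to 479001701 == 0x1c8cfc65 and the
 * call would return 0xfc651c8c.
 */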
4011
4012static char *
4013_drbd_fault_str(unsigned int type) {
4014 static char *_faults[] = {
4015 [DRBD_FAULT_MD_WR] = "Meta-data write",
4016 [DRBD_FAULT_MD_RD] = "Meta-data read",
4017 [DRBD_FAULT_RS_WR] = "Resync write",
4018 [DRBD_FAULT_RS_RD] = "Resync read",
4019 [DRBD_FAULT_DT_WR] = "Data write",
4020 [DRBD_FAULT_DT_RD] = "Data read",
4021 [DRBD_FAULT_DT_RA] = "Data read ahead",
4022 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004023 [DRBD_FAULT_AL_EE] = "EE allocation",
4024 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004025 };
4026
4027 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4028}
4029
4030unsigned int
4031_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4032{
4033 static struct fault_random_state rrs = {0, 0};
4034
4035 unsigned int ret = (
4036 (fault_devs == 0 ||
4037 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4038 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4039
4040 if (ret) {
4041 fault_count++;
4042
Lars Ellenberg73835062010-05-27 11:51:56 +02004043 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004044 dev_warn(DEV, "***Simulating %s failure\n",
4045 _drbd_fault_str(type));
4046 }
4047
4048 return ret;
4049}
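
/*
 * Illustrative use (a sketch: the drbd_insert_fault() convenience wrapper
 * that additionally checks the fault_rate/enable_faults module parameters is
 * assumed to live in drbd_int.h, and the bio handling shown is simplified):
 *
 *	if (drbd_insert_fault(mdev, DRBD_FAULT_MD_WR))
 *		bio_endio(bio, -EIO);
 *	else
 *		submit_bio(rw, bio);
 */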
4050#endif
4051
4052const char *drbd_buildtag(void)
4053{
4054	/* When DRBD is built from external sources, this holds a reference
4055	   to the git hash of the source code. */
4056
4057 static char buildtag[38] = "\0uilt-in";
4058
4059 if (buildtag[0] == 0) {
4060#ifdef CONFIG_MODULES
4061 if (THIS_MODULE != NULL)
4062 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4063 else
4064#endif
4065 buildtag[0] = 'b';
4066 }
4067
4068 return buildtag;
4069}
4070
4071module_init(drbd_init)
4072module_exit(drbd_cleanup)
4073
Philipp Reisnerb411b362009-09-25 16:07:19 -07004074EXPORT_SYMBOL(drbd_conn_str);
4075EXPORT_SYMBOL(drbd_role_str);
4076EXPORT_SYMBOL(drbd_disk_str);
4077EXPORT_SYMBOL(drbd_set_st_err_str);