/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = 32;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;	/* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
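/* Rough sketch of the transfer log layout (illustration only):
 *
 *   mdev->oldest_tle -> epoch -> epoch -> ... -> epoch <- mdev->newest_tle
 *                          |        |               |
 *                      requests  requests       requests   (circular lists)
 *
 * tl_init() allocates the first epoch, _tl_add_barrier() appends new ones,
 * tl_release() retires the oldest one when its barrier ack arrives, and
 * tl_clear() tears the whole thing down on connection loss.
 */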
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_req = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_req = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_req != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
			barrier_nr, set_size, b->n_req);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b, *tmp;
	struct list_head *le, *tle;
	struct drbd_request *r;
	int new_initial_bnr = net_random();

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;
	while (b) {
		list_for_each_safe(le, tle, &b->requests) {
			r = list_entry(le, struct drbd_request, tl_requests);
			/* It would be nice to complete outside of spinlock.
			 * But this is easier for now. */
			_req_mod(r, connection_lost_while_pending);
		}
		tmp = b->next;

		/* there could still be requests on that ring list,
		 * in case local io is still pending */
		list_del(&b->requests);

		/* dec_ap_pending corresponding to queue_barrier.
		 * the newest barrier may not have been queued yet,
		 * in which case w.cb is still NULL. */
		if (b->w.cb != NULL)
			dec_ap_pending(mdev);

		if (b == mdev->newest_tle) {
			/* recycle, but reinit! */
			D_ASSERT(tmp == NULL);
			INIT_LIST_HEAD(&b->requests);
			INIT_LIST_HEAD(&b->w.list);
			b->w.cb = NULL;
			b->br_number = new_initial_bnr;
			b->n_req = 0;

			mdev->oldest_tle = b;
			break;
		}
		kfree(b);
		b = tmp;
	}

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

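/**
 * drbd_change_state() - Apply @mask/@val to the current state under req_lock
 * @mdev:	DRBD device.
 * @f:		state change flags, see enum chg_state_flags.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 *
 * Merges @val into the current state under @mask and hands the result to
 * _drbd_set_state(). Returns the SS_ status code.
 */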
int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		      union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	int rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *,
				     union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, int *warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

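/* Wait condition for drbd_req_state() during a cluster-wide state change:
 * nonzero once the peer acknowledged or rejected the request, once the
 * change turns out not to need cluster-wide agreement, or once it becomes
 * invalid locally; 0 means keep waiting. */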
static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
					      union drbd_state mask, union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	int rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = 0; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform a possibly cluster-wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static int drbd_req_state(struct drbd_conf *mdev,
			  union drbd_state mask, union drbd_state val,
			  enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	int rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
			union drbd_state val, enum chg_state_flags f)
{
	int rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    ns.susp ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev,
	union drbd_state os, union drbd_state ns, int err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str

#define drbd_susp_str(A)     ((A) ? "1" : "0")
#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
#define drbd_user_isp_str(A) ((A) ? "1" : "0")

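/* PSC(field): if the field differs between os and ns, append a
 * "field( old -> new ) " chunk to the log line being built in pbp,
 * using the drbd_<field>_str() helpers mapped above. */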
#define PSC(A) \
	({ if (ns.A != os.A) { \
		pbp += sprintf(pbp, #A "( %s -> %s ) ", \
			      drbd_##A##_str(os.A), \
			      drbd_##A##_str(ns.A)); \
	} })

/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	int rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static int is_valid_state_transition(struct drbd_conf *mdev,
				     union drbd_state ns, union drbd_state os)
{
	int rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	return rv;
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort:
 *
 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, int *warn_sync_abort)
{
	enum drbd_fencing_p fp;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow Network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
		ns.conn = os.conn;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
		ns.pdsk = D_UNKNOWN;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn_sync_abort)
			*warn_sync_abort = 1;
		ns.conn = C_CONNECTED;
	}

	if (ns.conn >= C_CONNECTED &&
	    ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
	     (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
		switch (ns.conn) {
		case C_WF_BITMAP_T:
		case C_PAUSED_SYNC_T:
			ns.disk = D_OUTDATED;
			break;
		case C_CONNECTED:
		case C_WF_BITMAP_S:
		case C_SYNC_SOURCE:
		case C_PAUSED_SYNC_S:
			ns.disk = D_UP_TO_DATE;
			break;
		case C_SYNC_TARGET:
			ns.disk = D_INCONSISTENT;
			dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
			break;
		}
		if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
			dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
	}

	if (ns.conn >= C_CONNECTED &&
	    (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
		switch (ns.conn) {
		case C_CONNECTED:
		case C_WF_BITMAP_T:
		case C_PAUSED_SYNC_T:
		case C_SYNC_TARGET:
			ns.pdsk = D_UP_TO_DATE;
			break;
		case C_WF_BITMAP_S:
		case C_PAUSED_SYNC_S:
			/* remap any consistent state to D_OUTDATED,
			 * but disallow "upgrade" of not even consistent states.
			 */
			ns.pdsk =
				(D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
				? os.pdsk : D_OUTDATED;
			break;
		case C_SYNC_SOURCE:
			ns.pdsk = D_INCONSISTENT;
			dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
			break;
		}
		if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
			dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp = 1;

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total)
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
		mdev->ov_position = mdev->ov_start_sector;
	}
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
int __drbd_set_state(struct drbd_conf *mdev,
		    union drbd_state ns, enum chg_state_flags flags,
		    struct completion *done)
{
	union drbd_state os;
	int rv = SS_SUCCESS;
	int warn_sync_abort = 0;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv) {
				dev_err(DEV, "Considering state change from bad state. "
				    "Error would be: '%s'\n",
				    drbd_set_st_err_str(rv));
				print_st(mdev, "old", os);
				print_st(mdev, "new", ns);
				rv = is_valid_state_transition(mdev, ns, os);
			}
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	if (warn_sync_abort)
		dev_warn(DEV, "Resync aborted.\n");

	{
	char *pbp, pb[300];
	pbp = pb;
	*pbp = 0;
	PSC(role);
	PSC(peer);
	PSC(conn);
	PSC(disk);
	PSC(pdsk);
	PSC(susp);
	PSC(aftr_isp);
	PSC(peer_isp);
	PSC(user_isp);
	dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	mdev->state.i = ns.i;
	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* post-state-change actions */
	if (os.conn >= C_SYNC_SOURCE   && ns.conn <= C_CONNECTED) {
		set_bit(STOP_SYNC_TIMER, &mdev->flags);
		mod_timer(&mdev->resync_timer, jiffies);
	}

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
		if (ns.conn == C_SYNC_TARGET) {
			if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
				mod_timer(&mdev->resync_timer, jiffies);
			/* This if (!test_bit) is only needed for the case
			   that a device that has ceased to use its timer,
			   i.e. is already in drbd_resync_finished(), gets
			   paused and resumed. */
		}
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time = jiffies;
		if (ns.conn == C_PAUSED_SYNC_T)
			set_bit(STOP_SYNC_TIMER, &mdev->flags);
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		mdev->ov_position = 0;
		mdev->rs_total =
		mdev->rs_mark_left = drbd_bm_bits(mdev);
		if (mdev->agreed_pro_version >= 90)
			set_ov_position(mdev, ns.conn);
		else
			mdev->ov_start_sector = 0;
		mdev->ov_left = mdev->rs_total
			- BM_SECT_TO_BIT(mdev->ov_position);
		mdev->rs_start     =
		mdev->rs_mark_time = jiffies;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

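/* Completion callback for the "write out the whole bitmap" I/O queued from
 * after_state_ch() when entering StartingSyncS/T: on success, proceed with
 * the next step of the resync handshake; on error, fall back to Connected. */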
static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	if (fp == FP_STONITH && ns.susp) {
		/* case1: The outdate peer handler is successful:
		 * case2: The connection was established again: */
		if ((os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) ||
		    (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
			tl_clear(mdev);
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
	}
	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		kfree(mdev->p_uuid);
		mdev->p_uuid = NULL;
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				drbd_uuid_new_current(mdev);
				drbd_send_uuids(mdev);
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
			drbd_uuid_new_current(mdev);

		/* D_DISKLESS Peer becomes secondary */
		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
			drbd_al_to_on_disk_bm(mdev);
		put_ldev(mdev);
	}

	/* Last part of the attaching process ... */
	if (ns.conn >= C_CONNECTED &&
	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
		kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
		mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}

	/* We want to pause/continue resync, tell peer. */
	if (ns.conn >= C_CONNECTED &&
	     ((os.aftr_isp != ns.aftr_isp) ||
	      (os.user_isp != ns.user_isp)))
		drbd_send_state(mdev);

	/* In case one of the isp bits got set, suspend other devices. */
	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
		suspend_other_sg(mdev);

	/* Make sure the peer gets informed about eventual state
	   changes (ISP bits) while we were in WFReportParams. */
	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
		drbd_send_state(mdev);

	/* We are in the progress to start a full sync... */
	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");

	/* We are invalidating our self... */
	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");

	if (os.disk > D_FAILED && ns.disk == D_FAILED) {
		enum drbd_io_error_p eh;

		eh = EP_PASS_ON;
		if (get_ldev_if_state(mdev, D_FAILED)) {
			eh = mdev->ldev->dc.on_io_error;
			put_ldev(mdev);
		}

		drbd_rs_cancel_all(mdev);
		/* since get_ldev() only works as long as disk>=D_INCONSISTENT,
		   and it is D_DISKLESS here, local_cnt can only go down, it can
		   not increase... It will reach zero */
		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
		mdev->rs_total = 0;
		mdev->rs_failed = 0;
		atomic_set(&mdev->rs_pending_cnt, 0);

		spin_lock_irq(&mdev->req_lock);
		_drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
		spin_unlock_irq(&mdev->req_lock);

		if (eh == EP_CALL_HELPER)
			drbd_khelper(mdev, "local-io-error");
	}

	if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {

		if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
			if (drbd_send_state(mdev))
				dev_warn(DEV, "Notified peer that my disk is broken.\n");
			else
				dev_err(DEV, "Sending state in drbd_io_error() failed\n");
		}

		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
		lc_destroy(mdev->resync);
		mdev->resync = NULL;
		lc_destroy(mdev->act_log);
		mdev->act_log = NULL;
		__no_warn(local,
			drbd_free_bc(mdev->ldev);
			mdev->ldev = NULL;);

		if (mdev->md_io_tmpp)
			__free_page(mdev->md_io_tmpp);
	}

	/* Disks got bigger while they were detached */
	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
		if (ns.conn == C_CONNECTED)
			resync_after_online_grow(mdev);
	}

	/* A resync finished or aborted, wake paused devices... */
	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
	    (os.peer_isp && !ns.peer_isp) ||
	    (os.user_isp && !ns.user_isp))
		resume_next_sg(mdev);

	/* Upon network connection, we need to start the receiver */
	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
		drbd_thread_start(&mdev->receiver);

	/* Terminate worker thread if we are unconfigured - it will be
	   restarted as needed... */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY) {
		if (os.aftr_isp != ns.aftr_isp)
			resume_next_sg(mdev);
		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
		if (test_bit(DEVICE_DYING, &mdev->flags))
			drbd_thread_stop_nowait(&mdev->worker);
	}

	drbd_md_sync(mdev);
}

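/* Common kthread entry point for the receiver, worker and asender threads:
 * runs thi->function() in a loop so a thread flagged as Restarting is
 * re-entered without creating a new task, then signals completion and drops
 * the module reference taken in drbd_thread_start(). */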
static int drbd_thread_setup(void *arg)
{
	struct drbd_thread *thi = (struct drbd_thread *) arg;
	struct drbd_conf *mdev = thi->mdev;
	unsigned long flags;
	int retval;

restart:
	retval = thi->function(thi);

	spin_lock_irqsave(&thi->t_lock, flags);

	/* if the receiver has been "Exiting", the last thing it did
	 * was set the conn state to "StandAlone",
	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
	 * and receiver thread will be "started".
	 * drbd_thread_start needs to set "Restarting" in that case.
	 * t_state check and assignment needs to be within the same spinlock,
	 * so either thread_start sees Exiting, and can remap to Restarting,
	 * or thread_start see None, and can proceed as normal.
	 */

	if (thi->t_state == Restarting) {
		dev_info(DEV, "Restarting %s\n", current->comm);
		thi->t_state = Running;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		goto restart;
	}

	thi->task = NULL;
	thi->t_state = None;
	smp_mb();
	complete(&thi->stop);
	spin_unlock_irqrestore(&thi->t_lock, flags);

	dev_info(DEV, "Terminating %s\n", current->comm);

	/* Release mod reference taken when thread was started */
	module_put(THIS_MODULE);
	return retval;
}

static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
		      int (*func) (struct drbd_thread *))
{
	spin_lock_init(&thi->t_lock);
	thi->task    = NULL;
	thi->t_state = None;
	thi->function = func;
	thi->mdev = mdev;
}

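/* Start (or restart) one of the per-device threads. Takes a module reference
 * and spawns a "drbd<minor>_<name>" kthread when the thread is currently
 * stopped; a thread that is just Exiting is flagged as Restarting instead of
 * being respawned. Returns TRUE on success, FALSE otherwise. */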
int drbd_thread_start(struct drbd_thread *thi)
{
	struct drbd_conf *mdev = thi->mdev;
	struct task_struct *nt;
	unsigned long flags;

	const char *me =
		thi == &mdev->receiver ? "receiver" :
		thi == &mdev->asender  ? "asender"  :
		thi == &mdev->worker   ? "worker"   : "NONSENSE";

	/* is used from state engine doing drbd_thread_stop_nowait,
	 * while holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	switch (thi->t_state) {
	case None:
		dev_info(DEV, "Starting %s thread (from %s [%d])\n",
				me, current->comm, current->pid);

		/* Get ref on module for thread - this is released when thread exits */
		if (!try_module_get(THIS_MODULE)) {
			dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return FALSE;
		}

		init_completion(&thi->stop);
		D_ASSERT(thi->task == NULL);
		thi->reset_cpu_mask = 1;
		thi->t_state = Running;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */

		nt = kthread_create(drbd_thread_setup, (void *) thi,
				    "drbd%d_%s", mdev_to_minor(mdev), me);

		if (IS_ERR(nt)) {
			dev_err(DEV, "Couldn't start thread\n");

			module_put(THIS_MODULE);
			return FALSE;
		}
		spin_lock_irqsave(&thi->t_lock, flags);
		thi->task = nt;
		thi->t_state = Running;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		wake_up_process(nt);
		break;
	case Exiting:
		thi->t_state = Restarting;
		dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
				me, current->comm, current->pid);
		/* fall through */
	case Running:
	case Restarting:
	default:
		spin_unlock_irqrestore(&thi->t_lock, flags);
		break;
	}

	return TRUE;
}


void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
{
	unsigned long flags;

	enum drbd_thread_state ns = restart ? Restarting : Exiting;

	/* may be called from state engine, holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	if (thi->t_state == None) {
		spin_unlock_irqrestore(&thi->t_lock, flags);
		if (restart)
			drbd_thread_start(thi);
		return;
	}

	if (thi->t_state != ns) {
		if (thi->task == NULL) {
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return;
		}

		thi->t_state = ns;
		smp_mb();
		init_completion(&thi->stop);
		if (thi->task != current)
			force_sig(DRBD_SIGKILL, thi->task);

	}

	spin_unlock_irqrestore(&thi->t_lock, flags);

	if (wait)
		wait_for_completion(&thi->stop);
}

#ifdef CONFIG_SMP
/**
 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
 * @mdev:	DRBD device.
 *
 * Forces all threads of a device onto the same CPU. This is beneficial for
 * DRBD's performance. May be overwritten by user's configuration.
 */
void drbd_calc_cpu_mask(struct drbd_conf *mdev)
{
	int ord, cpu;

	/* user override. */
	if (cpumask_weight(mdev->cpu_mask))
		return;

	ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
	for_each_online_cpu(cpu) {
		if (ord-- == 0) {
			cpumask_set_cpu(cpu, mdev->cpu_mask);
			return;
		}
	}
	/* should not be reached */
	cpumask_setall(mdev->cpu_mask);
}

/**
 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
 * @mdev:	DRBD device.
 *
 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
 * prematurely.
 */
void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
{
	struct task_struct *p = current;
	struct drbd_thread *thi =
		p == mdev->asender.task  ? &mdev->asender  :
		p == mdev->receiver.task ? &mdev->receiver :
		p == mdev->worker.task   ? &mdev->worker   :
		NULL;
	ERR_IF(thi == NULL)
		return;
	if (!thi->reset_cpu_mask)
		return;
	thi->reset_cpu_mask = 0;
	set_cpus_allowed_ptr(p, mdev->cpu_mask);
}
#endif

/* the appropriate socket mutex must be held already */
int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
		  enum drbd_packets cmd, struct p_header *h,
		  size_t size, unsigned msg_flags)
{
	int sent, ok;

	ERR_IF(!h) return FALSE;
	ERR_IF(!size) return FALSE;

	h->magic   = BE_DRBD_MAGIC;
	h->command = cpu_to_be16(cmd);
	h->length  = cpu_to_be16(size-sizeof(struct p_header));

	sent = drbd_send(mdev, sock, h, size, msg_flags);

	ok = (sent == size);
	if (!ok)
		dev_err(DEV, "short sent %s size=%d sent=%d\n",
		    cmdname(cmd), (int)size, sent);
	return ok;
}

/* don't pass the socket. we may only look at it
 * when we hold the appropriate socket mutex.
 */
int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
		  enum drbd_packets cmd, struct p_header *h, size_t size)
{
	int ok = 0;
	struct socket *sock;

	if (use_data_socket) {
		mutex_lock(&mdev->data.mutex);
		sock = mdev->data.socket;
	} else {
		mutex_lock(&mdev->meta.mutex);
		sock = mdev->meta.socket;
	}

	/* drbd_disconnect() could have called drbd_free_sock()
	 * while we were waiting in down()... */
	if (likely(sock != NULL))
		ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);

	if (use_data_socket)
		mutex_unlock(&mdev->data.mutex);
	else
		mutex_unlock(&mdev->meta.mutex);
	return ok;
}

int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
		   size_t size)
{
	struct p_header h;
	int ok;

	h.magic   = BE_DRBD_MAGIC;
	h.command = cpu_to_be16(cmd);
	h.length  = cpu_to_be16(size);

	if (!drbd_get_data_sock(mdev))
		return 0;

	ok = (sizeof(h) ==
		drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
	ok = ok && (size ==
		drbd_send(mdev, mdev->data.socket, data, size, 0));

	drbd_put_data_sock(mdev);

	return ok;
}

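/* Send the syncer configuration (resync rate, and for agreed protocol
 * versions >= 88/89 the verify-alg/csums-alg strings) to the peer, reusing
 * the pre-allocated send buffer of the data socket. Returns 1 on success,
 * 0 otherwise. */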
int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
{
	struct p_rs_param_89 *p;
	struct socket *sock;
	int size, rv;
	const int apv = mdev->agreed_pro_version;

	size = apv <= 87 ? sizeof(struct p_rs_param)
		: apv == 88 ? sizeof(struct p_rs_param)
			+ strlen(mdev->sync_conf.verify_alg) + 1
		: /* 89 */ sizeof(struct p_rs_param_89);

	/* used from admin command context and receiver/worker context.
	 * to avoid kmalloc, grab the socket right here,
	 * then use the pre-allocated sbuf there */
	mutex_lock(&mdev->data.mutex);
	sock = mdev->data.socket;

	if (likely(sock != NULL)) {
		enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;

		p = &mdev->data.sbuf.rs_param_89;

		/* initialize verify_alg and csums_alg */
		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);

		p->rate = cpu_to_be32(sc->rate);

		if (apv >= 88)
			strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
		if (apv >= 89)
			strcpy(p->csums_alg, mdev->sync_conf.csums_alg);

		rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
	} else
		rv = 0; /* not ok */

	mutex_unlock(&mdev->data.mutex);

	return rv;
}

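/* Send our replication protocol settings (wire protocol, after-split-brain
 * policies, two-primaries, connection flags and, for protocol >= 87, the
 * integrity-alg string) to the peer during the handshake. */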
int drbd_send_protocol(struct drbd_conf *mdev)
{
	struct p_protocol *p;
	int size, cf, rv;

	size = sizeof(struct p_protocol);

	if (mdev->agreed_pro_version >= 87)
		size += strlen(mdev->net_conf->integrity_alg) + 1;

	/* we must not recurse into our own queue,
	 * as that is blocked during handshake */
	p = kmalloc(size, GFP_NOIO);
	if (p == NULL)
		return 0;

	p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
	p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
	p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
	p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
	p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);

	cf = 0;
	if (mdev->net_conf->want_lose)
		cf |= CF_WANT_LOSE;
	if (mdev->net_conf->dry_run) {
		if (mdev->agreed_pro_version >= 92)
			cf |= CF_DRY_RUN;
		else {
			dev_err(DEV, "--dry-run is not supported by peer");
			kfree(p);
			return 0;
		}
	}
	p->conn_flags    = cpu_to_be32(cf);

	if (mdev->agreed_pro_version >= 87)
		strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);

	rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
			   (struct p_header *)p, size);
	kfree(p);
	return rv;
}

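/* Send our current UUID set, the number of bits set in the bitmap, and a
 * uuid_flags word to the peer. drbd_send_uuids() and
 * drbd_send_uuids_skip_initial_sync() below are thin wrappers that only
 * differ in the flags they pass. */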
int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
{
	struct p_uuids p;
	int i;

	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
		return 1;

	for (i = UI_CURRENT; i < UI_SIZE; i++)
		p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;

	mdev->comm_bm_set = drbd_bm_total_weight(mdev);
	p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
	uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
	uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
	p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);

	put_ldev(mdev);

	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
			     (struct p_header *)&p, sizeof(p));
}

int drbd_send_uuids(struct drbd_conf *mdev)
{
	return _drbd_send_uuids(mdev, 0);
}

int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
{
	return _drbd_send_uuids(mdev, 8);
}


int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
{
	struct p_rs_uuid p;

	p.uuid = cpu_to_be64(val);

	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
			     (struct p_header *)&p, sizeof(p));
}

int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
{
	struct p_sizes p;
	sector_t d_size, u_size;
	int q_order_type;
	int ok;

	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
		D_ASSERT(mdev->ldev->backing_bdev);
		d_size = drbd_get_max_capacity(mdev->ldev);
		u_size = mdev->ldev->dc.disk_size;
		q_order_type = drbd_queue_order_type(mdev);
		put_ldev(mdev);
	} else {
		d_size = 0;
		u_size = 0;
		q_order_type = QUEUE_ORDERED_NONE;
	}

	p.d_size = cpu_to_be64(d_size);
	p.u_size = cpu_to_be64(u_size);
	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
	p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
	p.queue_order_type = cpu_to_be16(q_order_type);
	p.dds_flags = cpu_to_be16(flags);

	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
			   (struct p_header *)&p, sizeof(p));
	return ok;
}

/**
 * drbd_send_state() - Sends the drbd state to the peer
 * @mdev:	DRBD device.
 */
int drbd_send_state(struct drbd_conf *mdev)
{
	struct socket *sock;
	struct p_state p;
	int ok = 0;

1807	/* Grab state lock so we won't send state if we're in the middle
1808 * of a cluster wide state change on another thread */
1809 drbd_state_lock(mdev);
1810
1811 mutex_lock(&mdev->data.mutex);
1812
1813 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1814 sock = mdev->data.socket;
1815
1816 if (likely(sock != NULL)) {
1817 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1818 (struct p_header *)&p, sizeof(p), 0);
1819 }
1820
1821 mutex_unlock(&mdev->data.mutex);
1822
1823 drbd_state_unlock(mdev);
1824 return ok;
1825}
1826
1827int drbd_send_state_req(struct drbd_conf *mdev,
1828 union drbd_state mask, union drbd_state val)
1829{
1830 struct p_req_state p;
1831
1832 p.mask = cpu_to_be32(mask.i);
1833 p.val = cpu_to_be32(val.i);
1834
1835 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1836 (struct p_header *)&p, sizeof(p));
1837}
1838
1839int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1840{
1841 struct p_req_state_reply p;
1842
1843 p.retcode = cpu_to_be32(retcode);
1844
1845 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1846 (struct p_header *)&p, sizeof(p));
1847}
1848
1849int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1850 struct p_compressed_bm *p,
1851 struct bm_xfer_ctx *c)
1852{
1853 struct bitstream bs;
1854 unsigned long plain_bits;
1855 unsigned long tmp;
1856 unsigned long rl;
1857 unsigned len;
1858 unsigned toggle;
1859 int bits;
1860
1861 /* may we use this feature? */
1862 if ((mdev->sync_conf.use_rle == 0) ||
1863 (mdev->agreed_pro_version < 90))
1864 return 0;
1865
1866 if (c->bit_offset >= c->bm_bits)
1867 return 0; /* nothing to do. */
1868
1869	/* use at most this many bytes */
1870 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1871 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1872 /* plain bits covered in this code string */
1873 plain_bits = 0;
1874
1875 /* p->encoding & 0x80 stores whether the first run length is set.
1876 * bit offset is implicit.
1877 * start with toggle == 2 to be able to tell the first iteration */
1878 toggle = 2;
1879
1880	/* see how many plain bits we can stuff into one packet
1881 * using RLE and VLI. */
1882 do {
1883 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1884 : _drbd_bm_find_next(mdev, c->bit_offset);
1885 if (tmp == -1UL)
1886 tmp = c->bm_bits;
1887 rl = tmp - c->bit_offset;
1888
1889 if (toggle == 2) { /* first iteration */
1890 if (rl == 0) {
1891 /* the first checked bit was set,
1892 * store start value, */
1893 DCBP_set_start(p, 1);
1894 /* but skip encoding of zero run length */
1895 toggle = !toggle;
1896 continue;
1897 }
1898 DCBP_set_start(p, 0);
1899 }
1900
1901 /* paranoia: catch zero runlength.
1902 * can only happen if bitmap is modified while we scan it. */
1903 if (rl == 0) {
1904 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1905 "t:%u bo:%lu\n", toggle, c->bit_offset);
1906 return -1;
1907 }
1908
1909 bits = vli_encode_bits(&bs, rl);
1910 if (bits == -ENOBUFS) /* buffer full */
1911 break;
1912 if (bits <= 0) {
1913 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1914 return 0;
1915 }
1916
1917 toggle = !toggle;
1918 plain_bits += rl;
1919 c->bit_offset = tmp;
1920 } while (c->bit_offset < c->bm_bits);
1921
1922 len = bs.cur.b - p->code + !!bs.cur.bit;
1923
1924 if (plain_bits < (len << 3)) {
1925 /* incompressible with this method.
1926 * we need to rewind both word and bit position. */
1927 c->bit_offset -= plain_bits;
1928 bm_xfer_ctx_bit_to_word_offset(c);
1929 c->bit_offset = c->word_offset * BITS_PER_LONG;
1930 return 0;
1931 }
1932
1933 /* RLE + VLI was able to compress it just fine.
1934 * update c->word_offset. */
1935 bm_xfer_ctx_bit_to_word_offset(c);
1936
1937 /* store pad_bits */
1938 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1939
1940 return len;
1941}
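/*
 * Worked example for the compressibility check above (numbers illustrative):
 * if len = 40 code bytes describe runs covering plain_bits = 100000 bits,
 * then plain_bits >= len << 3 (320) and the RLE+VLI encoding is kept.  If
 * the bitmap toggles almost every bit, the variable length codes can exceed
 * one byte per eight plain bits; plain_bits < len << 3 then rewinds the
 * offsets and the caller falls back to sending plain bitmap words.
 */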
1942
1943enum { OK, FAILED, DONE }
1944send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1945 struct p_header *h, struct bm_xfer_ctx *c)
1946{
1947 struct p_compressed_bm *p = (void*)h;
1948 unsigned long num_words;
1949 int len;
1950 int ok;
1951
1952 len = fill_bitmap_rle_bits(mdev, p, c);
1953
1954 if (len < 0)
1955 return FAILED;
1956
1957 if (len) {
1958 DCBP_set_code(p, RLE_VLI_Bits);
1959 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1960 sizeof(*p) + len, 0);
1961
1962 c->packets[0]++;
1963 c->bytes[0] += sizeof(*p) + len;
1964
1965 if (c->bit_offset >= c->bm_bits)
1966 len = 0; /* DONE */
1967 } else {
1968 /* was not compressible.
1969 * send a buffer full of plain text bits instead. */
1970 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1971 len = num_words * sizeof(long);
1972 if (len)
1973 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1974 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1975 h, sizeof(struct p_header) + len, 0);
1976 c->word_offset += num_words;
1977 c->bit_offset = c->word_offset * BITS_PER_LONG;
1978
1979 c->packets[1]++;
1980 c->bytes[1] += sizeof(struct p_header) + len;
1981
1982 if (c->bit_offset > c->bm_bits)
1983 c->bit_offset = c->bm_bits;
1984 }
1985 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1986
1987 if (ok == DONE)
1988 INFO_bm_xfer_stats(mdev, "send", c);
1989 return ok;
1990}
1991
1992/* See the comment at receive_bitmap() */
1993int _drbd_send_bitmap(struct drbd_conf *mdev)
1994{
1995 struct bm_xfer_ctx c;
1996 struct p_header *p;
1997 int ret;
1998
1999 ERR_IF(!mdev->bitmap) return FALSE;
2000
2001 /* maybe we should use some per thread scratch page,
2002 * and allocate that during initial device creation? */
2003 p = (struct p_header *) __get_free_page(GFP_NOIO);
2004 if (!p) {
2005 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2006 return FALSE;
2007 }
2008
2009 if (get_ldev(mdev)) {
2010 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2011 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2012 drbd_bm_set_all(mdev);
2013 if (drbd_bm_write(mdev)) {
2014				/* write_bm failed! Leave the full sync flag set in the
2015				 * meta data, but otherwise proceed as normal - we still
2016				 * need to tell the other side that a full resync is required! */
2017 dev_err(DEV, "Failed to write bitmap to disk!\n");
2018 } else {
2019 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2020 drbd_md_sync(mdev);
2021 }
2022 }
2023 put_ldev(mdev);
2024 }
2025
2026 c = (struct bm_xfer_ctx) {
2027 .bm_bits = drbd_bm_bits(mdev),
2028 .bm_words = drbd_bm_words(mdev),
2029 };
2030
2031 do {
2032 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2033 } while (ret == OK);
2034
2035 free_page((unsigned long) p);
2036 return (ret == DONE);
2037}
2038
2039int drbd_send_bitmap(struct drbd_conf *mdev)
2040{
2041 int err;
2042
2043 if (!drbd_get_data_sock(mdev))
2044 return -1;
2045 err = !_drbd_send_bitmap(mdev);
2046 drbd_put_data_sock(mdev);
2047 return err;
2048}
2049
2050int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2051{
2052 int ok;
2053 struct p_barrier_ack p;
2054
2055 p.barrier = barrier_nr;
2056 p.set_size = cpu_to_be32(set_size);
2057
2058 if (mdev->state.conn < C_CONNECTED)
2059 return FALSE;
2060 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2061 (struct p_header *)&p, sizeof(p));
2062 return ok;
2063}
2064
2065/**
2066 * _drbd_send_ack() - Sends an ack packet
2067 * @mdev: DRBD device.
2068 * @cmd: Packet command code.
2069 * @sector: sector, needs to be in big endian byte order
2070 * @blksize: size in byte, needs to be in big endian byte order
2071 * @block_id: Id, big endian byte order
2072 */
2073static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2074 u64 sector,
2075 u32 blksize,
2076 u64 block_id)
2077{
2078 int ok;
2079 struct p_block_ack p;
2080
2081 p.sector = sector;
2082 p.block_id = block_id;
2083 p.blksize = blksize;
2084 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2085
2086 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2087 return FALSE;
2088 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2089 (struct p_header *)&p, sizeof(p));
2090 return ok;
2091}
2092
2093int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2094 struct p_data *dp)
2095{
2096 const int header_size = sizeof(struct p_data)
2097 - sizeof(struct p_header);
2098 int data_size = ((struct p_header *)dp)->length - header_size;
2099
2100 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2101 dp->block_id);
2102}
2103
2104int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2105 struct p_block_req *rp)
2106{
2107 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2108}
2109
2110/**
2111 * drbd_send_ack() - Sends an ack packet
2112 * @mdev: DRBD device.
2113 * @cmd: Packet command code.
2114 * @e: Epoch entry.
2115 */
2116int drbd_send_ack(struct drbd_conf *mdev,
2117 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2118{
2119 return _drbd_send_ack(mdev, cmd,
2120 cpu_to_be64(e->sector),
2121 cpu_to_be32(e->size),
2122 e->block_id);
2123}
2124
2125/* This function misuses the block_id field to signal if the blocks
2126 * are in sync or not. */
2127int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2128 sector_t sector, int blksize, u64 block_id)
2129{
2130 return _drbd_send_ack(mdev, cmd,
2131 cpu_to_be64(sector),
2132 cpu_to_be32(blksize),
2133 cpu_to_be64(block_id));
2134}
2135
2136int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2137 sector_t sector, int size, u64 block_id)
2138{
2139 int ok;
2140 struct p_block_req p;
2141
2142 p.sector = cpu_to_be64(sector);
2143 p.block_id = block_id;
2144 p.blksize = cpu_to_be32(size);
2145
2146 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2147 (struct p_header *)&p, sizeof(p));
2148 return ok;
2149}
2150
2151int drbd_send_drequest_csum(struct drbd_conf *mdev,
2152 sector_t sector, int size,
2153 void *digest, int digest_size,
2154 enum drbd_packets cmd)
2155{
2156 int ok;
2157 struct p_block_req p;
2158
2159 p.sector = cpu_to_be64(sector);
2160 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2161 p.blksize = cpu_to_be32(size);
2162
2163 p.head.magic = BE_DRBD_MAGIC;
2164 p.head.command = cpu_to_be16(cmd);
2165 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2166
2167 mutex_lock(&mdev->data.mutex);
2168
2169 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2170 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2171
2172 mutex_unlock(&mdev->data.mutex);
2173
2174 return ok;
2175}
2176
2177int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2178{
2179 int ok;
2180 struct p_block_req p;
2181
2182 p.sector = cpu_to_be64(sector);
2183 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2184 p.blksize = cpu_to_be32(size);
2185
2186 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2187 (struct p_header *)&p, sizeof(p));
2188 return ok;
2189}
2190
2191/* called on sndtimeo
2192 * returns FALSE if we should retry,
2193 * TRUE if we think connection is dead
2194 */
2195static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2196{
2197 int drop_it;
2198 /* long elapsed = (long)(jiffies - mdev->last_received); */
2199
2200 drop_it = mdev->meta.socket == sock
2201 || !mdev->asender.task
2202 || get_t_state(&mdev->asender) != Running
2203 || mdev->state.conn < C_CONNECTED;
2204
2205 if (drop_it)
2206 return TRUE;
2207
2208 drop_it = !--mdev->ko_count;
2209 if (!drop_it) {
2210 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2211 current->comm, current->pid, mdev->ko_count);
2212 request_ping(mdev);
2213 }
2214
2215 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2216}
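/*
 * Rough timing sketch (configuration names as in drbd.conf, values purely
 * illustrative): ko_count is decremented once per send timeout expiry while
 * no progress is made, so with ko-count = 7 and a send timeout of T seconds
 * a completely stalled peer is declared dead after roughly 7 * T seconds;
 * until then each expiry only requests a ping on the meta socket.
 */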
2217
2218/* The idea of sendpage seems to be to put some kind of reference
2219 * to the page into the skb, and to hand it over to the NIC. In
2220 * this process get_page() gets called.
2221 *
2222 * As soon as the page was really sent over the network put_page()
2223 * gets called by some part of the network layer. [ NIC driver? ]
2224 *
2225 * [ get_page() / put_page() increment/decrement the count. If count
2226 * reaches 0 the page will be freed. ]
2227 *
2228 * This works nicely with pages from FSs.
2229 * But this means that in protocol A we might signal IO completion too early!
2230 *
2231 * In order not to corrupt data during a resync we must make sure
2232 * that we do not reuse our own buffer pages (EEs) too early, therefore
2233 * we have the net_ee list.
2234 *
2235 * XFS seems to have problems, still, it submits pages with page_count == 0!
2236 * As a workaround, we disable sendpage on pages
2237 * with page_count == 0 or PageSlab.
2238 */
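/*
 * Concrete consequence of the above for protocol A (illustrative): the
 * master bio may complete as soon as the data sits in the local TCP send
 * queue, while sendpage still holds a reference on the page.  Reusing such
 * a page for new writes before the peer has actually received it would
 * silently transmit the new contents, which is why finished EEs are parked
 * on the net_ee list until their pages are really released again.
 */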
2239static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2240 int offset, size_t size)
2241{
2242 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2243 kunmap(page);
2244 if (sent == size)
2245 mdev->send_cnt += size>>9;
2246 return sent == size;
2247}
2248
2249static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2250 int offset, size_t size)
2251{
2252 mm_segment_t oldfs = get_fs();
2253 int sent, ok;
2254 int len = size;
2255
2256 /* e.g. XFS meta- & log-data is in slab pages, which have a
2257 * page_count of 0 and/or have PageSlab() set.
2258 * we cannot use send_page for those, as that does get_page();
2259 * put_page(); and would cause either a VM_BUG directly, or
2260 * __page_cache_release a page that would actually still be referenced
2261 * by someone, leading to some obscure delayed Oops somewhere else. */
2262 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2263 return _drbd_no_send_page(mdev, page, offset, size);
2264
2265 drbd_update_congested(mdev);
2266 set_fs(KERNEL_DS);
2267 do {
2268 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2269 offset, len,
2270 MSG_NOSIGNAL);
2271 if (sent == -EAGAIN) {
2272 if (we_should_drop_the_connection(mdev,
2273 mdev->data.socket))
2274 break;
2275 else
2276 continue;
2277 }
2278 if (sent <= 0) {
2279 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2280 __func__, (int)size, len, sent);
2281 break;
2282 }
2283 len -= sent;
2284 offset += sent;
2285 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2286 set_fs(oldfs);
2287 clear_bit(NET_CONGESTED, &mdev->flags);
2288
2289 ok = (len == 0);
2290 if (likely(ok))
2291 mdev->send_cnt += size>>9;
2292 return ok;
2293}
2294
2295static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2296{
2297 struct bio_vec *bvec;
2298 int i;
2299 __bio_for_each_segment(bvec, bio, i, 0) {
2300 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2301 bvec->bv_offset, bvec->bv_len))
2302 return 0;
2303 }
2304 return 1;
2305}
2306
2307static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2308{
2309 struct bio_vec *bvec;
2310 int i;
2311 __bio_for_each_segment(bvec, bio, i, 0) {
2312 if (!_drbd_send_page(mdev, bvec->bv_page,
2313 bvec->bv_offset, bvec->bv_len))
2314 return 0;
2315 }
2316
2317 return 1;
2318}
2319
2320/* Used to send write requests
2321 * R_PRIMARY -> Peer (P_DATA)
2322 */
2323int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2324{
2325 int ok = 1;
2326 struct p_data p;
2327 unsigned int dp_flags = 0;
2328 void *dgb;
2329 int dgs;
2330
2331 if (!drbd_get_data_sock(mdev))
2332 return 0;
2333
2334 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2335 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2336
2337 p.head.magic = BE_DRBD_MAGIC;
2338 p.head.command = cpu_to_be16(P_DATA);
2339 p.head.length =
2340 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2341
2342 p.sector = cpu_to_be64(req->sector);
2343 p.block_id = (unsigned long)req;
2344 p.seq_num = cpu_to_be32(req->seq_num =
2345 atomic_add_return(1, &mdev->packet_seq));
2346 dp_flags = 0;
2347
2348 /* NOTE: no need to check if barriers supported here as we would
2349 * not pass the test in make_request_common in that case
2350 */
2351 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2352 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2353 /* dp_flags |= DP_HARDBARRIER; */
2354 }
2355 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2356 dp_flags |= DP_RW_SYNC;
2357 /* for now handle SYNCIO and UNPLUG
2358 * as if they still were one and the same flag */
2359 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2360 dp_flags |= DP_RW_SYNC;
2361 if (mdev->state.conn >= C_SYNC_SOURCE &&
2362 mdev->state.conn <= C_PAUSED_SYNC_T)
2363 dp_flags |= DP_MAY_SET_IN_SYNC;
2364
2365 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002366 set_bit(UNPLUG_REMOTE, &mdev->flags);
2367 ok = (sizeof(p) ==
2368 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2369 if (ok && dgs) {
2370 dgb = mdev->int_dig_out;
2371 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2372 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2373 }
2374 if (ok) {
2375 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2376 ok = _drbd_send_bio(mdev, req->master_bio);
2377 else
2378 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2379 }
2380
2381 drbd_put_data_sock(mdev);
2382 return ok;
2383}
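/*
 * On-the-wire sketch of one P_DATA packet as assembled above (sizes are
 * illustrative, the digest length depends on the configured integrity
 * transform): the p_data fields come first, then dgs digest bytes, then the
 * payload itself.  A 4096 byte write with a 16 byte digest thus advertises
 * head.length = sizeof(p_data) - sizeof(p_header) + 16 + 4096.
 */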
2384
2385/* answer packet, used to send data back for read requests:
2386 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2387 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2388 */
2389int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2390 struct drbd_epoch_entry *e)
2391{
2392 int ok;
2393 struct p_data p;
2394 void *dgb;
2395 int dgs;
2396
2397 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2398 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2399
2400 p.head.magic = BE_DRBD_MAGIC;
2401 p.head.command = cpu_to_be16(cmd);
2402 p.head.length =
2403 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2404
2405 p.sector = cpu_to_be64(e->sector);
2406 p.block_id = e->block_id;
2407 /* p.seq_num = 0; No sequence numbers here.. */
2408
2409 /* Only called by our kernel thread.
2410 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2411 * in response to admin command or module unload.
2412 */
2413 if (!drbd_get_data_sock(mdev))
2414 return 0;
2415
Philipp Reisnerb411b362009-09-25 16:07:19 -07002416 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2417 sizeof(p), MSG_MORE);
2418 if (ok && dgs) {
2419 dgb = mdev->int_dig_out;
2420 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2421 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2422 }
2423 if (ok)
2424 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2425
2426 drbd_put_data_sock(mdev);
2427 return ok;
2428}
2429
2430/*
2431 drbd_send distinguishes two cases:
2432
2433 Packets sent via the data socket "sock"
2434 and packets sent via the meta data socket "msock"
2435
2436                     sock                      msock
2437  -----------------+-------------------------+------------------------------
2438  timeout           conf.timeout / 2          conf.timeout / 2
2439  timeout action    send a ping via msock     Abort communication
2440                                              and close all sockets
2441*/
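/*
 * Example reading of the table above (assuming conf.timeout is given in
 * tenths of a second, e.g. timeout = 60): both sockets end up with a send
 * timeout of about three seconds.  A stall on the data socket first makes
 * we_should_drop_the_connection() request a ping over msock; a stall on
 * msock itself is treated as a dead peer and all sockets are torn down.
 */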
2442
2443/*
2444 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2445 */
2446int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2447 void *buf, size_t size, unsigned msg_flags)
2448{
2449 struct kvec iov;
2450 struct msghdr msg;
2451 int rv, sent = 0;
2452
2453 if (!sock)
2454 return -1000;
2455
2456 /* THINK if (signal_pending) return ... ? */
2457
2458 iov.iov_base = buf;
2459 iov.iov_len = size;
2460
2461 msg.msg_name = NULL;
2462 msg.msg_namelen = 0;
2463 msg.msg_control = NULL;
2464 msg.msg_controllen = 0;
2465 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2466
2467 if (sock == mdev->data.socket) {
2468 mdev->ko_count = mdev->net_conf->ko_count;
2469 drbd_update_congested(mdev);
2470 }
2471 do {
2472 /* STRANGE
2473 * tcp_sendmsg does _not_ use its size parameter at all ?
2474 *
2475 * -EAGAIN on timeout, -EINTR on signal.
2476 */
2477/* THINK
2478 * do we need to block DRBD_SIG if sock == &meta.socket ??
2479 * otherwise wake_asender() might interrupt some send_*Ack !
2480 */
2481 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2482 if (rv == -EAGAIN) {
2483 if (we_should_drop_the_connection(mdev, sock))
2484 break;
2485 else
2486 continue;
2487 }
2488 D_ASSERT(rv != 0);
2489 if (rv == -EINTR) {
2490 flush_signals(current);
2491 rv = 0;
2492 }
2493 if (rv < 0)
2494 break;
2495 sent += rv;
2496 iov.iov_base += rv;
2497 iov.iov_len -= rv;
2498 } while (sent < size);
2499
2500 if (sock == mdev->data.socket)
2501 clear_bit(NET_CONGESTED, &mdev->flags);
2502
2503 if (rv <= 0) {
2504 if (rv != -EAGAIN) {
2505 dev_err(DEV, "%s_sendmsg returned %d\n",
2506 sock == mdev->meta.socket ? "msock" : "sock",
2507 rv);
2508 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2509 } else
2510 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2511 }
2512
2513 return sent;
2514}
2515
2516static int drbd_open(struct block_device *bdev, fmode_t mode)
2517{
2518 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2519 unsigned long flags;
2520 int rv = 0;
2521
2522 spin_lock_irqsave(&mdev->req_lock, flags);
2523 /* to have a stable mdev->state.role
2524 * and no race with updating open_cnt */
2525
2526 if (mdev->state.role != R_PRIMARY) {
2527 if (mode & FMODE_WRITE)
2528 rv = -EROFS;
2529 else if (!allow_oos)
2530 rv = -EMEDIUMTYPE;
2531 }
2532
2533 if (!rv)
2534 mdev->open_cnt++;
2535 spin_unlock_irqrestore(&mdev->req_lock, flags);
2536
2537 return rv;
2538}
2539
2540static int drbd_release(struct gendisk *gd, fmode_t mode)
2541{
2542 struct drbd_conf *mdev = gd->private_data;
2543 mdev->open_cnt--;
2544 return 0;
2545}
2546
2547static void drbd_unplug_fn(struct request_queue *q)
2548{
2549 struct drbd_conf *mdev = q->queuedata;
2550
Philipp Reisnerb411b362009-09-25 16:07:19 -07002551 /* unplug FIRST */
2552 spin_lock_irq(q->queue_lock);
2553 blk_remove_plug(q);
2554 spin_unlock_irq(q->queue_lock);
2555
2556 /* only if connected */
2557 spin_lock_irq(&mdev->req_lock);
2558 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2559 D_ASSERT(mdev->state.role == R_PRIMARY);
2560 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2561 /* add to the data.work queue,
2562 * unless already queued.
2563 * XXX this might be a good addition to drbd_queue_work
2564 * anyways, to detect "double queuing" ... */
2565 if (list_empty(&mdev->unplug_work.list))
2566 drbd_queue_work(&mdev->data.work,
2567 &mdev->unplug_work);
2568 }
2569 }
2570 spin_unlock_irq(&mdev->req_lock);
2571
2572 if (mdev->state.disk >= D_INCONSISTENT)
2573 drbd_kick_lo(mdev);
2574}
2575
2576static void drbd_set_defaults(struct drbd_conf *mdev)
2577{
2578 mdev->sync_conf.after = DRBD_AFTER_DEF;
2579 mdev->sync_conf.rate = DRBD_RATE_DEF;
2580 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2581 mdev->state = (union drbd_state) {
2582 { .role = R_SECONDARY,
2583 .peer = R_UNKNOWN,
2584 .conn = C_STANDALONE,
2585 .disk = D_DISKLESS,
2586 .pdsk = D_UNKNOWN,
2587 .susp = 0
2588 } };
2589}
2590
2591void drbd_init_set_defaults(struct drbd_conf *mdev)
2592{
2593 /* the memset(,0,) did most of this.
2594 * note: only assignments, no allocation in here */
2595
2596 drbd_set_defaults(mdev);
2597
2598 /* for now, we do NOT yet support it,
2599 * even though we start some framework
2600 * to eventually support barriers */
2601 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2602
2603 atomic_set(&mdev->ap_bio_cnt, 0);
2604 atomic_set(&mdev->ap_pending_cnt, 0);
2605 atomic_set(&mdev->rs_pending_cnt, 0);
2606 atomic_set(&mdev->unacked_cnt, 0);
2607 atomic_set(&mdev->local_cnt, 0);
2608 atomic_set(&mdev->net_cnt, 0);
2609 atomic_set(&mdev->packet_seq, 0);
2610 atomic_set(&mdev->pp_in_use, 0);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02002611 atomic_set(&mdev->delay_seq, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002612
2613 mutex_init(&mdev->md_io_mutex);
2614 mutex_init(&mdev->data.mutex);
2615 mutex_init(&mdev->meta.mutex);
2616 sema_init(&mdev->data.work.s, 0);
2617 sema_init(&mdev->meta.work.s, 0);
2618 mutex_init(&mdev->state_mutex);
2619
2620 spin_lock_init(&mdev->data.work.q_lock);
2621 spin_lock_init(&mdev->meta.work.q_lock);
2622
2623 spin_lock_init(&mdev->al_lock);
2624 spin_lock_init(&mdev->req_lock);
2625 spin_lock_init(&mdev->peer_seq_lock);
2626 spin_lock_init(&mdev->epoch_lock);
2627
2628 INIT_LIST_HEAD(&mdev->active_ee);
2629 INIT_LIST_HEAD(&mdev->sync_ee);
2630 INIT_LIST_HEAD(&mdev->done_ee);
2631 INIT_LIST_HEAD(&mdev->read_ee);
2632 INIT_LIST_HEAD(&mdev->net_ee);
2633 INIT_LIST_HEAD(&mdev->resync_reads);
2634 INIT_LIST_HEAD(&mdev->data.work.q);
2635 INIT_LIST_HEAD(&mdev->meta.work.q);
2636 INIT_LIST_HEAD(&mdev->resync_work.list);
2637 INIT_LIST_HEAD(&mdev->unplug_work.list);
2638 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2639 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02002640 INIT_LIST_HEAD(&mdev->delay_probes);
2641
Philipp Reisnerb411b362009-09-25 16:07:19 -07002642 mdev->resync_work.cb = w_resync_inactive;
2643 mdev->unplug_work.cb = w_send_write_hint;
2644 mdev->md_sync_work.cb = w_md_sync;
2645 mdev->bm_io_work.w.cb = w_bitmap_io;
2646 init_timer(&mdev->resync_timer);
2647 init_timer(&mdev->md_sync_timer);
2648 mdev->resync_timer.function = resync_timer_fn;
2649 mdev->resync_timer.data = (unsigned long) mdev;
2650 mdev->md_sync_timer.function = md_sync_timer_fn;
2651 mdev->md_sync_timer.data = (unsigned long) mdev;
2652
2653 init_waitqueue_head(&mdev->misc_wait);
2654 init_waitqueue_head(&mdev->state_wait);
2655 init_waitqueue_head(&mdev->ee_wait);
2656 init_waitqueue_head(&mdev->al_wait);
2657 init_waitqueue_head(&mdev->seq_wait);
2658
2659 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2660 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2661 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2662
2663 mdev->agreed_pro_version = PRO_VERSION_MAX;
2664 mdev->write_ordering = WO_bio_barrier;
2665 mdev->resync_wenr = LC_FREE;
2666}
2667
2668void drbd_mdev_cleanup(struct drbd_conf *mdev)
2669{
2670 if (mdev->receiver.t_state != None)
2671 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2672 mdev->receiver.t_state);
2673
2674 /* no need to lock it, I'm the only thread alive */
2675 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2676 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2677 mdev->al_writ_cnt =
2678 mdev->bm_writ_cnt =
2679 mdev->read_cnt =
2680 mdev->recv_cnt =
2681 mdev->send_cnt =
2682 mdev->writ_cnt =
2683 mdev->p_size =
2684 mdev->rs_start =
2685 mdev->rs_total =
2686 mdev->rs_failed =
2687 mdev->rs_mark_left =
2688 mdev->rs_mark_time = 0;
2689 D_ASSERT(mdev->net_conf == NULL);
2690
2691 drbd_set_my_capacity(mdev, 0);
2692 if (mdev->bitmap) {
2693 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01002694 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002695 drbd_bm_cleanup(mdev);
2696 }
2697
2698 drbd_free_resources(mdev);
2699
2700 /*
2701	 * currently we call drbd_init_ee only on module load, so
2702	 * we may call drbd_release_ee only on module unload!
2703 */
2704 D_ASSERT(list_empty(&mdev->active_ee));
2705 D_ASSERT(list_empty(&mdev->sync_ee));
2706 D_ASSERT(list_empty(&mdev->done_ee));
2707 D_ASSERT(list_empty(&mdev->read_ee));
2708 D_ASSERT(list_empty(&mdev->net_ee));
2709 D_ASSERT(list_empty(&mdev->resync_reads));
2710 D_ASSERT(list_empty(&mdev->data.work.q));
2711 D_ASSERT(list_empty(&mdev->meta.work.q));
2712 D_ASSERT(list_empty(&mdev->resync_work.list));
2713 D_ASSERT(list_empty(&mdev->unplug_work.list));
2714
2715}
2716
2717
2718static void drbd_destroy_mempools(void)
2719{
2720 struct page *page;
2721
2722 while (drbd_pp_pool) {
2723 page = drbd_pp_pool;
2724 drbd_pp_pool = (struct page *)page_private(page);
2725 __free_page(page);
2726 drbd_pp_vacant--;
2727 }
2728
2729 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2730
2731 if (drbd_ee_mempool)
2732 mempool_destroy(drbd_ee_mempool);
2733 if (drbd_request_mempool)
2734 mempool_destroy(drbd_request_mempool);
2735 if (drbd_ee_cache)
2736 kmem_cache_destroy(drbd_ee_cache);
2737 if (drbd_request_cache)
2738 kmem_cache_destroy(drbd_request_cache);
2739 if (drbd_bm_ext_cache)
2740 kmem_cache_destroy(drbd_bm_ext_cache);
2741 if (drbd_al_ext_cache)
2742 kmem_cache_destroy(drbd_al_ext_cache);
2743
2744 drbd_ee_mempool = NULL;
2745 drbd_request_mempool = NULL;
2746 drbd_ee_cache = NULL;
2747 drbd_request_cache = NULL;
2748 drbd_bm_ext_cache = NULL;
2749 drbd_al_ext_cache = NULL;
2750
2751 return;
2752}
2753
2754static int drbd_create_mempools(void)
2755{
2756 struct page *page;
2757 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2758 int i;
2759
2760 /* prepare our caches and mempools */
2761 drbd_request_mempool = NULL;
2762 drbd_ee_cache = NULL;
2763 drbd_request_cache = NULL;
2764 drbd_bm_ext_cache = NULL;
2765 drbd_al_ext_cache = NULL;
2766 drbd_pp_pool = NULL;
2767
2768 /* caches */
2769 drbd_request_cache = kmem_cache_create(
2770 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2771 if (drbd_request_cache == NULL)
2772 goto Enomem;
2773
2774 drbd_ee_cache = kmem_cache_create(
2775 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2776 if (drbd_ee_cache == NULL)
2777 goto Enomem;
2778
2779 drbd_bm_ext_cache = kmem_cache_create(
2780 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2781 if (drbd_bm_ext_cache == NULL)
2782 goto Enomem;
2783
2784 drbd_al_ext_cache = kmem_cache_create(
2785 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2786 if (drbd_al_ext_cache == NULL)
2787 goto Enomem;
2788
2789 /* mempools */
2790 drbd_request_mempool = mempool_create(number,
2791 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2792 if (drbd_request_mempool == NULL)
2793 goto Enomem;
2794
2795 drbd_ee_mempool = mempool_create(number,
2796 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2797	if (drbd_ee_mempool == NULL)
2798 goto Enomem;
2799
2800 /* drbd's page pool */
2801 spin_lock_init(&drbd_pp_lock);
2802
2803 for (i = 0; i < number; i++) {
2804 page = alloc_page(GFP_HIGHUSER);
2805 if (!page)
2806 goto Enomem;
2807 set_page_private(page, (unsigned long)drbd_pp_pool);
2808 drbd_pp_pool = page;
2809 }
2810 drbd_pp_vacant = number;
2811
2812 return 0;
2813
2814Enomem:
2815 drbd_destroy_mempools(); /* in case we allocated some */
2816 return -ENOMEM;
2817}
2818
2819static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2820 void *unused)
2821{
2822 /* just so we have it. you never know what interesting things we
2823 * might want to do here some day...
2824 */
2825
2826 return NOTIFY_DONE;
2827}
2828
2829static struct notifier_block drbd_notifier = {
2830 .notifier_call = drbd_notify_sys,
2831};
2832
2833static void drbd_release_ee_lists(struct drbd_conf *mdev)
2834{
2835 int rr;
2836
2837 rr = drbd_release_ee(mdev, &mdev->active_ee);
2838 if (rr)
2839 dev_err(DEV, "%d EEs in active list found!\n", rr);
2840
2841 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2842 if (rr)
2843 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2844
2845 rr = drbd_release_ee(mdev, &mdev->read_ee);
2846 if (rr)
2847 dev_err(DEV, "%d EEs in read list found!\n", rr);
2848
2849 rr = drbd_release_ee(mdev, &mdev->done_ee);
2850 if (rr)
2851 dev_err(DEV, "%d EEs in done list found!\n", rr);
2852
2853 rr = drbd_release_ee(mdev, &mdev->net_ee);
2854 if (rr)
2855 dev_err(DEV, "%d EEs in net list found!\n", rr);
2856}
2857
2858/* caution. no locking.
2859 * currently only used from module cleanup code. */
2860static void drbd_delete_device(unsigned int minor)
2861{
2862 struct drbd_conf *mdev = minor_to_mdev(minor);
2863
2864 if (!mdev)
2865 return;
2866
2867 /* paranoia asserts */
2868 if (mdev->open_cnt != 0)
2869 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2870 __FILE__ , __LINE__);
2871
2872 ERR_IF (!list_empty(&mdev->data.work.q)) {
2873 struct list_head *lp;
2874 list_for_each(lp, &mdev->data.work.q) {
2875 dev_err(DEV, "lp = %p\n", lp);
2876 }
2877 };
2878 /* end paranoia asserts */
2879
2880 del_gendisk(mdev->vdisk);
2881
2882 /* cleanup stuff that may have been allocated during
2883 * device (re-)configuration or state changes */
2884
2885 if (mdev->this_bdev)
2886 bdput(mdev->this_bdev);
2887
2888 drbd_free_resources(mdev);
2889
2890 drbd_release_ee_lists(mdev);
2891
2892 /* should be free'd on disconnect? */
2893 kfree(mdev->ee_hash);
2894 /*
2895 mdev->ee_hash_s = 0;
2896 mdev->ee_hash = NULL;
2897 */
2898
2899 lc_destroy(mdev->act_log);
2900 lc_destroy(mdev->resync);
2901
2902 kfree(mdev->p_uuid);
2903 /* mdev->p_uuid = NULL; */
2904
2905 kfree(mdev->int_dig_out);
2906 kfree(mdev->int_dig_in);
2907 kfree(mdev->int_dig_vv);
2908
2909 /* cleanup the rest that has been
2910 * allocated from drbd_new_device
2911 * and actually free the mdev itself */
2912 drbd_free_mdev(mdev);
2913}
2914
2915static void drbd_cleanup(void)
2916{
2917 unsigned int i;
2918
2919 unregister_reboot_notifier(&drbd_notifier);
2920
2921 drbd_nl_cleanup();
2922
2923 if (minor_table) {
2924 if (drbd_proc)
2925 remove_proc_entry("drbd", NULL);
2926 i = minor_count;
2927 while (i--)
2928 drbd_delete_device(i);
2929 drbd_destroy_mempools();
2930 }
2931
2932 kfree(minor_table);
2933
2934 unregister_blkdev(DRBD_MAJOR, "drbd");
2935
2936 printk(KERN_INFO "drbd: module cleanup done.\n");
2937}
2938
2939/**
2940 * drbd_congested() - Callback for pdflush
2941 * @congested_data: User data
2942 * @bdi_bits: Bits pdflush is currently interested in
2943 *
2944 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2945 */
2946static int drbd_congested(void *congested_data, int bdi_bits)
2947{
2948 struct drbd_conf *mdev = congested_data;
2949 struct request_queue *q;
2950 char reason = '-';
2951 int r = 0;
2952
2953 if (!__inc_ap_bio_cond(mdev)) {
2954 /* DRBD has frozen IO */
2955 r = bdi_bits;
2956 reason = 'd';
2957 goto out;
2958 }
2959
2960 if (get_ldev(mdev)) {
2961 q = bdev_get_queue(mdev->ldev->backing_bdev);
2962 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2963 put_ldev(mdev);
2964 if (r)
2965 reason = 'b';
2966 }
2967
2968 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
2969 r |= (1 << BDI_async_congested);
2970 reason = reason == 'b' ? 'a' : 'n';
2971 }
2972
2973out:
2974 mdev->congestion_reason = reason;
2975 return r;
2976}
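/*
 * The single character stored in mdev->congestion_reason above is only a
 * diagnostic hint; decoded from the code paths in this function:
 *   'd' - IO currently frozen by DRBD itself
 *   'b' - local backing device congested
 *   'n' - network send path congested (NET_CONGESTED)
 *   'a' - backing device and network both congested
 *   '-' - not congested
 */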
2977
2978struct drbd_conf *drbd_new_device(unsigned int minor)
2979{
2980 struct drbd_conf *mdev;
2981 struct gendisk *disk;
2982 struct request_queue *q;
2983
2984 /* GFP_KERNEL, we are outside of all write-out paths */
2985 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2986 if (!mdev)
2987 return NULL;
2988 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
2989 goto out_no_cpumask;
2990
2991 mdev->minor = minor;
2992
2993 drbd_init_set_defaults(mdev);
2994
2995 q = blk_alloc_queue(GFP_KERNEL);
2996 if (!q)
2997 goto out_no_q;
2998 mdev->rq_queue = q;
2999 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003000
3001 disk = alloc_disk(1);
3002 if (!disk)
3003 goto out_no_disk;
3004 mdev->vdisk = disk;
3005
3006 set_disk_ro(disk, TRUE);
3007
3008 disk->queue = q;
3009 disk->major = DRBD_MAJOR;
3010 disk->first_minor = minor;
3011 disk->fops = &drbd_ops;
3012 sprintf(disk->disk_name, "drbd%d", minor);
3013 disk->private_data = mdev;
3014
3015 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3016 /* we have no partitions. we contain only ourselves. */
3017 mdev->this_bdev->bd_contains = mdev->this_bdev;
3018
3019 q->backing_dev_info.congested_fn = drbd_congested;
3020 q->backing_dev_info.congested_data = mdev;
3021
3022 blk_queue_make_request(q, drbd_make_request_26);
Lars Ellenberg98ec2862010-01-21 19:33:14 +01003023 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003024 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3025 blk_queue_merge_bvec(q, drbd_merge_bvec);
3026 q->queue_lock = &mdev->req_lock; /* needed since we use */
3027 /* plugging on a queue, that actually has no requests! */
3028 q->unplug_fn = drbd_unplug_fn;
3029
3030 mdev->md_io_page = alloc_page(GFP_KERNEL);
3031 if (!mdev->md_io_page)
3032 goto out_no_io_page;
3033
3034 if (drbd_bm_init(mdev))
3035 goto out_no_bitmap;
3036 /* no need to lock access, we are still initializing this minor device. */
3037 if (!tl_init(mdev))
3038 goto out_no_tl;
3039
3040 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3041 if (!mdev->app_reads_hash)
3042 goto out_no_app_reads;
3043
3044 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3045 if (!mdev->current_epoch)
3046 goto out_no_epoch;
3047
3048 INIT_LIST_HEAD(&mdev->current_epoch->list);
3049 mdev->epochs = 1;
3050
3051 return mdev;
3052
3053/* out_whatever_else:
3054 kfree(mdev->current_epoch); */
3055out_no_epoch:
3056 kfree(mdev->app_reads_hash);
3057out_no_app_reads:
3058 tl_cleanup(mdev);
3059out_no_tl:
3060 drbd_bm_cleanup(mdev);
3061out_no_bitmap:
3062 __free_page(mdev->md_io_page);
3063out_no_io_page:
3064 put_disk(disk);
3065out_no_disk:
3066 blk_cleanup_queue(q);
3067out_no_q:
3068 free_cpumask_var(mdev->cpu_mask);
3069out_no_cpumask:
3070 kfree(mdev);
3071 return NULL;
3072}
3073
3074/* counterpart of drbd_new_device.
3075 * last part of drbd_delete_device. */
3076void drbd_free_mdev(struct drbd_conf *mdev)
3077{
3078 kfree(mdev->current_epoch);
3079 kfree(mdev->app_reads_hash);
3080 tl_cleanup(mdev);
3081 if (mdev->bitmap) /* should no longer be there. */
3082 drbd_bm_cleanup(mdev);
3083 __free_page(mdev->md_io_page);
3084 put_disk(mdev->vdisk);
3085 blk_cleanup_queue(mdev->rq_queue);
3086 free_cpumask_var(mdev->cpu_mask);
3087 kfree(mdev);
3088}
3089
3090
3091int __init drbd_init(void)
3092{
3093 int err;
3094
3095 if (sizeof(struct p_handshake) != 80) {
3096 printk(KERN_ERR
3097 "drbd: never change the size or layout "
3098 "of the HandShake packet.\n");
3099 return -EINVAL;
3100 }
3101
3102 if (1 > minor_count || minor_count > 255) {
3103 printk(KERN_ERR
3104 "drbd: invalid minor_count (%d)\n", minor_count);
3105#ifdef MODULE
3106 return -EINVAL;
3107#else
3108 minor_count = 8;
3109#endif
3110 }
3111
3112 err = drbd_nl_init();
3113 if (err)
3114 return err;
3115
3116 err = register_blkdev(DRBD_MAJOR, "drbd");
3117 if (err) {
3118 printk(KERN_ERR
3119 "drbd: unable to register block device major %d\n",
3120 DRBD_MAJOR);
3121 return err;
3122 }
3123
3124 register_reboot_notifier(&drbd_notifier);
3125
3126 /*
3127 * allocate all necessary structs
3128 */
3129 err = -ENOMEM;
3130
3131 init_waitqueue_head(&drbd_pp_wait);
3132
3133 drbd_proc = NULL; /* play safe for drbd_cleanup */
3134 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3135 GFP_KERNEL);
3136 if (!minor_table)
3137 goto Enomem;
3138
3139 err = drbd_create_mempools();
3140 if (err)
3141 goto Enomem;
3142
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003143 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003144 if (!drbd_proc) {
3145 printk(KERN_ERR "drbd: unable to register proc file\n");
3146 goto Enomem;
3147 }
3148
3149 rwlock_init(&global_state_lock);
3150
3151 printk(KERN_INFO "drbd: initialized. "
3152 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3153 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3154 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3155 printk(KERN_INFO "drbd: registered as block device major %d\n",
3156 DRBD_MAJOR);
3157 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3158
3159 return 0; /* Success! */
3160
3161Enomem:
3162 drbd_cleanup();
3163 if (err == -ENOMEM)
3164 /* currently always the case */
3165 printk(KERN_ERR "drbd: ran out of memory\n");
3166 else
3167 printk(KERN_ERR "drbd: initialization failure\n");
3168 return err;
3169}
3170
3171void drbd_free_bc(struct drbd_backing_dev *ldev)
3172{
3173 if (ldev == NULL)
3174 return;
3175
3176 bd_release(ldev->backing_bdev);
3177 bd_release(ldev->md_bdev);
3178
3179 fput(ldev->lo_file);
3180 fput(ldev->md_file);
3181
3182 kfree(ldev);
3183}
3184
3185void drbd_free_sock(struct drbd_conf *mdev)
3186{
3187 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003188 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003189 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3190 sock_release(mdev->data.socket);
3191 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003192 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003193 }
3194 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003195 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003196 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3197 sock_release(mdev->meta.socket);
3198 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003199 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003200 }
3201}
3202
3203
3204void drbd_free_resources(struct drbd_conf *mdev)
3205{
3206 crypto_free_hash(mdev->csums_tfm);
3207 mdev->csums_tfm = NULL;
3208 crypto_free_hash(mdev->verify_tfm);
3209 mdev->verify_tfm = NULL;
3210 crypto_free_hash(mdev->cram_hmac_tfm);
3211 mdev->cram_hmac_tfm = NULL;
3212 crypto_free_hash(mdev->integrity_w_tfm);
3213 mdev->integrity_w_tfm = NULL;
3214 crypto_free_hash(mdev->integrity_r_tfm);
3215 mdev->integrity_r_tfm = NULL;
3216
3217 drbd_free_sock(mdev);
3218
3219 __no_warn(local,
3220 drbd_free_bc(mdev->ldev);
3221 mdev->ldev = NULL;);
3222}
3223
3224/* meta data management */
3225
3226struct meta_data_on_disk {
3227 u64 la_size; /* last agreed size. */
3228 u64 uuid[UI_SIZE]; /* UUIDs. */
3229 u64 device_uuid;
3230 u64 reserved_u64_1;
3231 u32 flags; /* MDF */
3232 u32 magic;
3233 u32 md_size_sect;
3234 u32 al_offset; /* offset to this block */
3235 u32 al_nr_extents; /* important for restoring the AL */
3236 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3237 u32 bm_offset; /* offset to the bitmap, from here */
3238 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3239 u32 reserved_u32[4];
3240
3241} __packed;
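/*
 * Size sketch for the structure above (assuming UI_SIZE == 4 as elsewhere
 * in DRBD): seven u64 fields plus eleven u32 fields come to 56 + 44 = 100
 * bytes.  drbd_md_sync() below still zeroes and writes a full 512 byte
 * sector, so the rest of the block is effectively reserved padding.
 */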
3242
3243/**
3244 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3245 * @mdev: DRBD device.
3246 */
3247void drbd_md_sync(struct drbd_conf *mdev)
3248{
3249 struct meta_data_on_disk *buffer;
3250 sector_t sector;
3251 int i;
3252
3253 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3254 return;
3255 del_timer(&mdev->md_sync_timer);
3256
3257 /* We use here D_FAILED and not D_ATTACHING because we try to write
3258 * metadata even if we detach due to a disk failure! */
3259 if (!get_ldev_if_state(mdev, D_FAILED))
3260 return;
3261
Philipp Reisnerb411b362009-09-25 16:07:19 -07003262 mutex_lock(&mdev->md_io_mutex);
3263 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3264 memset(buffer, 0, 512);
3265
3266 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3267 for (i = UI_CURRENT; i < UI_SIZE; i++)
3268 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3269 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3270 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3271
3272 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3273 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3274 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3275 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3276 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3277
3278 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3279
3280 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3281 sector = mdev->ldev->md.md_offset;
3282
3283 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3284 clear_bit(MD_DIRTY, &mdev->flags);
3285 } else {
3286 /* this was a try anyways ... */
3287 dev_err(DEV, "meta data update failed!\n");
3288
3289 drbd_chk_io_error(mdev, 1, TRUE);
3290 }
3291
3292 /* Update mdev->ldev->md.la_size_sect,
3293 * since we updated it on metadata. */
3294 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3295
3296 mutex_unlock(&mdev->md_io_mutex);
3297 put_ldev(mdev);
3298}
3299
3300/**
3301 * drbd_md_read() - Reads in the meta data super block
3302 * @mdev: DRBD device.
3303 * @bdev: Device from which the meta data should be read in.
3304 *
3305 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3306 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3307 */
3308int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3309{
3310 struct meta_data_on_disk *buffer;
3311 int i, rv = NO_ERROR;
3312
3313 if (!get_ldev_if_state(mdev, D_ATTACHING))
3314 return ERR_IO_MD_DISK;
3315
Philipp Reisnerb411b362009-09-25 16:07:19 -07003316 mutex_lock(&mdev->md_io_mutex);
3317 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3318
3319 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3320		/* NOTE: can't do normal error processing here as this is
3321 called BEFORE disk is attached */
3322 dev_err(DEV, "Error while reading metadata.\n");
3323 rv = ERR_IO_MD_DISK;
3324 goto err;
3325 }
3326
3327 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3328 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3329 rv = ERR_MD_INVALID;
3330 goto err;
3331 }
3332 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3333 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3334 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3335 rv = ERR_MD_INVALID;
3336 goto err;
3337 }
3338 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3339 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3340 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3341 rv = ERR_MD_INVALID;
3342 goto err;
3343 }
3344 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3345 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3346 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3347 rv = ERR_MD_INVALID;
3348 goto err;
3349 }
3350
3351 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3352 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3353 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3354 rv = ERR_MD_INVALID;
3355 goto err;
3356 }
3357
3358 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3359 for (i = UI_CURRENT; i < UI_SIZE; i++)
3360 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3361 bdev->md.flags = be32_to_cpu(buffer->flags);
3362 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3363 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3364
3365 if (mdev->sync_conf.al_extents < 7)
3366 mdev->sync_conf.al_extents = 127;
3367
3368 err:
3369 mutex_unlock(&mdev->md_io_mutex);
3370 put_ldev(mdev);
3371
3372 return rv;
3373}
3374
3375/**
3376 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3377 * @mdev: DRBD device.
3378 *
3379 * Call this function if you change anything that should be written to
3380 * the meta-data super block. This function sets MD_DIRTY, and starts a
3381 * timer that ensures that within five seconds you have to call drbd_md_sync().
3382 */
3383void drbd_md_mark_dirty(struct drbd_conf *mdev)
3384{
3385 set_bit(MD_DIRTY, &mdev->flags);
3386 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3387}
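/*
 * Typical usage sketch, mirroring what the uuid helpers further below do:
 *
 *	mdev->ldev->md.uuid[idx] = val;
 *	drbd_md_mark_dirty(mdev);
 *
 * i.e. update the in-core copy first, then mark it dirty; either the five
 * second md_sync_timer or an explicit drbd_md_sync() call then gets the
 * change onto stable storage.
 */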
3388
3389
3390static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3391{
3392 int i;
3393
Jens Axboe6a0afdf2009-10-01 09:04:14 +02003394 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003395 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003396}
3397
3398void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3399{
3400 if (idx == UI_CURRENT) {
3401 if (mdev->state.role == R_PRIMARY)
3402 val |= 1;
3403 else
3404 val &= ~((u64)1);
3405
3406 drbd_set_ed_uuid(mdev, val);
3407 }
3408
3409 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003410 drbd_md_mark_dirty(mdev);
3411}
3412
3413
3414void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3415{
3416 if (mdev->ldev->md.uuid[idx]) {
3417 drbd_uuid_move_history(mdev);
3418 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003419 }
3420 _drbd_uuid_set(mdev, idx, val);
3421}
3422
3423/**
3424 * drbd_uuid_new_current() - Creates a new current UUID
3425 * @mdev: DRBD device.
3426 *
3427 * Creates a new current UUID, and rotates the old current UUID into
3428 * the bitmap slot. Causes an incremental resync upon next connect.
3429 */
3430void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3431{
3432 u64 val;
3433
3434 dev_info(DEV, "Creating new current UUID\n");
3435 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3436 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003437
3438 get_random_bytes(&val, sizeof(u64));
3439 _drbd_uuid_set(mdev, UI_CURRENT, val);
3440}
3441
3442void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3443{
3444 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3445 return;
3446
3447 if (val == 0) {
3448 drbd_uuid_move_history(mdev);
3449 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3450 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003451 } else {
3452 if (mdev->ldev->md.uuid[UI_BITMAP])
3453 dev_warn(DEV, "bm UUID already set");
3454
3455 mdev->ldev->md.uuid[UI_BITMAP] = val;
3456 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3457
Philipp Reisnerb411b362009-09-25 16:07:19 -07003458 }
3459 drbd_md_mark_dirty(mdev);
3460}
3461
3462/**
3463 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3464 * @mdev: DRBD device.
3465 *
3466 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3467 */
3468int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3469{
3470 int rv = -EIO;
3471
3472 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3473 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3474 drbd_md_sync(mdev);
3475 drbd_bm_set_all(mdev);
3476
3477 rv = drbd_bm_write(mdev);
3478
3479 if (!rv) {
3480 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3481 drbd_md_sync(mdev);
3482 }
3483
3484 put_ldev(mdev);
3485 }
3486
3487 return rv;
3488}
3489
3490/**
3491 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3492 * @mdev: DRBD device.
3493 *
3494 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3495 */
3496int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3497{
3498 int rv = -EIO;
3499
3500 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3501 drbd_bm_clear_all(mdev);
3502 rv = drbd_bm_write(mdev);
3503 put_ldev(mdev);
3504 }
3505
3506 return rv;
3507}
3508
3509static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3510{
3511 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3512 int rv;
3513
3514 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3515
3516 drbd_bm_lock(mdev, work->why);
3517 rv = work->io_fn(mdev);
3518 drbd_bm_unlock(mdev);
3519
3520 clear_bit(BITMAP_IO, &mdev->flags);
3521 wake_up(&mdev->misc_wait);
3522
3523 if (work->done)
3524 work->done(mdev, rv);
3525
3526 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3527 work->why = NULL;
3528
3529 return 1;
3530}
3531
3532/**
3533 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3534 * @mdev: DRBD device.
3535 * @io_fn: IO callback to be called when bitmap IO is possible
3536 * @done: callback to be called after the bitmap IO was performed
3537 * @why: Descriptive text of the reason for doing the IO
3538 *
3539 * While IO on the bitmap happens we freeze application IO, thus ensuring
3540 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3541 * called from worker context. It MUST NOT be used while a previous such
3542 * work is still pending!
3543 */
3544void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3545 int (*io_fn)(struct drbd_conf *),
3546 void (*done)(struct drbd_conf *, int),
3547 char *why)
3548{
3549 D_ASSERT(current == mdev->worker.task);
3550
3551 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3552 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3553 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3554 if (mdev->bm_io_work.why)
3555 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3556 why, mdev->bm_io_work.why);
3557
3558 mdev->bm_io_work.io_fn = io_fn;
3559 mdev->bm_io_work.done = done;
3560 mdev->bm_io_work.why = why;
3561
3562 set_bit(BITMAP_IO, &mdev->flags);
3563 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3564 if (list_empty(&mdev->bm_io_work.w.list)) {
3565 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3566 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3567 } else
3568 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3569 }
3570}
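/*
 * Usage sketch (hypothetical call site, combining this function with the
 * bmio helpers defined above):
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     some_done_callback, "set_n_write");
 *
 * where some_done_callback is whatever completion handler the caller wants
 * to run from worker context once the bitmap has hit stable storage.
 */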
3571
3572/**
3573 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3574 * @mdev: DRBD device.
3575 * @io_fn: IO callback to be called when bitmap IO is possible
3576 * @why: Descriptive text of the reason for doing the IO
3577 *
3578 * Freezes application IO while the actual IO operation runs. This
3579 * function MAY NOT be called from worker context.
3580 */
3581int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3582{
3583 int rv;
3584
3585 D_ASSERT(current != mdev->worker.task);
3586
3587 drbd_suspend_io(mdev);
3588
3589 drbd_bm_lock(mdev, why);
3590 rv = io_fn(mdev);
3591 drbd_bm_unlock(mdev);
3592
3593 drbd_resume_io(mdev);
3594
3595 return rv;
3596}
3597
3598void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3599{
3600 if ((mdev->ldev->md.flags & flag) != flag) {
3601 drbd_md_mark_dirty(mdev);
3602 mdev->ldev->md.flags |= flag;
3603 }
3604}
3605
3606void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3607{
3608 if ((mdev->ldev->md.flags & flag) != 0) {
3609 drbd_md_mark_dirty(mdev);
3610 mdev->ldev->md.flags &= ~flag;
3611 }
3612}
3613int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3614{
3615 return (bdev->md.flags & flag) != 0;
3616}
3617
3618static void md_sync_timer_fn(unsigned long data)
3619{
3620 struct drbd_conf *mdev = (struct drbd_conf *) data;
3621
3622 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3623}
3624
3625static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3626{
3627 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3628 drbd_md_sync(mdev);
3629
3630 return 1;
3631}
3632
3633#ifdef CONFIG_DRBD_FAULT_INJECTION
3634/* Fault insertion support including random number generator shamelessly
3635 * stolen from kernel/rcutorture.c */
3636struct fault_random_state {
3637 unsigned long state;
3638 unsigned long count;
3639};
3640
3641#define FAULT_RANDOM_MULT 39916801 /* prime */
3642#define FAULT_RANDOM_ADD 479001701 /* prime */
3643#define FAULT_RANDOM_REFRESH 10000
3644
3645/*
3646 * Crude but fast random-number generator. Uses a linear congruential
3647 * generator, with occasional help from get_random_bytes().
3648 */
3649static unsigned long
3650_drbd_fault_random(struct fault_random_state *rsp)
3651{
3652 long refresh;
3653
Roel Kluin49829ea2009-12-15 22:55:44 +01003654 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003655 get_random_bytes(&refresh, sizeof(refresh));
3656 rsp->state += refresh;
3657 rsp->count = FAULT_RANDOM_REFRESH;
3658 }
3659 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3660 return swahw32(rsp->state);
3661}
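/*
 * The generator above is the classic LCG recurrence
 *
 *	state = state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
 *
 * truncated to unsigned long, reseeded via get_random_bytes() every
 * FAULT_RANDOM_REFRESH draws, and returned with its 16 bit halfwords
 * swapped (swahw32) so the weaker low bits do not dominate the result.
 */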
3662
3663static char *
3664_drbd_fault_str(unsigned int type) {
3665 static char *_faults[] = {
3666 [DRBD_FAULT_MD_WR] = "Meta-data write",
3667 [DRBD_FAULT_MD_RD] = "Meta-data read",
3668 [DRBD_FAULT_RS_WR] = "Resync write",
3669 [DRBD_FAULT_RS_RD] = "Resync read",
3670 [DRBD_FAULT_DT_WR] = "Data write",
3671 [DRBD_FAULT_DT_RD] = "Data read",
3672 [DRBD_FAULT_DT_RA] = "Data read ahead",
3673 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02003674 [DRBD_FAULT_AL_EE] = "EE allocation",
3675 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07003676 };
3677
3678 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3679}
3680
3681unsigned int
3682_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3683{
3684 static struct fault_random_state rrs = {0, 0};
3685
3686 unsigned int ret = (
3687 (fault_devs == 0 ||
3688 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3689 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3690
3691 if (ret) {
3692 fault_count++;
3693
3694 if (printk_ratelimit())
3695 dev_warn(DEV, "***Simulating %s failure\n",
3696 _drbd_fault_str(type));
3697 }
3698
3699 return ret;
3700}
3701#endif
3702
3703const char *drbd_buildtag(void)
3704{
3705	/* When DRBD is built from external sources, this holds a reference
3706	   to the git hash of the source code. */
3707
3708 static char buildtag[38] = "\0uilt-in";
3709
3710 if (buildtag[0] == 0) {
3711#ifdef CONFIG_MODULES
3712 if (THIS_MODULE != NULL)
3713 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3714 else
3715#endif
3716 buildtag[0] = 'b';
3717 }
3718
3719 return buildtag;
3720}
3721
3722module_init(drbd_init)
3723module_exit(drbd_cleanup)
3724
Philipp Reisnerb411b362009-09-25 16:07:19 -07003725EXPORT_SYMBOL(drbd_conn_str);
3726EXPORT_SYMBOL(drbd_role_str);
3727EXPORT_SYMBOL(drbd_disk_str);
3728EXPORT_SYMBOL(drbd_set_st_err_str);