blob: 7468d2ce73479181231d11852be03bbfd9f44067 [file] [log] [blame]
Philipp Reisnerb411b362009-09-25 16:07:19 -07001/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
Philipp Reisnerb411b362009-09-25 16:07:19 -070029#include <linux/module.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070030#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/smp_lock.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
Philipp Reisnerb411b362009-09-25 16:07:19 -070055#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
/*
 * Describes one state transition together with an embedded work item.
 * NOTE(review): presumably filled in by __drbd_set_state() and handed to
 * w_after_state_ch() via the worker -- that code is not visible here; confirm.
 */
struct after_state_chg_work {
	struct drbd_work w;		/* embedded work item */
	union drbd_state os;		/* old state */
	union drbd_state ns;		/* new state */
	enum chg_state_flags flags;	/* flags the state change was requested with */
	struct completion *done;	/* optional completion, may be NULL */
};
66
/* Functions taking a struct drbd_thread; presumably the per-thread entry
 * points implemented in other files -- TODO confirm. */
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
/* open/release handlers referenced by drbd_ops below */
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
/* work callbacks (struct drbd_work based) and helpers defined later in this file */
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80
/* Module identification as reported by modinfo. */
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
/* The last argument is the sysfs permission mask of the parameter file:
 * 0444 read-only, 0644 additionally writable by root, 0 no sysfs entry. */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
99
/* Fault injection knobs; only built when CONFIG_DRBD_FAULT_INJECTION is set.
 * All four are exported as module parameters; fault_count is file-local. */
#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif
114
/* module parameter, defined */
/* These are the storage and default values for the module_param()
 * registrations above. */
unsigned int minor_count = 32;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

/* registered as a string parameter bounded by the size of the buffer above */
module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
127
/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
/* NOTE(review): presumably indexed by device minor number -- confirm. */
struct drbd_conf **minor_table;

/* slab caches and mempools shared by all devices */
struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

/* rate limit state: interval of 5 * HZ jiffies, burst of 5 */
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
152
/* Block device operations of a drbd device: only open and release are
 * provided here. */
static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};
158
/* Number of elements of the array A. Only valid for real arrays, not for
 * pointers. The argument is parenthesized before indexing so that
 * expressions such as casts or dereferences expand correctly. */
#define ARRY_SIZE(A) (sizeof(A)/sizeof((A)[0]))
160
161#ifdef __CHECKER__
162/* When checking with sparse, and this is an inline function, sparse will
163 give tons of false positives. When this is a real functions sparse works.
164 */
165int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
166{
167 int io_allowed;
168
169 atomic_inc(&mdev->local_cnt);
170 io_allowed = (mdev->state.disk >= mins);
171 if (!io_allowed) {
172 if (atomic_dec_and_test(&mdev->local_cnt))
173 wake_up(&mdev->misc_wait);
174 }
175 return io_allowed;
176}
177
178#endif
179
180/**
181 * DOC: The transfer log
182 *
183 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
184 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
185 * of the list. There is always at least one &struct drbd_tl_epoch object.
186 *
187 * Each &struct drbd_tl_epoch has a circular double linked list of requests
188 * attached.
189 */
190static int tl_init(struct drbd_conf *mdev)
191{
192 struct drbd_tl_epoch *b;
193
194 /* during device minor initialization, we may well use GFP_KERNEL */
195 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
196 if (!b)
197 return 0;
198 INIT_LIST_HEAD(&b->requests);
199 INIT_LIST_HEAD(&b->w.list);
200 b->next = NULL;
201 b->br_number = 4711;
202 b->n_req = 0;
203 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204
205 mdev->oldest_tle = b;
206 mdev->newest_tle = b;
207 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
208
209 mdev->tl_hash = NULL;
210 mdev->tl_hash_s = 0;
211
212 return 1;
213}
214
215static void tl_cleanup(struct drbd_conf *mdev)
216{
217 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
218 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
219 kfree(mdev->oldest_tle);
220 mdev->oldest_tle = NULL;
221 kfree(mdev->unused_spare_tle);
222 mdev->unused_spare_tle = NULL;
223 kfree(mdev->tl_hash);
224 mdev->tl_hash = NULL;
225 mdev->tl_hash_s = 0;
226}
227
228/**
229 * _tl_add_barrier() - Adds a barrier to the transfer log
230 * @mdev: DRBD device.
231 * @new: Barrier to be added before the current head of the TL.
232 *
233 * The caller must hold the req_lock.
234 */
235void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
236{
237 struct drbd_tl_epoch *newest_before;
238
239 INIT_LIST_HEAD(&new->requests);
240 INIT_LIST_HEAD(&new->w.list);
241 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242 new->next = NULL;
243 new->n_req = 0;
244
245 newest_before = mdev->newest_tle;
246 /* never send a barrier number == 0, because that is special-cased
247 * when using TCQ for our write ordering code */
248 new->br_number = (newest_before->br_number+1) ?: 1;
249 if (mdev->newest_tle != new) {
250 mdev->newest_tle->next = new;
251 mdev->newest_tle = new;
252 }
253}
254
255/**
256 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
257 * @mdev: DRBD device.
258 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
259 * @set_size: Expected number of requests before that barrier.
260 *
261 * In case the passed barrier_nr or set_size does not match the oldest
262 * &struct drbd_tl_epoch objects this function will cause a termination
263 * of the connection.
264 */
265void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
266 unsigned int set_size)
267{
268 struct drbd_tl_epoch *b, *nob; /* next old barrier */
269 struct list_head *le, *tle;
270 struct drbd_request *r;
271
272 spin_lock_irq(&mdev->req_lock);
273
274 b = mdev->oldest_tle;
275
276 /* first some paranoia code */
277 if (b == NULL) {
278 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
279 barrier_nr);
280 goto bail;
281 }
282 if (b->br_number != barrier_nr) {
283 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
284 barrier_nr, b->br_number);
285 goto bail;
286 }
287 if (b->n_req != set_size) {
288 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
289 barrier_nr, set_size, b->n_req);
290 goto bail;
291 }
292
293 /* Clean up list of requests processed during current epoch */
294 list_for_each_safe(le, tle, &b->requests) {
295 r = list_entry(le, struct drbd_request, tl_requests);
296 _req_mod(r, barrier_acked);
297 }
298 /* There could be requests on the list waiting for completion
299 of the write to the local disk. To avoid corruptions of
300 slab's data structures we have to remove the lists head.
301
302 Also there could have been a barrier ack out of sequence, overtaking
303 the write acks - which would be a bug and violating write ordering.
304 To not deadlock in case we lose connection while such requests are
305 still pending, we need some way to find them for the
306 _req_mode(connection_lost_while_pending).
307
308 These have been list_move'd to the out_of_sequence_requests list in
309 _req_mod(, barrier_acked) above.
310 */
311 list_del_init(&b->requests);
312
313 nob = b->next;
314 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
315 _tl_add_barrier(mdev, b);
316 if (nob)
317 mdev->oldest_tle = nob;
318 /* if nob == NULL b was the only barrier, and becomes the new
319 barrier. Therefore mdev->oldest_tle points already to b */
320 } else {
321 D_ASSERT(nob != NULL);
322 mdev->oldest_tle = nob;
323 kfree(b);
324 }
325
326 spin_unlock_irq(&mdev->req_lock);
327 dec_ap_pending(mdev);
328
329 return;
330
331bail:
332 spin_unlock_irq(&mdev->req_lock);
333 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
334}
335
336
337/**
338 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
339 * @mdev: DRBD device.
340 *
341 * This is called after the connection to the peer was lost. The storage covered
342 * by the requests on the transfer gets marked as our of sync. Called from the
343 * receiver thread and the worker thread.
344 */
345void tl_clear(struct drbd_conf *mdev)
346{
347 struct drbd_tl_epoch *b, *tmp;
348 struct list_head *le, *tle;
349 struct drbd_request *r;
350 int new_initial_bnr = net_random();
351
352 spin_lock_irq(&mdev->req_lock);
353
354 b = mdev->oldest_tle;
355 while (b) {
356 list_for_each_safe(le, tle, &b->requests) {
357 r = list_entry(le, struct drbd_request, tl_requests);
358 /* It would be nice to complete outside of spinlock.
359 * But this is easier for now. */
360 _req_mod(r, connection_lost_while_pending);
361 }
362 tmp = b->next;
363
364 /* there could still be requests on that ring list,
365 * in case local io is still pending */
366 list_del(&b->requests);
367
368 /* dec_ap_pending corresponding to queue_barrier.
369 * the newest barrier may not have been queued yet,
370 * in which case w.cb is still NULL. */
371 if (b->w.cb != NULL)
372 dec_ap_pending(mdev);
373
374 if (b == mdev->newest_tle) {
375 /* recycle, but reinit! */
376 D_ASSERT(tmp == NULL);
377 INIT_LIST_HEAD(&b->requests);
378 INIT_LIST_HEAD(&b->w.list);
379 b->w.cb = NULL;
380 b->br_number = new_initial_bnr;
381 b->n_req = 0;
382
383 mdev->oldest_tle = b;
384 break;
385 }
386 kfree(b);
387 b = tmp;
388 }
389
390 /* we expect this list to be empty. */
391 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
392
393 /* but just in case, clean it up anyways! */
394 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
395 r = list_entry(le, struct drbd_request, tl_requests);
396 /* It would be nice to complete outside of spinlock.
397 * But this is easier for now. */
398 _req_mod(r, connection_lost_while_pending);
399 }
400
401 /* ensure bit indicating barrier is required is clear */
402 clear_bit(CREATE_BARRIER, &mdev->flags);
403
404 spin_unlock_irq(&mdev->req_lock);
405}
406
407/**
408 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
409 * @mdev: DRBD device.
410 * @os: old (current) state.
411 * @ns: new (wanted) state.
412 */
413static int cl_wide_st_chg(struct drbd_conf *mdev,
414 union drbd_state os, union drbd_state ns)
415{
416 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
417 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
418 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
419 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
420 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
421 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
422 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
423}
424
425int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
426 union drbd_state mask, union drbd_state val)
427{
428 unsigned long flags;
429 union drbd_state os, ns;
430 int rv;
431
432 spin_lock_irqsave(&mdev->req_lock, flags);
433 os = mdev->state;
434 ns.i = (os.i & ~mask.i) | val.i;
435 rv = _drbd_set_state(mdev, ns, f, NULL);
436 ns = mdev->state;
437 spin_unlock_irqrestore(&mdev->req_lock, flags);
438
439 return rv;
440}
441
/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 *
 * Thin wrapper around drbd_change_state() with the CS_HARD flag; the
 * result of the state change is discarded.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}
453
/* Forward declarations for the state validation helpers defined further
 * down in this file, needed by _req_st_cond() and drbd_req_state(). */
static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *,
				     union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, int *warn_sync_abort);
/* NOTE(review): not defined in the visible part of this file. */
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);
461
462static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
463 union drbd_state mask, union drbd_state val)
464{
465 union drbd_state os, ns;
466 unsigned long flags;
467 int rv;
468
469 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
470 return SS_CW_SUCCESS;
471
472 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
473 return SS_CW_FAILED_BY_PEER;
474
475 rv = 0;
476 spin_lock_irqsave(&mdev->req_lock, flags);
477 os = mdev->state;
478 ns.i = (os.i & ~mask.i) | val.i;
479 ns = sanitize_state(mdev, os, ns, NULL);
480
481 if (!cl_wide_st_chg(mdev, os, ns))
482 rv = SS_CW_NO_NEED;
483 if (!rv) {
484 rv = is_valid_state(mdev, ns);
485 if (rv == SS_SUCCESS) {
486 rv = is_valid_state_transition(mdev, ns, os);
487 if (rv == SS_SUCCESS)
488 rv = 0; /* cont waiting, otherwise fail. */
489 }
490 }
491 spin_unlock_irqrestore(&mdev->req_lock, flags);
492
493 return rv;
494}
495
496/**
497 * drbd_req_state() - Perform an eventually cluster wide state change
498 * @mdev: DRBD device.
499 * @mask: mask of state bits to change.
500 * @val: value of new state bits.
501 * @f: flags
502 *
503 * Should not be called directly, use drbd_request_state() or
504 * _drbd_request_state().
505 */
506static int drbd_req_state(struct drbd_conf *mdev,
507 union drbd_state mask, union drbd_state val,
508 enum chg_state_flags f)
509{
510 struct completion done;
511 unsigned long flags;
512 union drbd_state os, ns;
513 int rv;
514
515 init_completion(&done);
516
517 if (f & CS_SERIALIZE)
518 mutex_lock(&mdev->state_mutex);
519
520 spin_lock_irqsave(&mdev->req_lock, flags);
521 os = mdev->state;
522 ns.i = (os.i & ~mask.i) | val.i;
523 ns = sanitize_state(mdev, os, ns, NULL);
524
525 if (cl_wide_st_chg(mdev, os, ns)) {
526 rv = is_valid_state(mdev, ns);
527 if (rv == SS_SUCCESS)
528 rv = is_valid_state_transition(mdev, ns, os);
529 spin_unlock_irqrestore(&mdev->req_lock, flags);
530
531 if (rv < SS_SUCCESS) {
532 if (f & CS_VERBOSE)
533 print_st_err(mdev, os, ns, rv);
534 goto abort;
535 }
536
537 drbd_state_lock(mdev);
538 if (!drbd_send_state_req(mdev, mask, val)) {
539 drbd_state_unlock(mdev);
540 rv = SS_CW_FAILED_BY_PEER;
541 if (f & CS_VERBOSE)
542 print_st_err(mdev, os, ns, rv);
543 goto abort;
544 }
545
546 wait_event(mdev->state_wait,
547 (rv = _req_st_cond(mdev, mask, val)));
548
549 if (rv < SS_SUCCESS) {
550 drbd_state_unlock(mdev);
551 if (f & CS_VERBOSE)
552 print_st_err(mdev, os, ns, rv);
553 goto abort;
554 }
555 spin_lock_irqsave(&mdev->req_lock, flags);
556 os = mdev->state;
557 ns.i = (os.i & ~mask.i) | val.i;
558 rv = _drbd_set_state(mdev, ns, f, &done);
559 drbd_state_unlock(mdev);
560 } else {
561 rv = _drbd_set_state(mdev, ns, f, &done);
562 }
563
564 spin_unlock_irqrestore(&mdev->req_lock, flags);
565
566 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
567 D_ASSERT(current != mdev->worker.task);
568 wait_for_completion(&done);
569 }
570
571abort:
572 if (f & CS_SERIALIZE)
573 mutex_unlock(&mdev->state_mutex);
574
575 return rv;
576}
577
/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 *
 * Retries drbd_req_state() each time state_wait is woken, until it returns
 * something other than SS_IN_TRANSIENT_STATE, and returns that result.
 */
int _drbd_request_state(struct drbd_conf *mdev,	union drbd_state mask,
			union drbd_state val,	enum chg_state_flags f)
{
	int rv;

	/* the wait condition itself performs the state change attempt */
	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}
598
599static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
600{
601 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
602 name,
603 drbd_conn_str(ns.conn),
604 drbd_role_str(ns.role),
605 drbd_role_str(ns.peer),
606 drbd_disk_str(ns.disk),
607 drbd_disk_str(ns.pdsk),
608 ns.susp ? 's' : 'r',
609 ns.aftr_isp ? 'a' : '-',
610 ns.peer_isp ? 'p' : '-',
611 ns.user_isp ? 'u' : '-'
612 );
613}
614
615void print_st_err(struct drbd_conf *mdev,
616 union drbd_state os, union drbd_state ns, int err)
617{
618 if (err == SS_IN_TRANSIENT_STATE)
619 return;
620 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
621 print_st(mdev, " state", os);
622 print_st(mdev, "wanted", ns);
623}
624
625
/* Aliases so that PSC(peer) and PSC(pdsk) resolve to the role and disk
 * string helpers. */
#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str

/* The single-bit state fields are printed as "1" or "0". */
#define drbd_susp_str(A)     ((A) ? "1" : "0")
#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
#define drbd_user_isp_str(A) ((A) ? "1" : "0")

/*
 * PSC(field): if the field differs between os and ns, append
 * "field( old -> new ) " at pbp and advance pbp past it.
 * Relies on os, ns and pbp being in scope where the macro is used.
 * NOTE(review): sprintf() is unbounded here; the buffer behind pbp must be
 * large enough for all fields that can change at once.
 */
#define PSC(A) \
	({ if (ns.A != os.A) { \
		pbp += sprintf(pbp, #A "( %s -> %s ) ", \
			      drbd_##A##_str(os.A), \
			      drbd_##A##_str(ns.A)); \
	} })
640
641/**
642 * is_valid_state() - Returns an SS_ error code if ns is not valid
643 * @mdev: DRBD device.
644 * @ns: State to consider.
645 */
646static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
647{
648 /* See drbd_state_sw_errors in drbd_strings.c */
649
650 enum drbd_fencing_p fp;
651 int rv = SS_SUCCESS;
652
653 fp = FP_DONT_CARE;
654 if (get_ldev(mdev)) {
655 fp = mdev->ldev->dc.fencing;
656 put_ldev(mdev);
657 }
658
659 if (get_net_conf(mdev)) {
660 if (!mdev->net_conf->two_primaries &&
661 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
662 rv = SS_TWO_PRIMARIES;
663 put_net_conf(mdev);
664 }
665
666 if (rv <= 0)
667 /* already found a reason to abort */;
668 else if (ns.role == R_SECONDARY && mdev->open_cnt)
669 rv = SS_DEVICE_IN_USE;
670
671 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
672 rv = SS_NO_UP_TO_DATE_DISK;
673
674 else if (fp >= FP_RESOURCE &&
675 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
676 rv = SS_PRIMARY_NOP;
677
678 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
679 rv = SS_NO_UP_TO_DATE_DISK;
680
681 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
682 rv = SS_NO_LOCAL_DISK;
683
684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
685 rv = SS_NO_REMOTE_DISK;
686
Lars Ellenberg8d4ce822010-04-01 16:59:32 +0200687 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
688 rv = SS_NO_UP_TO_DATE_DISK;
689
Philipp Reisnerb411b362009-09-25 16:07:19 -0700690 else if ((ns.conn == C_CONNECTED ||
691 ns.conn == C_WF_BITMAP_S ||
692 ns.conn == C_SYNC_SOURCE ||
693 ns.conn == C_PAUSED_SYNC_S) &&
694 ns.disk == D_OUTDATED)
695 rv = SS_CONNECTED_OUTDATES;
696
697 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
698 (mdev->sync_conf.verify_alg[0] == 0))
699 rv = SS_NO_VERIFY_ALG;
700
701 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
702 mdev->agreed_pro_version < 88)
703 rv = SS_NOT_SUPPORTED;
704
705 return rv;
706}
707
708/**
709 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
710 * @mdev: DRBD device.
711 * @ns: new state.
712 * @os: old state.
713 */
714static int is_valid_state_transition(struct drbd_conf *mdev,
715 union drbd_state ns, union drbd_state os)
716{
717 int rv = SS_SUCCESS;
718
719 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
720 os.conn > C_CONNECTED)
721 rv = SS_RESYNC_RUNNING;
722
723 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
724 rv = SS_ALREADY_STANDALONE;
725
726 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
727 rv = SS_IS_DISKLESS;
728
729 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
730 rv = SS_NO_NET_CONFIG;
731
732 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
733 rv = SS_LOWER_THAN_OUTDATED;
734
735 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
736 rv = SS_IN_TRANSIENT_STATE;
737
738 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
739 rv = SS_IN_TRANSIENT_STATE;
740
741 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
742 rv = SS_NEED_CONNECTION;
743
744 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
745 ns.conn != os.conn && os.conn > C_CONNECTED)
746 rv = SS_RESYNC_RUNNING;
747
748 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
749 os.conn < C_CONNECTED)
750 rv = SS_NEED_CONNECTION;
751
752 return rv;
753}
754
/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort: if not NULL, set to 1 when the resync is aborted because
 *		a disk failed or detached; otherwise left untouched.
 *
 * When we lose connection, we have to set the state of the peers disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 *
 * The rules are applied in order, and later rules see the adjustments made
 * by earlier ones. Returns the adjusted new state.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, int *warn_sync_abort)
{
	enum drbd_fencing_p fp;

	/* fencing policy of the local disk, FP_DONT_CARE if get_ldev() fails */
	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow Network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
		ns.conn = os.conn;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	/* Without a connection nothing is known about the peer */
	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
		ns.pdsk = D_UNKNOWN;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn_sync_abort)
			*warn_sync_abort = 1;
		ns.conn = C_CONNECTED;
	}

	/* While connected, derive the local disk state from the connection state */
	if (ns.conn >= C_CONNECTED &&
	    ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
	     (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
		switch (ns.conn) {
		case C_WF_BITMAP_T:
		case C_PAUSED_SYNC_T:
			ns.disk = D_OUTDATED;
			break;
		case C_CONNECTED:
		case C_WF_BITMAP_S:
		case C_SYNC_SOURCE:
		case C_PAUSED_SYNC_S:
			ns.disk = D_UP_TO_DATE;
			break;
		case C_SYNC_TARGET:
			ns.disk = D_INCONSISTENT;
			dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
			break;
		}
		if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
			dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
	}

	/* Same for the peer's disk state */
	if (ns.conn >= C_CONNECTED &&
	    (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
		switch (ns.conn) {
		case C_CONNECTED:
		case C_WF_BITMAP_T:
		case C_PAUSED_SYNC_T:
		case C_SYNC_TARGET:
			ns.pdsk = D_UP_TO_DATE;
			break;
		case C_WF_BITMAP_S:
		case C_PAUSED_SYNC_S:
			/* remap any consistent state to D_OUTDATED,
			 * but disallow "upgrade" of not even consistent states.
			 */
			ns.pdsk =
				(D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
				? os.pdsk : D_OUTDATED;
			break;
		case C_SYNC_SOURCE:
			ns.pdsk = D_INCONSISTENT;
			dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
			break;
		}
		if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
			dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			/* fall back to the disk states saved in new_state_tmp */
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* With FP_STONITH fencing: set susp when newly entering the situation
	 * "primary, not connected, peer disk above D_OUTDATED" */
	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp = 1;

	/* Any *_isp flag turns a running sync into its paused variant,
	 * no flag turns a paused sync back into the running one */
	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}
895
896/* helper for __drbd_set_state */
897static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
898{
899 if (cs == C_VERIFY_T) {
900 /* starting online verify from an arbitrary position
901 * does not fit well into the existing protocol.
902 * on C_VERIFY_T, we initialize ov_left and friends
903 * implicitly in receive_DataRequest once the
904 * first P_OV_REQUEST is received */
905 mdev->ov_start_sector = ~(sector_t)0;
906 } else {
907 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
908 if (bit >= mdev->rs_total)
909 mdev->ov_start_sector =
910 BM_BIT_TO_SECT(mdev->rs_total - 1);
911 mdev->ov_position = mdev->ov_start_sector;
912 }
913}
914
915/**
916 * __drbd_set_state() - Set a new DRBD state
917 * @mdev: DRBD device.
918 * @ns: new state.
919 * @flags: Flags
920 * @done: Optional completion, that will get completed after the after_state_ch() finished
921 *
922 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
923 */
924int __drbd_set_state(struct drbd_conf *mdev,
925 union drbd_state ns, enum chg_state_flags flags,
926 struct completion *done)
927{
928 union drbd_state os;
929 int rv = SS_SUCCESS;
930 int warn_sync_abort = 0;
931 struct after_state_chg_work *ascw;
932
933 os = mdev->state;
934
935 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
936
937 if (ns.i == os.i)
938 return SS_NOTHING_TO_DO;
939
940 if (!(flags & CS_HARD)) {
941 /* pre-state-change checks ; only look at ns */
942 /* See drbd_state_sw_errors in drbd_strings.c */
943
944 rv = is_valid_state(mdev, ns);
945 if (rv < SS_SUCCESS) {
946 /* If the old state was illegal as well, then let
947 this happen...*/
948
949 if (is_valid_state(mdev, os) == rv) {
950 dev_err(DEV, "Considering state change from bad state. "
951 "Error would be: '%s'\n",
952 drbd_set_st_err_str(rv));
953 print_st(mdev, "old", os);
954 print_st(mdev, "new", ns);
955 rv = is_valid_state_transition(mdev, ns, os);
956 }
957 } else
958 rv = is_valid_state_transition(mdev, ns, os);
959 }
960
961 if (rv < SS_SUCCESS) {
962 if (flags & CS_VERBOSE)
963 print_st_err(mdev, os, ns, rv);
964 return rv;
965 }
966
967 if (warn_sync_abort)
968 dev_warn(DEV, "Resync aborted.\n");
969
970 {
971 char *pbp, pb[300];
972 pbp = pb;
973 *pbp = 0;
974 PSC(role);
975 PSC(peer);
976 PSC(conn);
977 PSC(disk);
978 PSC(pdsk);
979 PSC(susp);
980 PSC(aftr_isp);
981 PSC(peer_isp);
982 PSC(user_isp);
983 dev_info(DEV, "%s\n", pb);
984 }
985
986 /* solve the race between becoming unconfigured,
987 * worker doing the cleanup, and
988 * admin reconfiguring us:
989 * on (re)configure, first set CONFIG_PENDING,
990 * then wait for a potentially exiting worker,
991 * start the worker, and schedule one no_op.
992 * then proceed with configuration.
993 */
994 if (ns.disk == D_DISKLESS &&
995 ns.conn == C_STANDALONE &&
996 ns.role == R_SECONDARY &&
997 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
998 set_bit(DEVICE_DYING, &mdev->flags);
999
1000 mdev->state.i = ns.i;
1001 wake_up(&mdev->misc_wait);
1002 wake_up(&mdev->state_wait);
1003
1004 /* post-state-change actions */
1005 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
1006 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1007 mod_timer(&mdev->resync_timer, jiffies);
1008 }
1009
1010 /* aborted verify run. log the last position */
1011 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1012 ns.conn < C_CONNECTED) {
1013 mdev->ov_start_sector =
1014 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1015 dev_info(DEV, "Online Verify reached sector %llu\n",
1016 (unsigned long long)mdev->ov_start_sector);
1017 }
1018
1019 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1020 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1021 dev_info(DEV, "Syncer continues.\n");
1022 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1023 if (ns.conn == C_SYNC_TARGET) {
1024 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1025 mod_timer(&mdev->resync_timer, jiffies);
1026 /* This if (!test_bit) is only needed for the case
1027 that a device that has ceased to used its timer,
1028 i.e. it is already in drbd_resync_finished() gets
1029 paused and resumed. */
1030 }
1031 }
1032
1033 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1034 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1035 dev_info(DEV, "Resync suspended\n");
1036 mdev->rs_mark_time = jiffies;
1037 if (ns.conn == C_PAUSED_SYNC_T)
1038 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1039 }
1040
1041 if (os.conn == C_CONNECTED &&
1042 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1043 mdev->ov_position = 0;
1044 mdev->rs_total =
1045 mdev->rs_mark_left = drbd_bm_bits(mdev);
1046 if (mdev->agreed_pro_version >= 90)
1047 set_ov_position(mdev, ns.conn);
1048 else
1049 mdev->ov_start_sector = 0;
1050 mdev->ov_left = mdev->rs_total
1051 - BM_SECT_TO_BIT(mdev->ov_position);
1052 mdev->rs_start =
1053 mdev->rs_mark_time = jiffies;
1054 mdev->ov_last_oos_size = 0;
1055 mdev->ov_last_oos_start = 0;
1056
1057 if (ns.conn == C_VERIFY_S) {
1058 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1059 (unsigned long long)mdev->ov_position);
1060 mod_timer(&mdev->resync_timer, jiffies);
1061 }
1062 }
1063
1064 if (get_ldev(mdev)) {
1065 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1066 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1067 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1068
1069 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1070 mdf |= MDF_CRASHED_PRIMARY;
1071 if (mdev->state.role == R_PRIMARY ||
1072 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1073 mdf |= MDF_PRIMARY_IND;
1074 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1075 mdf |= MDF_CONNECTED_IND;
1076 if (mdev->state.disk > D_INCONSISTENT)
1077 mdf |= MDF_CONSISTENT;
1078 if (mdev->state.disk > D_OUTDATED)
1079 mdf |= MDF_WAS_UP_TO_DATE;
1080 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1081 mdf |= MDF_PEER_OUT_DATED;
1082 if (mdf != mdev->ldev->md.flags) {
1083 mdev->ldev->md.flags = mdf;
1084 drbd_md_mark_dirty(mdev);
1085 }
1086 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1087 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1088 put_ldev(mdev);
1089 }
1090
1091 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1092 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1093 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1094 set_bit(CONSIDER_RESYNC, &mdev->flags);
1095
1096 /* Receiver should clean up itself */
1097 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1098 drbd_thread_stop_nowait(&mdev->receiver);
1099
1100 /* Now the receiver finished cleaning up itself, it should die */
1101 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1102 drbd_thread_stop_nowait(&mdev->receiver);
1103
1104 /* Upon network failure, we need to restart the receiver. */
1105 if (os.conn > C_TEAR_DOWN &&
1106 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1107 drbd_thread_restart_nowait(&mdev->receiver);
1108
1109 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1110 if (ascw) {
1111 ascw->os = os;
1112 ascw->ns = ns;
1113 ascw->flags = flags;
1114 ascw->w.cb = w_after_state_ch;
1115 ascw->done = done;
1116 drbd_queue_work(&mdev->data.work, &ascw->w);
1117 } else {
1118 dev_warn(DEV, "Could not kmalloc an ascw\n");
1119 }
1120
1121 return rv;
1122}
1123
1124static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1125{
1126 struct after_state_chg_work *ascw =
1127 container_of(w, struct after_state_chg_work, w);
1128 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1129 if (ascw->flags & CS_WAIT_COMPLETE) {
1130 D_ASSERT(ascw->done != NULL);
1131 complete(ascw->done);
1132 }
1133 kfree(ascw);
1134
1135 return 1;
1136}
1137
1138static void abw_start_sync(struct drbd_conf *mdev, int rv)
1139{
1140 if (rv) {
1141 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1142 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1143 return;
1144 }
1145
1146 switch (mdev->state.conn) {
1147 case C_STARTING_SYNC_T:
1148 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1149 break;
1150 case C_STARTING_SYNC_S:
1151 drbd_start_resync(mdev, C_SYNC_SOURCE);
1152 break;
1153 }
1154}
1155
1156/**
1157 * after_state_ch() - Perform after state change actions that may sleep
1158 * @mdev: DRBD device.
1159 * @os: old state.
1160 * @ns: new state.
1161 * @flags: Flags
1162 */
1163static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1164 union drbd_state ns, enum chg_state_flags flags)
1165{
1166 enum drbd_fencing_p fp;
1167
1168 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1169 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1170 if (mdev->p_uuid)
1171 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1172 }
1173
1174 fp = FP_DONT_CARE;
1175 if (get_ldev(mdev)) {
1176 fp = mdev->ldev->dc.fencing;
1177 put_ldev(mdev);
1178 }
1179
1180 /* Inform userspace about the change... */
1181 drbd_bcast_state(mdev, ns);
1182
1183 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1184 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1185 drbd_khelper(mdev, "pri-on-incon-degr");
1186
1187 /* Here we have the actions that are performed after a
1188 state change. This function might sleep */
1189
1190 if (fp == FP_STONITH && ns.susp) {
1191 /* case1: The outdate peer handler is successful:
1192 * case2: The connection was established again: */
1193 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
1194 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1195 tl_clear(mdev);
1196 spin_lock_irq(&mdev->req_lock);
1197 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1198 spin_unlock_irq(&mdev->req_lock);
1199 }
1200 }
1201 /* Do not change the order of the if above and the two below... */
1202 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1203 drbd_send_uuids(mdev);
1204 drbd_send_state(mdev);
1205 }
1206 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1207 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1208
1209 /* Lost contact to peer's copy of the data */
1210 if ((os.pdsk >= D_INCONSISTENT &&
1211 os.pdsk != D_UNKNOWN &&
1212 os.pdsk != D_OUTDATED)
1213 && (ns.pdsk < D_INCONSISTENT ||
1214 ns.pdsk == D_UNKNOWN ||
1215 ns.pdsk == D_OUTDATED)) {
1216 kfree(mdev->p_uuid);
1217 mdev->p_uuid = NULL;
1218 if (get_ldev(mdev)) {
1219 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1220 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1221 drbd_uuid_new_current(mdev);
1222 drbd_send_uuids(mdev);
1223 }
1224 put_ldev(mdev);
1225 }
1226 }
1227
1228 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1229 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1230 drbd_uuid_new_current(mdev);
1231
1232 /* D_DISKLESS Peer becomes secondary */
1233 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1234 drbd_al_to_on_disk_bm(mdev);
1235 put_ldev(mdev);
1236 }
1237
1238 /* Last part of the attaching process ... */
1239 if (ns.conn >= C_CONNECTED &&
1240 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1241 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1242 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
Philipp Reisnere89b5912010-03-24 17:11:33 +01001243 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001244 drbd_send_uuids(mdev);
1245 drbd_send_state(mdev);
1246 }
1247
1248 /* We want to pause/continue resync, tell peer. */
1249 if (ns.conn >= C_CONNECTED &&
1250 ((os.aftr_isp != ns.aftr_isp) ||
1251 (os.user_isp != ns.user_isp)))
1252 drbd_send_state(mdev);
1253
1254 /* In case one of the isp bits got set, suspend other devices. */
1255 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1256 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1257 suspend_other_sg(mdev);
1258
1259 /* Make sure the peer gets informed about eventual state
1260 changes (ISP bits) while we were in WFReportParams. */
1261 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1262 drbd_send_state(mdev);
1263
1264 /* We are in the progress to start a full sync... */
1265 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1266 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1267 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1268
1269 /* We are invalidating our self... */
1270 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1271 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1272 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1273
1274 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1275 enum drbd_io_error_p eh;
1276
1277 eh = EP_PASS_ON;
1278 if (get_ldev_if_state(mdev, D_FAILED)) {
1279 eh = mdev->ldev->dc.on_io_error;
1280 put_ldev(mdev);
1281 }
1282
1283 drbd_rs_cancel_all(mdev);
1284 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1285 and it is D_DISKLESS here, local_cnt can only go down, it can
1286 not increase... It will reach zero */
1287 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1288 mdev->rs_total = 0;
1289 mdev->rs_failed = 0;
1290 atomic_set(&mdev->rs_pending_cnt, 0);
1291
1292 spin_lock_irq(&mdev->req_lock);
1293 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1294 spin_unlock_irq(&mdev->req_lock);
1295
1296 if (eh == EP_CALL_HELPER)
1297 drbd_khelper(mdev, "local-io-error");
1298 }
1299
1300 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1301
1302 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1303 if (drbd_send_state(mdev))
1304 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1305 else
1306 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1307 }
1308
Philipp Reisner0a6dbf22009-12-28 16:58:38 +01001309 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001310 lc_destroy(mdev->resync);
1311 mdev->resync = NULL;
1312 lc_destroy(mdev->act_log);
1313 mdev->act_log = NULL;
1314 __no_warn(local,
1315 drbd_free_bc(mdev->ldev);
1316 mdev->ldev = NULL;);
1317
1318 if (mdev->md_io_tmpp)
1319 __free_page(mdev->md_io_tmpp);
1320 }
1321
1322 /* Disks got bigger while they were detached */
1323 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1324 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1325 if (ns.conn == C_CONNECTED)
1326 resync_after_online_grow(mdev);
1327 }
1328
1329 /* A resync finished or aborted, wake paused devices... */
1330 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1331 (os.peer_isp && !ns.peer_isp) ||
1332 (os.user_isp && !ns.user_isp))
1333 resume_next_sg(mdev);
1334
1335 /* Upon network connection, we need to start the receiver */
1336 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1337 drbd_thread_start(&mdev->receiver);
1338
1339 /* Terminate worker thread if we are unconfigured - it will be
1340 restarted as needed... */
1341 if (ns.disk == D_DISKLESS &&
1342 ns.conn == C_STANDALONE &&
1343 ns.role == R_SECONDARY) {
1344 if (os.aftr_isp != ns.aftr_isp)
1345 resume_next_sg(mdev);
1346 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1347 if (test_bit(DEVICE_DYING, &mdev->flags))
1348 drbd_thread_stop_nowait(&mdev->worker);
1349 }
1350
1351 drbd_md_sync(mdev);
1352}
1353
1354
1355static int drbd_thread_setup(void *arg)
1356{
1357 struct drbd_thread *thi = (struct drbd_thread *) arg;
1358 struct drbd_conf *mdev = thi->mdev;
1359 unsigned long flags;
1360 int retval;
1361
1362restart:
1363 retval = thi->function(thi);
1364
1365 spin_lock_irqsave(&thi->t_lock, flags);
1366
1367 /* if the receiver has been "Exiting", the last thing it did
1368 * was set the conn state to "StandAlone",
1369 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1370 * and receiver thread will be "started".
1371 * drbd_thread_start needs to set "Restarting" in that case.
1372 * t_state check and assignment needs to be within the same spinlock,
1373 * so either thread_start sees Exiting, and can remap to Restarting,
1374 * or thread_start see None, and can proceed as normal.
1375 */
1376
1377 if (thi->t_state == Restarting) {
1378 dev_info(DEV, "Restarting %s\n", current->comm);
1379 thi->t_state = Running;
1380 spin_unlock_irqrestore(&thi->t_lock, flags);
1381 goto restart;
1382 }
1383
1384 thi->task = NULL;
1385 thi->t_state = None;
1386 smp_mb();
1387 complete(&thi->stop);
1388 spin_unlock_irqrestore(&thi->t_lock, flags);
1389
1390 dev_info(DEV, "Terminating %s\n", current->comm);
1391
1392 /* Release mod reference taken when thread was started */
1393 module_put(THIS_MODULE);
1394 return retval;
1395}
1396
1397static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1398 int (*func) (struct drbd_thread *))
1399{
1400 spin_lock_init(&thi->t_lock);
1401 thi->task = NULL;
1402 thi->t_state = None;
1403 thi->function = func;
1404 thi->mdev = mdev;
1405}
1406
1407int drbd_thread_start(struct drbd_thread *thi)
1408{
1409 struct drbd_conf *mdev = thi->mdev;
1410 struct task_struct *nt;
1411 unsigned long flags;
1412
1413 const char *me =
1414 thi == &mdev->receiver ? "receiver" :
1415 thi == &mdev->asender ? "asender" :
1416 thi == &mdev->worker ? "worker" : "NONSENSE";
1417
1418 /* is used from state engine doing drbd_thread_stop_nowait,
1419 * while holding the req lock irqsave */
1420 spin_lock_irqsave(&thi->t_lock, flags);
1421
1422 switch (thi->t_state) {
1423 case None:
1424 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1425 me, current->comm, current->pid);
1426
1427 /* Get ref on module for thread - this is released when thread exits */
1428 if (!try_module_get(THIS_MODULE)) {
1429 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1430 spin_unlock_irqrestore(&thi->t_lock, flags);
1431 return FALSE;
1432 }
1433
1434 init_completion(&thi->stop);
1435 D_ASSERT(thi->task == NULL);
1436 thi->reset_cpu_mask = 1;
1437 thi->t_state = Running;
1438 spin_unlock_irqrestore(&thi->t_lock, flags);
1439 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1440
1441 nt = kthread_create(drbd_thread_setup, (void *) thi,
1442 "drbd%d_%s", mdev_to_minor(mdev), me);
1443
1444 if (IS_ERR(nt)) {
1445 dev_err(DEV, "Couldn't start thread\n");
1446
1447 module_put(THIS_MODULE);
1448 return FALSE;
1449 }
1450 spin_lock_irqsave(&thi->t_lock, flags);
1451 thi->task = nt;
1452 thi->t_state = Running;
1453 spin_unlock_irqrestore(&thi->t_lock, flags);
1454 wake_up_process(nt);
1455 break;
1456 case Exiting:
1457 thi->t_state = Restarting;
1458 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1459 me, current->comm, current->pid);
1460 /* fall through */
1461 case Running:
1462 case Restarting:
1463 default:
1464 spin_unlock_irqrestore(&thi->t_lock, flags);
1465 break;
1466 }
1467
1468 return TRUE;
1469}
1470
1471
1472void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1473{
1474 unsigned long flags;
1475
1476 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1477
1478 /* may be called from state engine, holding the req lock irqsave */
1479 spin_lock_irqsave(&thi->t_lock, flags);
1480
1481 if (thi->t_state == None) {
1482 spin_unlock_irqrestore(&thi->t_lock, flags);
1483 if (restart)
1484 drbd_thread_start(thi);
1485 return;
1486 }
1487
1488 if (thi->t_state != ns) {
1489 if (thi->task == NULL) {
1490 spin_unlock_irqrestore(&thi->t_lock, flags);
1491 return;
1492 }
1493
1494 thi->t_state = ns;
1495 smp_mb();
1496 init_completion(&thi->stop);
1497 if (thi->task != current)
1498 force_sig(DRBD_SIGKILL, thi->task);
1499
1500 }
1501
1502 spin_unlock_irqrestore(&thi->t_lock, flags);
1503
1504 if (wait)
1505 wait_for_completion(&thi->stop);
1506}
1507
1508#ifdef CONFIG_SMP
1509/**
1510 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1511 * @mdev: DRBD device.
1512 *
1513 * Forces all threads of a device onto the same CPU. This is beneficial for
1514 * DRBD's performance. May be overwritten by user's configuration.
1515 */
1516void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1517{
1518 int ord, cpu;
1519
1520 /* user override. */
1521 if (cpumask_weight(mdev->cpu_mask))
1522 return;
1523
1524 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1525 for_each_online_cpu(cpu) {
1526 if (ord-- == 0) {
1527 cpumask_set_cpu(cpu, mdev->cpu_mask);
1528 return;
1529 }
1530 }
1531 /* should not be reached */
1532 cpumask_setall(mdev->cpu_mask);
1533}
1534
1535/**
1536 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1537 * @mdev: DRBD device.
1538 *
1539 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1540 * prematurely.
1541 */
1542void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1543{
1544 struct task_struct *p = current;
1545 struct drbd_thread *thi =
1546 p == mdev->asender.task ? &mdev->asender :
1547 p == mdev->receiver.task ? &mdev->receiver :
1548 p == mdev->worker.task ? &mdev->worker :
1549 NULL;
1550 ERR_IF(thi == NULL)
1551 return;
1552 if (!thi->reset_cpu_mask)
1553 return;
1554 thi->reset_cpu_mask = 0;
1555 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1556}
1557#endif
1558
1559/* the appropriate socket mutex must be held already */
1560int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1561 enum drbd_packets cmd, struct p_header *h,
1562 size_t size, unsigned msg_flags)
1563{
1564 int sent, ok;
1565
1566 ERR_IF(!h) return FALSE;
1567 ERR_IF(!size) return FALSE;
1568
1569 h->magic = BE_DRBD_MAGIC;
1570 h->command = cpu_to_be16(cmd);
1571 h->length = cpu_to_be16(size-sizeof(struct p_header));
1572
Philipp Reisnerb411b362009-09-25 16:07:19 -07001573 sent = drbd_send(mdev, sock, h, size, msg_flags);
1574
1575 ok = (sent == size);
1576 if (!ok)
1577 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1578 cmdname(cmd), (int)size, sent);
1579 return ok;
1580}
1581
1582/* don't pass the socket. we may only look at it
1583 * when we hold the appropriate socket mutex.
1584 */
1585int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1586 enum drbd_packets cmd, struct p_header *h, size_t size)
1587{
1588 int ok = 0;
1589 struct socket *sock;
1590
1591 if (use_data_socket) {
1592 mutex_lock(&mdev->data.mutex);
1593 sock = mdev->data.socket;
1594 } else {
1595 mutex_lock(&mdev->meta.mutex);
1596 sock = mdev->meta.socket;
1597 }
1598
1599 /* drbd_disconnect() could have called drbd_free_sock()
1600 * while we were waiting in down()... */
1601 if (likely(sock != NULL))
1602 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1603
1604 if (use_data_socket)
1605 mutex_unlock(&mdev->data.mutex);
1606 else
1607 mutex_unlock(&mdev->meta.mutex);
1608 return ok;
1609}
1610
1611int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1612 size_t size)
1613{
1614 struct p_header h;
1615 int ok;
1616
1617 h.magic = BE_DRBD_MAGIC;
1618 h.command = cpu_to_be16(cmd);
1619 h.length = cpu_to_be16(size);
1620
1621 if (!drbd_get_data_sock(mdev))
1622 return 0;
1623
Philipp Reisnerb411b362009-09-25 16:07:19 -07001624 ok = (sizeof(h) ==
1625 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1626 ok = ok && (size ==
1627 drbd_send(mdev, mdev->data.socket, data, size, 0));
1628
1629 drbd_put_data_sock(mdev);
1630
1631 return ok;
1632}
1633
1634int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1635{
1636 struct p_rs_param_89 *p;
1637 struct socket *sock;
1638 int size, rv;
1639 const int apv = mdev->agreed_pro_version;
1640
1641 size = apv <= 87 ? sizeof(struct p_rs_param)
1642 : apv == 88 ? sizeof(struct p_rs_param)
1643 + strlen(mdev->sync_conf.verify_alg) + 1
1644 : /* 89 */ sizeof(struct p_rs_param_89);
1645
1646 /* used from admin command context and receiver/worker context.
1647 * to avoid kmalloc, grab the socket right here,
1648 * then use the pre-allocated sbuf there */
1649 mutex_lock(&mdev->data.mutex);
1650 sock = mdev->data.socket;
1651
1652 if (likely(sock != NULL)) {
1653 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1654
1655 p = &mdev->data.sbuf.rs_param_89;
1656
1657 /* initialize verify_alg and csums_alg */
1658 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1659
1660 p->rate = cpu_to_be32(sc->rate);
1661
1662 if (apv >= 88)
1663 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1664 if (apv >= 89)
1665 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1666
1667 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1668 } else
1669 rv = 0; /* not ok */
1670
1671 mutex_unlock(&mdev->data.mutex);
1672
1673 return rv;
1674}
1675
1676int drbd_send_protocol(struct drbd_conf *mdev)
1677{
1678 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001679 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001680
1681 size = sizeof(struct p_protocol);
1682
1683 if (mdev->agreed_pro_version >= 87)
1684 size += strlen(mdev->net_conf->integrity_alg) + 1;
1685
1686 /* we must not recurse into our own queue,
1687 * as that is blocked during handshake */
1688 p = kmalloc(size, GFP_NOIO);
1689 if (p == NULL)
1690 return 0;
1691
1692 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1693 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1694 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1695 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001696 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1697
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001698 cf = 0;
1699 if (mdev->net_conf->want_lose)
1700 cf |= CF_WANT_LOSE;
1701 if (mdev->net_conf->dry_run) {
1702 if (mdev->agreed_pro_version >= 92)
1703 cf |= CF_DRY_RUN;
1704 else {
1705 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001706 kfree(p);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001707 return 0;
1708 }
1709 }
1710 p->conn_flags = cpu_to_be32(cf);
1711
Philipp Reisnerb411b362009-09-25 16:07:19 -07001712 if (mdev->agreed_pro_version >= 87)
1713 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1714
1715 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1716 (struct p_header *)p, size);
1717 kfree(p);
1718 return rv;
1719}
1720
1721int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1722{
1723 struct p_uuids p;
1724 int i;
1725
1726 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1727 return 1;
1728
1729 for (i = UI_CURRENT; i < UI_SIZE; i++)
1730 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1731
1732 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1733 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1734 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1735 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1736 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1737 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1738
1739 put_ldev(mdev);
1740
1741 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1742 (struct p_header *)&p, sizeof(p));
1743}
1744
1745int drbd_send_uuids(struct drbd_conf *mdev)
1746{
1747 return _drbd_send_uuids(mdev, 0);
1748}
1749
1750int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1751{
1752 return _drbd_send_uuids(mdev, 8);
1753}
1754
1755
1756int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1757{
1758 struct p_rs_uuid p;
1759
1760 p.uuid = cpu_to_be64(val);
1761
1762 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1763 (struct p_header *)&p, sizeof(p));
1764}
1765
Philipp Reisnere89b5912010-03-24 17:11:33 +01001766int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001767{
1768 struct p_sizes p;
1769 sector_t d_size, u_size;
1770 int q_order_type;
1771 int ok;
1772
1773 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1774 D_ASSERT(mdev->ldev->backing_bdev);
1775 d_size = drbd_get_max_capacity(mdev->ldev);
1776 u_size = mdev->ldev->dc.disk_size;
1777 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001778 put_ldev(mdev);
1779 } else {
1780 d_size = 0;
1781 u_size = 0;
1782 q_order_type = QUEUE_ORDERED_NONE;
1783 }
1784
1785 p.d_size = cpu_to_be64(d_size);
1786 p.u_size = cpu_to_be64(u_size);
1787 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1788 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
Philipp Reisnere89b5912010-03-24 17:11:33 +01001789 p.queue_order_type = cpu_to_be16(q_order_type);
1790 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001791
1792 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1793 (struct p_header *)&p, sizeof(p));
1794 return ok;
1795}
1796
1797/**
1798 * drbd_send_state() - Sends the drbd state to the peer
1799 * @mdev: DRBD device.
1800 */
1801int drbd_send_state(struct drbd_conf *mdev)
1802{
1803 struct socket *sock;
1804 struct p_state p;
1805 int ok = 0;
1806
1807 /* Grab state lock so we wont send state if we're in the middle
1808 * of a cluster wide state change on another thread */
1809 drbd_state_lock(mdev);
1810
1811 mutex_lock(&mdev->data.mutex);
1812
1813 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1814 sock = mdev->data.socket;
1815
1816 if (likely(sock != NULL)) {
1817 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1818 (struct p_header *)&p, sizeof(p), 0);
1819 }
1820
1821 mutex_unlock(&mdev->data.mutex);
1822
1823 drbd_state_unlock(mdev);
1824 return ok;
1825}
1826
1827int drbd_send_state_req(struct drbd_conf *mdev,
1828 union drbd_state mask, union drbd_state val)
1829{
1830 struct p_req_state p;
1831
1832 p.mask = cpu_to_be32(mask.i);
1833 p.val = cpu_to_be32(val.i);
1834
1835 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1836 (struct p_header *)&p, sizeof(p));
1837}
1838
1839int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1840{
1841 struct p_req_state_reply p;
1842
1843 p.retcode = cpu_to_be32(retcode);
1844
1845 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1846 (struct p_header *)&p, sizeof(p));
1847}
1848
/*
 * Try to encode the next stretch of the bitmap, starting at c->bit_offset,
 * as a sequence of run lengths of alternating unset/set bits, each run
 * length being VLI encoded into p->code.
 *
 * @mdev: DRBD device.
 * @p:    packet to fill; code bytes, start value and pad bits are set here.
 * @c:    transfer context; bit_offset and word_offset are advanced on
 *        success.
 *
 * Returns
 *   > 0  number of code bytes used in p->code,
 *   0    nothing was encoded: RLE is disabled or the agreed protocol
 *        version is below 90, the end of the bitmap was already reached,
 *        the encoder reported an error, or the encoded form would not be
 *        smaller than the plain bits it covers,
 *   -1   a run of length zero was found after the first iteration.
 */
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
	struct p_compressed_bm *p,
	struct bm_xfer_ctx *c)
{
	struct bitstream bs;
	unsigned long plain_bits;
	unsigned long tmp;
	unsigned long rl;
	unsigned len;
	unsigned toggle;
	int bits;

	/* may we use this feature? */
	if ((mdev->sync_conf.use_rle == 0) ||
		(mdev->agreed_pro_version < 90))
			return 0;

	if (c->bit_offset >= c->bm_bits)
		return 0; /* nothing to do. */

	/* use at most thus many bytes */
	bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
	memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
	/* plain bits covered in this code string */
	plain_bits = 0;

	/* p->encoding & 0x80 stores whether the first run length is set.
	 * bit offset is implicit.
	 * start with toggle == 2 to be able to tell the first iteration */
	toggle = 2;

	/* see how much plain bits we can stuff into one packet
	 * using RLE and VLI. */
	do {
		/* toggle == 0: we are inside a run of set bits and look for
		 * its end; otherwise (1, or 2 on the first pass) we look for
		 * the next set bit, i.e. the end of a run of unset bits. */
		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
				    : _drbd_bm_find_next(mdev, c->bit_offset);
		/* no such bit: the run extends to the end of the bitmap */
		if (tmp == -1UL)
			tmp = c->bm_bits;
		rl = tmp - c->bit_offset;

		if (toggle == 2) { /* first iteration */
			if (rl == 0) {
				/* the first checked bit was set,
				 * store start value, */
				DCBP_set_start(p, 1);
				/* but skip encoding of zero run length */
				/* (!2 is 0, so the next pass measures the
				 * run of set bits starting right here) */
				toggle = !toggle;
				continue;
			}
			DCBP_set_start(p, 0);
		}

		/* paranoia: catch zero runlength.
		 * can only happen if bitmap is modified while we scan it. */
		if (rl == 0) {
			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
			    "t:%u bo:%lu\n", toggle, c->bit_offset);
			return -1;
		}

		bits = vli_encode_bits(&bs, rl);
		if (bits == -ENOBUFS) /* buffer full */
			break;
		if (bits <= 0) {
			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
			return 0;
		}

		/* run accepted: account for it and switch to the other kind */
		toggle = !toggle;
		plain_bits += rl;
		c->bit_offset = tmp;
	} while (c->bit_offset < c->bm_bits);

	/* bytes used so far; a partially filled last byte counts as one */
	len = bs.cur.b - p->code + !!bs.cur.bit;

	/* fewer plain bits covered than code bits (len * 8) spent? */
	if (plain_bits < (len << 3)) {
		/* incompressible with this method.
		 * we need to rewind both word and bit position. */
		c->bit_offset -= plain_bits;
		bm_xfer_ctx_bit_to_word_offset(c);
		c->bit_offset = c->word_offset * BITS_PER_LONG;
		return 0;
	}

	/* RLE + VLI was able to compress it just fine.
	 * update c->word_offset. */
	bm_xfer_ctx_bit_to_word_offset(c);

	/* store pad_bits */
	DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);

	return len;
}
1942
1943enum { OK, FAILED, DONE }
1944send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1945 struct p_header *h, struct bm_xfer_ctx *c)
1946{
1947 struct p_compressed_bm *p = (void*)h;
1948 unsigned long num_words;
1949 int len;
1950 int ok;
1951
1952 len = fill_bitmap_rle_bits(mdev, p, c);
1953
1954 if (len < 0)
1955 return FAILED;
1956
1957 if (len) {
1958 DCBP_set_code(p, RLE_VLI_Bits);
1959 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1960 sizeof(*p) + len, 0);
1961
1962 c->packets[0]++;
1963 c->bytes[0] += sizeof(*p) + len;
1964
1965 if (c->bit_offset >= c->bm_bits)
1966 len = 0; /* DONE */
1967 } else {
1968 /* was not compressible.
1969 * send a buffer full of plain text bits instead. */
1970 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1971 len = num_words * sizeof(long);
1972 if (len)
1973 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1974 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1975 h, sizeof(struct p_header) + len, 0);
1976 c->word_offset += num_words;
1977 c->bit_offset = c->word_offset * BITS_PER_LONG;
1978
1979 c->packets[1]++;
1980 c->bytes[1] += sizeof(struct p_header) + len;
1981
1982 if (c->bit_offset > c->bm_bits)
1983 c->bit_offset = c->bm_bits;
1984 }
1985 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1986
1987 if (ok == DONE)
1988 INFO_bm_xfer_stats(mdev, "send", c);
1989 return ok;
1990}
1991
1992/* See the comment at receive_bitmap() */
1993int _drbd_send_bitmap(struct drbd_conf *mdev)
1994{
1995 struct bm_xfer_ctx c;
1996 struct p_header *p;
1997 int ret;
1998
1999 ERR_IF(!mdev->bitmap) return FALSE;
2000
2001 /* maybe we should use some per thread scratch page,
2002 * and allocate that during initial device creation? */
2003 p = (struct p_header *) __get_free_page(GFP_NOIO);
2004 if (!p) {
2005 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2006 return FALSE;
2007 }
2008
2009 if (get_ldev(mdev)) {
2010 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2011 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2012 drbd_bm_set_all(mdev);
2013 if (drbd_bm_write(mdev)) {
2014 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2015 * but otherwise process as per normal - need to tell other
2016 * side that a full resync is required! */
2017 dev_err(DEV, "Failed to write bitmap to disk!\n");
2018 } else {
2019 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2020 drbd_md_sync(mdev);
2021 }
2022 }
2023 put_ldev(mdev);
2024 }
2025
2026 c = (struct bm_xfer_ctx) {
2027 .bm_bits = drbd_bm_bits(mdev),
2028 .bm_words = drbd_bm_words(mdev),
2029 };
2030
2031 do {
2032 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2033 } while (ret == OK);
2034
2035 free_page((unsigned long) p);
2036 return (ret == DONE);
2037}
2038
/*
 * Wraps _drbd_send_bitmap() in drbd_get_data_sock()/drbd_put_data_sock().
 *
 * Returns -1 if the data socket could not be obtained, 0 if the bitmap
 * was sent, 1 if sending failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int failed;

	if (!drbd_get_data_sock(mdev))
		return -1;

	failed = !_drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev);

	return failed;
}
2049
2050int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2051{
2052 int ok;
2053 struct p_barrier_ack p;
2054
2055 p.barrier = barrier_nr;
2056 p.set_size = cpu_to_be32(set_size);
2057
2058 if (mdev->state.conn < C_CONNECTED)
2059 return FALSE;
2060 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2061 (struct p_header *)&p, sizeof(p));
2062 return ok;
2063}
2064
2065/**
2066 * _drbd_send_ack() - Sends an ack packet
2067 * @mdev: DRBD device.
2068 * @cmd: Packet command code.
2069 * @sector: sector, needs to be in big endian byte order
2070 * @blksize: size in byte, needs to be in big endian byte order
2071 * @block_id: Id, big endian byte order
2072 */
2073static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2074 u64 sector,
2075 u32 blksize,
2076 u64 block_id)
2077{
2078 int ok;
2079 struct p_block_ack p;
2080
2081 p.sector = sector;
2082 p.block_id = block_id;
2083 p.blksize = blksize;
2084 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2085
2086 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2087 return FALSE;
2088 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2089 (struct p_header *)&p, sizeof(p));
2090 return ok;
2091}
2092
2093int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2094 struct p_data *dp)
2095{
2096 const int header_size = sizeof(struct p_data)
2097 - sizeof(struct p_header);
2098 int data_size = ((struct p_header *)dp)->length - header_size;
2099
2100 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2101 dp->block_id);
2102}
2103
2104int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2105 struct p_block_req *rp)
2106{
2107 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2108}
2109
2110/**
2111 * drbd_send_ack() - Sends an ack packet
2112 * @mdev: DRBD device.
2113 * @cmd: Packet command code.
2114 * @e: Epoch entry.
2115 */
2116int drbd_send_ack(struct drbd_conf *mdev,
2117 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2118{
2119 return _drbd_send_ack(mdev, cmd,
2120 cpu_to_be64(e->sector),
2121 cpu_to_be32(e->size),
2122 e->block_id);
2123}
2124
2125/* This function misuses the block_id field to signal if the blocks
2126 * are is sync or not. */
2127int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2128 sector_t sector, int blksize, u64 block_id)
2129{
2130 return _drbd_send_ack(mdev, cmd,
2131 cpu_to_be64(sector),
2132 cpu_to_be32(blksize),
2133 cpu_to_be64(block_id));
2134}
2135
2136int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2137 sector_t sector, int size, u64 block_id)
2138{
2139 int ok;
2140 struct p_block_req p;
2141
2142 p.sector = cpu_to_be64(sector);
2143 p.block_id = block_id;
2144 p.blksize = cpu_to_be32(size);
2145
2146 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2147 (struct p_header *)&p, sizeof(p));
2148 return ok;
2149}
2150
2151int drbd_send_drequest_csum(struct drbd_conf *mdev,
2152 sector_t sector, int size,
2153 void *digest, int digest_size,
2154 enum drbd_packets cmd)
2155{
2156 int ok;
2157 struct p_block_req p;
2158
2159 p.sector = cpu_to_be64(sector);
2160 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2161 p.blksize = cpu_to_be32(size);
2162
2163 p.head.magic = BE_DRBD_MAGIC;
2164 p.head.command = cpu_to_be16(cmd);
2165 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2166
2167 mutex_lock(&mdev->data.mutex);
2168
2169 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2170 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2171
2172 mutex_unlock(&mdev->data.mutex);
2173
2174 return ok;
2175}
2176
2177int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2178{
2179 int ok;
2180 struct p_block_req p;
2181
2182 p.sector = cpu_to_be64(sector);
2183 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2184 p.blksize = cpu_to_be32(size);
2185
2186 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2187 (struct p_header *)&p, sizeof(p));
2188 return ok;
2189}
2190
2191/* called on sndtimeo
2192 * returns FALSE if we should retry,
2193 * TRUE if we think connection is dead
2194 */
2195static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2196{
2197 int drop_it;
2198 /* long elapsed = (long)(jiffies - mdev->last_received); */
2199
2200 drop_it = mdev->meta.socket == sock
2201 || !mdev->asender.task
2202 || get_t_state(&mdev->asender) != Running
2203 || mdev->state.conn < C_CONNECTED;
2204
2205 if (drop_it)
2206 return TRUE;
2207
2208 drop_it = !--mdev->ko_count;
2209 if (!drop_it) {
2210 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2211 current->comm, current->pid, mdev->ko_count);
2212 request_ping(mdev);
2213 }
2214
2215 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2216}
2217
2218/* The idea of sendpage seems to be to put some kind of reference
2219 * to the page into the skb, and to hand it over to the NIC. In
2220 * this process get_page() gets called.
2221 *
2222 * As soon as the page was really sent over the network put_page()
2223 * gets called by some part of the network layer. [ NIC driver? ]
2224 *
2225 * [ get_page() / put_page() increment/decrement the count. If count
2226 * reaches 0 the page will be freed. ]
2227 *
2228 * This works nicely with pages from FSs.
2229 * But this means that in protocol A we might signal IO completion too early!
2230 *
2231 * In order not to corrupt data during a resync we must make sure
2232 * that we do not reuse our own buffer pages (EEs) to early, therefore
2233 * we have the net_ee list.
2234 *
2235 * XFS seems to have problems, still, it submits pages with page_count == 0!
2236 * As a workaround, we disable sendpage on pages
2237 * with page_count == 0 or PageSlab.
2238 */
2239static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2240 int offset, size_t size)
2241{
2242 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2243 kunmap(page);
2244 if (sent == size)
2245 mdev->send_cnt += size>>9;
2246 return sent == size;
2247}
2248
2249static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2250 int offset, size_t size)
2251{
2252 mm_segment_t oldfs = get_fs();
2253 int sent, ok;
2254 int len = size;
2255
2256 /* e.g. XFS meta- & log-data is in slab pages, which have a
2257 * page_count of 0 and/or have PageSlab() set.
2258 * we cannot use send_page for those, as that does get_page();
2259 * put_page(); and would cause either a VM_BUG directly, or
2260 * __page_cache_release a page that would actually still be referenced
2261 * by someone, leading to some obscure delayed Oops somewhere else. */
2262 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2263 return _drbd_no_send_page(mdev, page, offset, size);
2264
2265 drbd_update_congested(mdev);
2266 set_fs(KERNEL_DS);
2267 do {
2268 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2269 offset, len,
2270 MSG_NOSIGNAL);
2271 if (sent == -EAGAIN) {
2272 if (we_should_drop_the_connection(mdev,
2273 mdev->data.socket))
2274 break;
2275 else
2276 continue;
2277 }
2278 if (sent <= 0) {
2279 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2280 __func__, (int)size, len, sent);
2281 break;
2282 }
2283 len -= sent;
2284 offset += sent;
2285 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2286 set_fs(oldfs);
2287 clear_bit(NET_CONGESTED, &mdev->flags);
2288
2289 ok = (len == 0);
2290 if (likely(ok))
2291 mdev->send_cnt += size>>9;
2292 return ok;
2293}
2294
2295static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2296{
2297 struct bio_vec *bvec;
2298 int i;
2299 __bio_for_each_segment(bvec, bio, i, 0) {
2300 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2301 bvec->bv_offset, bvec->bv_len))
2302 return 0;
2303 }
2304 return 1;
2305}
2306
2307static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2308{
2309 struct bio_vec *bvec;
2310 int i;
2311 __bio_for_each_segment(bvec, bio, i, 0) {
2312 if (!_drbd_send_page(mdev, bvec->bv_page,
2313 bvec->bv_offset, bvec->bv_len))
2314 return 0;
2315 }
2316
2317 return 1;
2318}
2319
2320/* Used to send write requests
2321 * R_PRIMARY -> Peer (P_DATA)
2322 */
2323int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2324{
2325 int ok = 1;
2326 struct p_data p;
2327 unsigned int dp_flags = 0;
2328 void *dgb;
2329 int dgs;
2330
2331 if (!drbd_get_data_sock(mdev))
2332 return 0;
2333
2334 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2335 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2336
2337 p.head.magic = BE_DRBD_MAGIC;
2338 p.head.command = cpu_to_be16(P_DATA);
2339 p.head.length =
2340 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2341
2342 p.sector = cpu_to_be64(req->sector);
2343 p.block_id = (unsigned long)req;
2344 p.seq_num = cpu_to_be32(req->seq_num =
2345 atomic_add_return(1, &mdev->packet_seq));
2346 dp_flags = 0;
2347
2348 /* NOTE: no need to check if barriers supported here as we would
2349 * not pass the test in make_request_common in that case
2350 */
2351 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2352 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2353 /* dp_flags |= DP_HARDBARRIER; */
2354 }
2355 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2356 dp_flags |= DP_RW_SYNC;
2357 /* for now handle SYNCIO and UNPLUG
2358 * as if they still were one and the same flag */
2359 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2360 dp_flags |= DP_RW_SYNC;
2361 if (mdev->state.conn >= C_SYNC_SOURCE &&
2362 mdev->state.conn <= C_PAUSED_SYNC_T)
2363 dp_flags |= DP_MAY_SET_IN_SYNC;
2364
2365 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002366 set_bit(UNPLUG_REMOTE, &mdev->flags);
2367 ok = (sizeof(p) ==
2368 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2369 if (ok && dgs) {
2370 dgb = mdev->int_dig_out;
2371 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2372 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2373 }
2374 if (ok) {
2375 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2376 ok = _drbd_send_bio(mdev, req->master_bio);
2377 else
2378 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2379 }
2380
2381 drbd_put_data_sock(mdev);
2382 return ok;
2383}
2384
2385/* answer packet, used to send data back for read requests:
2386 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2387 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2388 */
2389int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2390 struct drbd_epoch_entry *e)
2391{
2392 int ok;
2393 struct p_data p;
2394 void *dgb;
2395 int dgs;
2396
2397 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2398 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2399
2400 p.head.magic = BE_DRBD_MAGIC;
2401 p.head.command = cpu_to_be16(cmd);
2402 p.head.length =
2403 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2404
2405 p.sector = cpu_to_be64(e->sector);
2406 p.block_id = e->block_id;
2407 /* p.seq_num = 0; No sequence numbers here.. */
2408
2409 /* Only called by our kernel thread.
2410 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2411 * in response to admin command or module unload.
2412 */
2413 if (!drbd_get_data_sock(mdev))
2414 return 0;
2415
Philipp Reisnerb411b362009-09-25 16:07:19 -07002416 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2417 sizeof(p), MSG_MORE);
2418 if (ok && dgs) {
2419 dgb = mdev->int_dig_out;
2420 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2421 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2422 }
2423 if (ok)
2424 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2425
2426 drbd_put_data_sock(mdev);
2427 return ok;
2428}
2429
2430/*
2431 drbd_send distinguishes two cases:
2432
2433 Packets sent via the data socket "sock"
2434 and packets sent via the meta data socket "msock"
2435
2436 sock msock
2437 -----------------+-------------------------+------------------------------
2438 timeout conf.timeout / 2 conf.timeout / 2
2439 timeout action send a ping via msock Abort communication
2440 and close all sockets
2441*/
2442
2443/*
2444 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2445 */
2446int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2447 void *buf, size_t size, unsigned msg_flags)
2448{
2449 struct kvec iov;
2450 struct msghdr msg;
2451 int rv, sent = 0;
2452
2453 if (!sock)
2454 return -1000;
2455
2456 /* THINK if (signal_pending) return ... ? */
2457
2458 iov.iov_base = buf;
2459 iov.iov_len = size;
2460
2461 msg.msg_name = NULL;
2462 msg.msg_namelen = 0;
2463 msg.msg_control = NULL;
2464 msg.msg_controllen = 0;
2465 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2466
2467 if (sock == mdev->data.socket) {
2468 mdev->ko_count = mdev->net_conf->ko_count;
2469 drbd_update_congested(mdev);
2470 }
2471 do {
2472 /* STRANGE
2473 * tcp_sendmsg does _not_ use its size parameter at all ?
2474 *
2475 * -EAGAIN on timeout, -EINTR on signal.
2476 */
2477/* THINK
2478 * do we need to block DRBD_SIG if sock == &meta.socket ??
2479 * otherwise wake_asender() might interrupt some send_*Ack !
2480 */
2481 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2482 if (rv == -EAGAIN) {
2483 if (we_should_drop_the_connection(mdev, sock))
2484 break;
2485 else
2486 continue;
2487 }
2488 D_ASSERT(rv != 0);
2489 if (rv == -EINTR) {
2490 flush_signals(current);
2491 rv = 0;
2492 }
2493 if (rv < 0)
2494 break;
2495 sent += rv;
2496 iov.iov_base += rv;
2497 iov.iov_len -= rv;
2498 } while (sent < size);
2499
2500 if (sock == mdev->data.socket)
2501 clear_bit(NET_CONGESTED, &mdev->flags);
2502
2503 if (rv <= 0) {
2504 if (rv != -EAGAIN) {
2505 dev_err(DEV, "%s_sendmsg returned %d\n",
2506 sock == mdev->meta.socket ? "msock" : "sock",
2507 rv);
2508 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2509 } else
2510 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2511 }
2512
2513 return sent;
2514}
2515
2516static int drbd_open(struct block_device *bdev, fmode_t mode)
2517{
2518 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2519 unsigned long flags;
2520 int rv = 0;
2521
2522 spin_lock_irqsave(&mdev->req_lock, flags);
2523 /* to have a stable mdev->state.role
2524 * and no race with updating open_cnt */
2525
2526 if (mdev->state.role != R_PRIMARY) {
2527 if (mode & FMODE_WRITE)
2528 rv = -EROFS;
2529 else if (!allow_oos)
2530 rv = -EMEDIUMTYPE;
2531 }
2532
2533 if (!rv)
2534 mdev->open_cnt++;
2535 spin_unlock_irqrestore(&mdev->req_lock, flags);
2536
2537 return rv;
2538}
2539
2540static int drbd_release(struct gendisk *gd, fmode_t mode)
2541{
2542 struct drbd_conf *mdev = gd->private_data;
2543 mdev->open_cnt--;
2544 return 0;
2545}
2546
2547static void drbd_unplug_fn(struct request_queue *q)
2548{
2549 struct drbd_conf *mdev = q->queuedata;
2550
Philipp Reisnerb411b362009-09-25 16:07:19 -07002551 /* unplug FIRST */
2552 spin_lock_irq(q->queue_lock);
2553 blk_remove_plug(q);
2554 spin_unlock_irq(q->queue_lock);
2555
2556 /* only if connected */
2557 spin_lock_irq(&mdev->req_lock);
2558 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2559 D_ASSERT(mdev->state.role == R_PRIMARY);
2560 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2561 /* add to the data.work queue,
2562 * unless already queued.
2563 * XXX this might be a good addition to drbd_queue_work
2564 * anyways, to detect "double queuing" ... */
2565 if (list_empty(&mdev->unplug_work.list))
2566 drbd_queue_work(&mdev->data.work,
2567 &mdev->unplug_work);
2568 }
2569 }
2570 spin_unlock_irq(&mdev->req_lock);
2571
2572 if (mdev->state.disk >= D_INCONSISTENT)
2573 drbd_kick_lo(mdev);
2574}
2575
2576static void drbd_set_defaults(struct drbd_conf *mdev)
2577{
2578 mdev->sync_conf.after = DRBD_AFTER_DEF;
2579 mdev->sync_conf.rate = DRBD_RATE_DEF;
2580 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2581 mdev->state = (union drbd_state) {
2582 { .role = R_SECONDARY,
2583 .peer = R_UNKNOWN,
2584 .conn = C_STANDALONE,
2585 .disk = D_DISKLESS,
2586 .pdsk = D_UNKNOWN,
2587 .susp = 0
2588 } };
2589}
2590
2591void drbd_init_set_defaults(struct drbd_conf *mdev)
2592{
2593 /* the memset(,0,) did most of this.
2594 * note: only assignments, no allocation in here */
2595
2596 drbd_set_defaults(mdev);
2597
2598 /* for now, we do NOT yet support it,
2599 * even though we start some framework
2600 * to eventually support barriers */
2601 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2602
2603 atomic_set(&mdev->ap_bio_cnt, 0);
2604 atomic_set(&mdev->ap_pending_cnt, 0);
2605 atomic_set(&mdev->rs_pending_cnt, 0);
2606 atomic_set(&mdev->unacked_cnt, 0);
2607 atomic_set(&mdev->local_cnt, 0);
2608 atomic_set(&mdev->net_cnt, 0);
2609 atomic_set(&mdev->packet_seq, 0);
2610 atomic_set(&mdev->pp_in_use, 0);
2611
2612 mutex_init(&mdev->md_io_mutex);
2613 mutex_init(&mdev->data.mutex);
2614 mutex_init(&mdev->meta.mutex);
2615 sema_init(&mdev->data.work.s, 0);
2616 sema_init(&mdev->meta.work.s, 0);
2617 mutex_init(&mdev->state_mutex);
2618
2619 spin_lock_init(&mdev->data.work.q_lock);
2620 spin_lock_init(&mdev->meta.work.q_lock);
2621
2622 spin_lock_init(&mdev->al_lock);
2623 spin_lock_init(&mdev->req_lock);
2624 spin_lock_init(&mdev->peer_seq_lock);
2625 spin_lock_init(&mdev->epoch_lock);
2626
2627 INIT_LIST_HEAD(&mdev->active_ee);
2628 INIT_LIST_HEAD(&mdev->sync_ee);
2629 INIT_LIST_HEAD(&mdev->done_ee);
2630 INIT_LIST_HEAD(&mdev->read_ee);
2631 INIT_LIST_HEAD(&mdev->net_ee);
2632 INIT_LIST_HEAD(&mdev->resync_reads);
2633 INIT_LIST_HEAD(&mdev->data.work.q);
2634 INIT_LIST_HEAD(&mdev->meta.work.q);
2635 INIT_LIST_HEAD(&mdev->resync_work.list);
2636 INIT_LIST_HEAD(&mdev->unplug_work.list);
2637 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2638 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2639 mdev->resync_work.cb = w_resync_inactive;
2640 mdev->unplug_work.cb = w_send_write_hint;
2641 mdev->md_sync_work.cb = w_md_sync;
2642 mdev->bm_io_work.w.cb = w_bitmap_io;
2643 init_timer(&mdev->resync_timer);
2644 init_timer(&mdev->md_sync_timer);
2645 mdev->resync_timer.function = resync_timer_fn;
2646 mdev->resync_timer.data = (unsigned long) mdev;
2647 mdev->md_sync_timer.function = md_sync_timer_fn;
2648 mdev->md_sync_timer.data = (unsigned long) mdev;
2649
2650 init_waitqueue_head(&mdev->misc_wait);
2651 init_waitqueue_head(&mdev->state_wait);
2652 init_waitqueue_head(&mdev->ee_wait);
2653 init_waitqueue_head(&mdev->al_wait);
2654 init_waitqueue_head(&mdev->seq_wait);
2655
2656 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2657 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2658 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2659
2660 mdev->agreed_pro_version = PRO_VERSION_MAX;
2661 mdev->write_ordering = WO_bio_barrier;
2662 mdev->resync_wenr = LC_FREE;
2663}
2664
2665void drbd_mdev_cleanup(struct drbd_conf *mdev)
2666{
2667 if (mdev->receiver.t_state != None)
2668 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2669 mdev->receiver.t_state);
2670
2671 /* no need to lock it, I'm the only thread alive */
2672 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2673 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2674 mdev->al_writ_cnt =
2675 mdev->bm_writ_cnt =
2676 mdev->read_cnt =
2677 mdev->recv_cnt =
2678 mdev->send_cnt =
2679 mdev->writ_cnt =
2680 mdev->p_size =
2681 mdev->rs_start =
2682 mdev->rs_total =
2683 mdev->rs_failed =
2684 mdev->rs_mark_left =
2685 mdev->rs_mark_time = 0;
2686 D_ASSERT(mdev->net_conf == NULL);
2687
2688 drbd_set_my_capacity(mdev, 0);
2689 if (mdev->bitmap) {
2690 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01002691 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002692 drbd_bm_cleanup(mdev);
2693 }
2694
2695 drbd_free_resources(mdev);
2696
2697 /*
2698 * currently we drbd_init_ee only on module load, so
2699 * we may do drbd_release_ee only on module unload!
2700 */
2701 D_ASSERT(list_empty(&mdev->active_ee));
2702 D_ASSERT(list_empty(&mdev->sync_ee));
2703 D_ASSERT(list_empty(&mdev->done_ee));
2704 D_ASSERT(list_empty(&mdev->read_ee));
2705 D_ASSERT(list_empty(&mdev->net_ee));
2706 D_ASSERT(list_empty(&mdev->resync_reads));
2707 D_ASSERT(list_empty(&mdev->data.work.q));
2708 D_ASSERT(list_empty(&mdev->meta.work.q));
2709 D_ASSERT(list_empty(&mdev->resync_work.list));
2710 D_ASSERT(list_empty(&mdev->unplug_work.list));
2711
2712}
2713
2714
2715static void drbd_destroy_mempools(void)
2716{
2717 struct page *page;
2718
2719 while (drbd_pp_pool) {
2720 page = drbd_pp_pool;
2721 drbd_pp_pool = (struct page *)page_private(page);
2722 __free_page(page);
2723 drbd_pp_vacant--;
2724 }
2725
2726 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2727
2728 if (drbd_ee_mempool)
2729 mempool_destroy(drbd_ee_mempool);
2730 if (drbd_request_mempool)
2731 mempool_destroy(drbd_request_mempool);
2732 if (drbd_ee_cache)
2733 kmem_cache_destroy(drbd_ee_cache);
2734 if (drbd_request_cache)
2735 kmem_cache_destroy(drbd_request_cache);
2736 if (drbd_bm_ext_cache)
2737 kmem_cache_destroy(drbd_bm_ext_cache);
2738 if (drbd_al_ext_cache)
2739 kmem_cache_destroy(drbd_al_ext_cache);
2740
2741 drbd_ee_mempool = NULL;
2742 drbd_request_mempool = NULL;
2743 drbd_ee_cache = NULL;
2744 drbd_request_cache = NULL;
2745 drbd_bm_ext_cache = NULL;
2746 drbd_al_ext_cache = NULL;
2747
2748 return;
2749}
2750
2751static int drbd_create_mempools(void)
2752{
2753 struct page *page;
2754 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2755 int i;
2756
2757 /* prepare our caches and mempools */
2758 drbd_request_mempool = NULL;
2759 drbd_ee_cache = NULL;
2760 drbd_request_cache = NULL;
2761 drbd_bm_ext_cache = NULL;
2762 drbd_al_ext_cache = NULL;
2763 drbd_pp_pool = NULL;
2764
2765 /* caches */
2766 drbd_request_cache = kmem_cache_create(
2767 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2768 if (drbd_request_cache == NULL)
2769 goto Enomem;
2770
2771 drbd_ee_cache = kmem_cache_create(
2772 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2773 if (drbd_ee_cache == NULL)
2774 goto Enomem;
2775
2776 drbd_bm_ext_cache = kmem_cache_create(
2777 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2778 if (drbd_bm_ext_cache == NULL)
2779 goto Enomem;
2780
2781 drbd_al_ext_cache = kmem_cache_create(
2782 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2783 if (drbd_al_ext_cache == NULL)
2784 goto Enomem;
2785
2786 /* mempools */
2787 drbd_request_mempool = mempool_create(number,
2788 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2789 if (drbd_request_mempool == NULL)
2790 goto Enomem;
2791
2792 drbd_ee_mempool = mempool_create(number,
2793 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2794 if (drbd_request_mempool == NULL)
2795 goto Enomem;
2796
2797 /* drbd's page pool */
2798 spin_lock_init(&drbd_pp_lock);
2799
2800 for (i = 0; i < number; i++) {
2801 page = alloc_page(GFP_HIGHUSER);
2802 if (!page)
2803 goto Enomem;
2804 set_page_private(page, (unsigned long)drbd_pp_pool);
2805 drbd_pp_pool = page;
2806 }
2807 drbd_pp_vacant = number;
2808
2809 return 0;
2810
2811Enomem:
2812 drbd_destroy_mempools(); /* in case we allocated some */
2813 return -ENOMEM;
2814}
2815
2816static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2817 void *unused)
2818{
2819 /* just so we have it. you never know what interesting things we
2820 * might want to do here some day...
2821 */
2822
2823 return NOTIFY_DONE;
2824}
2825
2826static struct notifier_block drbd_notifier = {
2827 .notifier_call = drbd_notify_sys,
2828};
2829
2830static void drbd_release_ee_lists(struct drbd_conf *mdev)
2831{
2832 int rr;
2833
2834 rr = drbd_release_ee(mdev, &mdev->active_ee);
2835 if (rr)
2836 dev_err(DEV, "%d EEs in active list found!\n", rr);
2837
2838 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2839 if (rr)
2840 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2841
2842 rr = drbd_release_ee(mdev, &mdev->read_ee);
2843 if (rr)
2844 dev_err(DEV, "%d EEs in read list found!\n", rr);
2845
2846 rr = drbd_release_ee(mdev, &mdev->done_ee);
2847 if (rr)
2848 dev_err(DEV, "%d EEs in done list found!\n", rr);
2849
2850 rr = drbd_release_ee(mdev, &mdev->net_ee);
2851 if (rr)
2852 dev_err(DEV, "%d EEs in net list found!\n", rr);
2853}
2854
2855/* caution. no locking.
2856 * currently only used from module cleanup code. */
2857static void drbd_delete_device(unsigned int minor)
2858{
2859 struct drbd_conf *mdev = minor_to_mdev(minor);
2860
2861 if (!mdev)
2862 return;
2863
2864 /* paranoia asserts */
2865 if (mdev->open_cnt != 0)
2866 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2867 __FILE__ , __LINE__);
2868
2869 ERR_IF (!list_empty(&mdev->data.work.q)) {
2870 struct list_head *lp;
2871 list_for_each(lp, &mdev->data.work.q) {
2872 dev_err(DEV, "lp = %p\n", lp);
2873 }
2874 };
2875 /* end paranoia asserts */
2876
2877 del_gendisk(mdev->vdisk);
2878
2879 /* cleanup stuff that may have been allocated during
2880 * device (re-)configuration or state changes */
2881
2882 if (mdev->this_bdev)
2883 bdput(mdev->this_bdev);
2884
2885 drbd_free_resources(mdev);
2886
2887 drbd_release_ee_lists(mdev);
2888
2889 /* should be free'd on disconnect? */
2890 kfree(mdev->ee_hash);
2891 /*
2892 mdev->ee_hash_s = 0;
2893 mdev->ee_hash = NULL;
2894 */
2895
2896 lc_destroy(mdev->act_log);
2897 lc_destroy(mdev->resync);
2898
2899 kfree(mdev->p_uuid);
2900 /* mdev->p_uuid = NULL; */
2901
2902 kfree(mdev->int_dig_out);
2903 kfree(mdev->int_dig_in);
2904 kfree(mdev->int_dig_vv);
2905
2906 /* cleanup the rest that has been
2907 * allocated from drbd_new_device
2908 * and actually free the mdev itself */
2909 drbd_free_mdev(mdev);
2910}
2911
2912static void drbd_cleanup(void)
2913{
2914 unsigned int i;
2915
2916 unregister_reboot_notifier(&drbd_notifier);
2917
2918 drbd_nl_cleanup();
2919
2920 if (minor_table) {
2921 if (drbd_proc)
2922 remove_proc_entry("drbd", NULL);
2923 i = minor_count;
2924 while (i--)
2925 drbd_delete_device(i);
2926 drbd_destroy_mempools();
2927 }
2928
2929 kfree(minor_table);
2930
2931 unregister_blkdev(DRBD_MAJOR, "drbd");
2932
2933 printk(KERN_INFO "drbd: module cleanup done.\n");
2934}
2935
2936/**
2937 * drbd_congested() - Callback for pdflush
2938 * @congested_data: User data
2939 * @bdi_bits: Bits pdflush is currently interested in
2940 *
2941 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2942 */
2943static int drbd_congested(void *congested_data, int bdi_bits)
2944{
2945 struct drbd_conf *mdev = congested_data;
2946 struct request_queue *q;
2947 char reason = '-';
2948 int r = 0;
2949
2950 if (!__inc_ap_bio_cond(mdev)) {
2951 /* DRBD has frozen IO */
2952 r = bdi_bits;
2953 reason = 'd';
2954 goto out;
2955 }
2956
2957 if (get_ldev(mdev)) {
2958 q = bdev_get_queue(mdev->ldev->backing_bdev);
2959 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2960 put_ldev(mdev);
2961 if (r)
2962 reason = 'b';
2963 }
2964
2965 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
2966 r |= (1 << BDI_async_congested);
2967 reason = reason == 'b' ? 'a' : 'n';
2968 }
2969
2970out:
2971 mdev->congestion_reason = reason;
2972 return r;
2973}
2974
2975struct drbd_conf *drbd_new_device(unsigned int minor)
2976{
2977 struct drbd_conf *mdev;
2978 struct gendisk *disk;
2979 struct request_queue *q;
2980
2981 /* GFP_KERNEL, we are outside of all write-out paths */
2982 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2983 if (!mdev)
2984 return NULL;
2985 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
2986 goto out_no_cpumask;
2987
2988 mdev->minor = minor;
2989
2990 drbd_init_set_defaults(mdev);
2991
2992 q = blk_alloc_queue(GFP_KERNEL);
2993 if (!q)
2994 goto out_no_q;
2995 mdev->rq_queue = q;
2996 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002997
2998 disk = alloc_disk(1);
2999 if (!disk)
3000 goto out_no_disk;
3001 mdev->vdisk = disk;
3002
3003 set_disk_ro(disk, TRUE);
3004
3005 disk->queue = q;
3006 disk->major = DRBD_MAJOR;
3007 disk->first_minor = minor;
3008 disk->fops = &drbd_ops;
3009 sprintf(disk->disk_name, "drbd%d", minor);
3010 disk->private_data = mdev;
3011
3012 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3013 /* we have no partitions. we contain only ourselves. */
3014 mdev->this_bdev->bd_contains = mdev->this_bdev;
3015
3016 q->backing_dev_info.congested_fn = drbd_congested;
3017 q->backing_dev_info.congested_data = mdev;
3018
3019 blk_queue_make_request(q, drbd_make_request_26);
Lars Ellenberg98ec2862010-01-21 19:33:14 +01003020 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003021 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3022 blk_queue_merge_bvec(q, drbd_merge_bvec);
3023 q->queue_lock = &mdev->req_lock; /* needed since we use */
3024 /* plugging on a queue, that actually has no requests! */
3025 q->unplug_fn = drbd_unplug_fn;
3026
3027 mdev->md_io_page = alloc_page(GFP_KERNEL);
3028 if (!mdev->md_io_page)
3029 goto out_no_io_page;
3030
3031 if (drbd_bm_init(mdev))
3032 goto out_no_bitmap;
3033 /* no need to lock access, we are still initializing this minor device. */
3034 if (!tl_init(mdev))
3035 goto out_no_tl;
3036
3037 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3038 if (!mdev->app_reads_hash)
3039 goto out_no_app_reads;
3040
3041 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3042 if (!mdev->current_epoch)
3043 goto out_no_epoch;
3044
3045 INIT_LIST_HEAD(&mdev->current_epoch->list);
3046 mdev->epochs = 1;
3047
3048 return mdev;
3049
3050/* out_whatever_else:
3051 kfree(mdev->current_epoch); */
3052out_no_epoch:
3053 kfree(mdev->app_reads_hash);
3054out_no_app_reads:
3055 tl_cleanup(mdev);
3056out_no_tl:
3057 drbd_bm_cleanup(mdev);
3058out_no_bitmap:
3059 __free_page(mdev->md_io_page);
3060out_no_io_page:
3061 put_disk(disk);
3062out_no_disk:
3063 blk_cleanup_queue(q);
3064out_no_q:
3065 free_cpumask_var(mdev->cpu_mask);
3066out_no_cpumask:
3067 kfree(mdev);
3068 return NULL;
3069}
3070
3071/* counterpart of drbd_new_device.
3072 * last part of drbd_delete_device. */
3073void drbd_free_mdev(struct drbd_conf *mdev)
3074{
3075 kfree(mdev->current_epoch);
3076 kfree(mdev->app_reads_hash);
3077 tl_cleanup(mdev);
3078 if (mdev->bitmap) /* should no longer be there. */
3079 drbd_bm_cleanup(mdev);
3080 __free_page(mdev->md_io_page);
3081 put_disk(mdev->vdisk);
3082 blk_cleanup_queue(mdev->rq_queue);
3083 free_cpumask_var(mdev->cpu_mask);
3084 kfree(mdev);
3085}
3086
3087
3088int __init drbd_init(void)
3089{
3090 int err;
3091
3092 if (sizeof(struct p_handshake) != 80) {
3093 printk(KERN_ERR
3094 "drbd: never change the size or layout "
3095 "of the HandShake packet.\n");
3096 return -EINVAL;
3097 }
3098
3099 if (1 > minor_count || minor_count > 255) {
3100 printk(KERN_ERR
3101 "drbd: invalid minor_count (%d)\n", minor_count);
3102#ifdef MODULE
3103 return -EINVAL;
3104#else
3105 minor_count = 8;
3106#endif
3107 }
3108
3109 err = drbd_nl_init();
3110 if (err)
3111 return err;
3112
3113 err = register_blkdev(DRBD_MAJOR, "drbd");
3114 if (err) {
3115 printk(KERN_ERR
3116 "drbd: unable to register block device major %d\n",
3117 DRBD_MAJOR);
3118 return err;
3119 }
3120
3121 register_reboot_notifier(&drbd_notifier);
3122
3123 /*
3124 * allocate all necessary structs
3125 */
3126 err = -ENOMEM;
3127
3128 init_waitqueue_head(&drbd_pp_wait);
3129
3130 drbd_proc = NULL; /* play safe for drbd_cleanup */
3131 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3132 GFP_KERNEL);
3133 if (!minor_table)
3134 goto Enomem;
3135
3136 err = drbd_create_mempools();
3137 if (err)
3138 goto Enomem;
3139
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003140 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003141 if (!drbd_proc) {
3142 printk(KERN_ERR "drbd: unable to register proc file\n");
3143 goto Enomem;
3144 }
3145
3146 rwlock_init(&global_state_lock);
3147
3148 printk(KERN_INFO "drbd: initialized. "
3149 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3150 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3151 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3152 printk(KERN_INFO "drbd: registered as block device major %d\n",
3153 DRBD_MAJOR);
3154 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3155
3156 return 0; /* Success! */
3157
3158Enomem:
3159 drbd_cleanup();
3160 if (err == -ENOMEM)
3161 /* currently always the case */
3162 printk(KERN_ERR "drbd: ran out of memory\n");
3163 else
3164 printk(KERN_ERR "drbd: initialization failure\n");
3165 return err;
3166}
3167
3168void drbd_free_bc(struct drbd_backing_dev *ldev)
3169{
3170 if (ldev == NULL)
3171 return;
3172
3173 bd_release(ldev->backing_bdev);
3174 bd_release(ldev->md_bdev);
3175
3176 fput(ldev->lo_file);
3177 fput(ldev->md_file);
3178
3179 kfree(ldev);
3180}
3181
/* Shuts down and releases the data and the meta socket of @mdev, if set.
 * For each socket the shutdown, the release and the reset of the pointer
 * to NULL happen under that socket's own mutex.
 * NOTE(review): the non-NULL test itself is done before the mutex is taken. */
void drbd_free_sock(struct drbd_conf *mdev)
{
	if (mdev->data.socket) {
		mutex_lock(&mdev->data.mutex);
		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
		sock_release(mdev->data.socket);
		mdev->data.socket = NULL;
		mutex_unlock(&mdev->data.mutex);
	}
	if (mdev->meta.socket) {
		mutex_lock(&mdev->meta.mutex);
		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
		sock_release(mdev->meta.socket);
		mdev->meta.socket = NULL;
		mutex_unlock(&mdev->meta.mutex);
	}
}
3199
3200
/* Drops the per-device resources of @mdev: frees each of the five hash
 * transforms and resets its pointer to NULL, then releases the sockets via
 * drbd_free_sock() and the backing device via drbd_free_bc(). */
void drbd_free_resources(struct drbd_conf *mdev)
{
	crypto_free_hash(mdev->csums_tfm);
	mdev->csums_tfm = NULL;
	crypto_free_hash(mdev->verify_tfm);
	mdev->verify_tfm = NULL;
	crypto_free_hash(mdev->cram_hmac_tfm);
	mdev->cram_hmac_tfm = NULL;
	crypto_free_hash(mdev->integrity_w_tfm);
	mdev->integrity_w_tfm = NULL;
	crypto_free_hash(mdev->integrity_r_tfm);
	mdev->integrity_r_tfm = NULL;

	drbd_free_sock(mdev);

	/* NOTE(review): __no_warn(local, ...) presumably silences the "local"
	 * context annotation for this ldev access -- confirm in drbd_int.h. */
	__no_warn(local,
		  drbd_free_bc(mdev->ldev);
		  mdev->ldev = NULL;);
}
3220
3221/* meta data management */
3222
/* Layout of the meta data super block as it is stored on disk.
 * drbd_md_sync() fills it using cpu_to_be32()/cpu_to_be64() and
 * drbd_md_read() decodes it with be32_to_cpu()/be64_to_cpu(), so all
 * multi-byte fields are big endian on disk.  The structure is __packed;
 * do not reorder or resize fields. */
struct meta_data_on_disk {
	u64 la_size;           /* last agreed size. */
	u64 uuid[UI_SIZE];   /* UUIDs. */
	u64 device_uuid;
	u64 reserved_u64_1;
	u32 flags;             /* MDF */
	u32 magic;             /* drbd_md_read() requires DRBD_MD_MAGIC here */
	u32 md_size_sect;
	u32 al_offset;         /* offset to this block */
	u32 al_nr_extents;     /* important for restoring the AL */
	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
	u32 bm_offset;         /* offset to the bitmap, from here */
	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
	u32 reserved_u32[4];

} __packed;
3239
3240/**
3241 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3242 * @mdev: DRBD device.
3243 */
3244void drbd_md_sync(struct drbd_conf *mdev)
3245{
3246 struct meta_data_on_disk *buffer;
3247 sector_t sector;
3248 int i;
3249
3250 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3251 return;
3252 del_timer(&mdev->md_sync_timer);
3253
3254 /* We use here D_FAILED and not D_ATTACHING because we try to write
3255 * metadata even if we detach due to a disk failure! */
3256 if (!get_ldev_if_state(mdev, D_FAILED))
3257 return;
3258
Philipp Reisnerb411b362009-09-25 16:07:19 -07003259 mutex_lock(&mdev->md_io_mutex);
3260 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3261 memset(buffer, 0, 512);
3262
3263 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3264 for (i = UI_CURRENT; i < UI_SIZE; i++)
3265 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3266 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3267 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3268
3269 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3270 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3271 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3272 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3273 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3274
3275 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3276
3277 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3278 sector = mdev->ldev->md.md_offset;
3279
3280 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3281 clear_bit(MD_DIRTY, &mdev->flags);
3282 } else {
3283 /* this was a try anyways ... */
3284 dev_err(DEV, "meta data update failed!\n");
3285
3286 drbd_chk_io_error(mdev, 1, TRUE);
3287 }
3288
3289 /* Update mdev->ldev->md.la_size_sect,
3290 * since we updated it on metadata. */
3291 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3292
3293 mutex_unlock(&mdev->md_io_mutex);
3294 put_ldev(mdev);
3295}
3296
3297/**
3298 * drbd_md_read() - Reads in the meta data super block
3299 * @mdev: DRBD device.
3300 * @bdev: Device from which the meta data should be read in.
3301 *
3302 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3303 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3304 */
3305int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3306{
3307 struct meta_data_on_disk *buffer;
3308 int i, rv = NO_ERROR;
3309
3310 if (!get_ldev_if_state(mdev, D_ATTACHING))
3311 return ERR_IO_MD_DISK;
3312
Philipp Reisnerb411b362009-09-25 16:07:19 -07003313 mutex_lock(&mdev->md_io_mutex);
3314 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3315
3316 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3317 /* NOTE: cant do normal error processing here as this is
3318 called BEFORE disk is attached */
3319 dev_err(DEV, "Error while reading metadata.\n");
3320 rv = ERR_IO_MD_DISK;
3321 goto err;
3322 }
3323
3324 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3325 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3326 rv = ERR_MD_INVALID;
3327 goto err;
3328 }
3329 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3330 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3331 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3332 rv = ERR_MD_INVALID;
3333 goto err;
3334 }
3335 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3336 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3337 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3338 rv = ERR_MD_INVALID;
3339 goto err;
3340 }
3341 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3342 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3343 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3344 rv = ERR_MD_INVALID;
3345 goto err;
3346 }
3347
3348 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3349 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3350 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3351 rv = ERR_MD_INVALID;
3352 goto err;
3353 }
3354
3355 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3356 for (i = UI_CURRENT; i < UI_SIZE; i++)
3357 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3358 bdev->md.flags = be32_to_cpu(buffer->flags);
3359 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3360 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3361
3362 if (mdev->sync_conf.al_extents < 7)
3363 mdev->sync_conf.al_extents = 127;
3364
3365 err:
3366 mutex_unlock(&mdev->md_io_mutex);
3367 put_ldev(mdev);
3368
3369 return rv;
3370}
3371
/**
 * drbd_md_mark_dirty() - Mark meta data super block as dirty
 * @mdev:	DRBD device.
 *
 * Call this function if you change anything that should be written to
 * the meta-data super block. This function sets MD_DIRTY, and (re)arms
 * md_sync_timer to expire five seconds from now; callers are expected to
 * call drbd_md_sync() before that.
 */
void drbd_md_mark_dirty(struct drbd_conf *mdev)
{
	set_bit(MD_DIRTY, &mdev->flags);
	mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
}
3385
3386
3387static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3388{
3389 int i;
3390
Jens Axboe6a0afdf2009-10-01 09:04:14 +02003391 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003392 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003393}
3394
3395void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3396{
3397 if (idx == UI_CURRENT) {
3398 if (mdev->state.role == R_PRIMARY)
3399 val |= 1;
3400 else
3401 val &= ~((u64)1);
3402
3403 drbd_set_ed_uuid(mdev, val);
3404 }
3405
3406 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003407 drbd_md_mark_dirty(mdev);
3408}
3409
3410
3411void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3412{
3413 if (mdev->ldev->md.uuid[idx]) {
3414 drbd_uuid_move_history(mdev);
3415 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003416 }
3417 _drbd_uuid_set(mdev, idx, val);
3418}
3419
3420/**
3421 * drbd_uuid_new_current() - Creates a new current UUID
3422 * @mdev: DRBD device.
3423 *
3424 * Creates a new current UUID, and rotates the old current UUID into
3425 * the bitmap slot. Causes an incremental resync upon next connect.
3426 */
3427void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3428{
3429 u64 val;
3430
3431 dev_info(DEV, "Creating new current UUID\n");
3432 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3433 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003434
3435 get_random_bytes(&val, sizeof(u64));
3436 _drbd_uuid_set(mdev, UI_CURRENT, val);
3437}
3438
3439void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3440{
3441 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3442 return;
3443
3444 if (val == 0) {
3445 drbd_uuid_move_history(mdev);
3446 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3447 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003448 } else {
3449 if (mdev->ldev->md.uuid[UI_BITMAP])
3450 dev_warn(DEV, "bm UUID already set");
3451
3452 mdev->ldev->md.uuid[UI_BITMAP] = val;
3453 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3454
Philipp Reisnerb411b362009-09-25 16:07:19 -07003455 }
3456 drbd_md_mark_dirty(mdev);
3457}
3458
3459/**
3460 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3461 * @mdev: DRBD device.
3462 *
3463 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3464 */
3465int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3466{
3467 int rv = -EIO;
3468
3469 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3470 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3471 drbd_md_sync(mdev);
3472 drbd_bm_set_all(mdev);
3473
3474 rv = drbd_bm_write(mdev);
3475
3476 if (!rv) {
3477 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3478 drbd_md_sync(mdev);
3479 }
3480
3481 put_ldev(mdev);
3482 }
3483
3484 return rv;
3485}
3486
3487/**
3488 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3489 * @mdev: DRBD device.
3490 *
3491 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3492 */
3493int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3494{
3495 int rv = -EIO;
3496
3497 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3498 drbd_bm_clear_all(mdev);
3499 rv = drbd_bm_write(mdev);
3500 put_ldev(mdev);
3501 }
3502
3503 return rv;
3504}
3505
/* Work callback that performs a queued whole-bitmap IO.
 *
 * @w is embedded in a struct bm_io_work.  Its io_fn is run with the bitmap
 * locked; afterwards BITMAP_IO is cleared and waiters on misc_wait are woken
 * up, the optional done callback is invoked with io_fn's result, and finally
 * BITMAP_IO_QUEUED and work->why are reset.  Always returns 1. */
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
	int rv;

	/* application IO is expected to be drained at this point */
	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);

	drbd_bm_lock(mdev, work->why);
	rv = work->io_fn(mdev);
	drbd_bm_unlock(mdev);

	clear_bit(BITMAP_IO, &mdev->flags);
	wake_up(&mdev->misc_wait);

	if (work->done)
		work->done(mdev, rv);

	/* only now may the next bitmap IO be queued */
	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
	work->why = NULL;

	return 1;
}
3528
3529/**
3530 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3531 * @mdev: DRBD device.
3532 * @io_fn: IO callback to be called when bitmap IO is possible
3533 * @done: callback to be called after the bitmap IO was performed
3534 * @why: Descriptive text of the reason for doing the IO
3535 *
3536 * While IO on the bitmap happens we freeze application IO thus we ensure
3537 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3538 * called from worker context. It MUST NOT be used while a previous such
3539 * work is still pending!
3540 */
3541void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3542 int (*io_fn)(struct drbd_conf *),
3543 void (*done)(struct drbd_conf *, int),
3544 char *why)
3545{
3546 D_ASSERT(current == mdev->worker.task);
3547
3548 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3549 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3550 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3551 if (mdev->bm_io_work.why)
3552 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3553 why, mdev->bm_io_work.why);
3554
3555 mdev->bm_io_work.io_fn = io_fn;
3556 mdev->bm_io_work.done = done;
3557 mdev->bm_io_work.why = why;
3558
3559 set_bit(BITMAP_IO, &mdev->flags);
3560 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3561 if (list_empty(&mdev->bm_io_work.w.list)) {
3562 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3563 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3564 } else
3565 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3566 }
3567}
3568
/**
 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
 * @mdev:	DRBD device.
 * @io_fn:	IO callback to be called when bitmap IO is possible
 * @why:	Descriptive text of the reason for doing the IO
 *
 * Freezes application IO while the actual IO operation runs. This
 * function MUST NOT be called from worker context.
 *
 * Returns whatever @io_fn returned.
 */
int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
{
	int rv;

	D_ASSERT(current != mdev->worker.task);

	drbd_suspend_io(mdev);

	/* io_fn runs with the bitmap locked and application IO suspended */
	drbd_bm_lock(mdev, why);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}
3594
3595void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3596{
3597 if ((mdev->ldev->md.flags & flag) != flag) {
3598 drbd_md_mark_dirty(mdev);
3599 mdev->ldev->md.flags |= flag;
3600 }
3601}
3602
3603void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3604{
3605 if ((mdev->ldev->md.flags & flag) != 0) {
3606 drbd_md_mark_dirty(mdev);
3607 mdev->ldev->md.flags &= ~flag;
3608 }
3609}
3610int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3611{
3612 return (bdev->md.flags & flag) != 0;
3613}
3614
/* Timer callback (presumably installed for mdev->md_sync_timer -- the setup
 * is not in this part of the file).  @data carries the struct drbd_conf
 * pointer.  The actual meta data write is handed to the worker by queuing
 * md_sync_work via drbd_queue_work_front(). */
static void md_sync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
}
3621
/* Work callback: logs a warning that the md_sync timer expired and writes
 * the meta data through drbd_md_sync().  @w and @unused are not used.
 * Always returns 1. */
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
	drbd_md_sync(mdev);

	return 1;
}
3629
3630#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;	/* current value of the generator */
	unsigned long count;	/* calls left before state is mixed with
				 * get_random_bytes() again */
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000	/* count is reset to this value */
3641
3642/*
3643 * Crude but fast random-number generator. Uses a linear congruential
3644 * generator, with occasional help from get_random_bytes().
3645 */
3646static unsigned long
3647_drbd_fault_random(struct fault_random_state *rsp)
3648{
3649 long refresh;
3650
Roel Kluin49829ea2009-12-15 22:55:44 +01003651 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003652 get_random_bytes(&refresh, sizeof(refresh));
3653 rsp->state += refresh;
3654 rsp->count = FAULT_RANDOM_REFRESH;
3655 }
3656 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3657 return swahw32(rsp->state);
3658}
3659
3660static char *
3661_drbd_fault_str(unsigned int type) {
3662 static char *_faults[] = {
3663 [DRBD_FAULT_MD_WR] = "Meta-data write",
3664 [DRBD_FAULT_MD_RD] = "Meta-data read",
3665 [DRBD_FAULT_RS_WR] = "Resync write",
3666 [DRBD_FAULT_RS_RD] = "Resync read",
3667 [DRBD_FAULT_DT_WR] = "Data write",
3668 [DRBD_FAULT_DT_RD] = "Data read",
3669 [DRBD_FAULT_DT_RA] = "Data read ahead",
3670 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02003671 [DRBD_FAULT_AL_EE] = "EE allocation",
3672 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07003673 };
3674
3675 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3676}
3677
3678unsigned int
3679_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3680{
3681 static struct fault_random_state rrs = {0, 0};
3682
3683 unsigned int ret = (
3684 (fault_devs == 0 ||
3685 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3686 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3687
3688 if (ret) {
3689 fault_count++;
3690
3691 if (printk_ratelimit())
3692 dev_warn(DEV, "***Simulating %s failure\n",
3693 _drbd_fault_str(type));
3694 }
3695
3696 return ret;
3697}
3698#endif
3699
/* Returns a string identifying this build.  The string is generated on the
 * first call and cached in a static buffer: "srcversion: ..." when a module
 * object is available, "built-in" otherwise. */
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	/* The leading NUL marks the buffer as not yet initialized; replacing
	 * it with 'b' turns the rest into "built-in". */
	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded: "%-24s" is only a minimum field width */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
3718
/* Module entry and exit points. */
module_init(drbd_init)
module_exit(drbd_cleanup)

/* Symbols made available to other kernel modules. */
EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);