/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

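/* Carries a finished state transition to the worker: w_after_state_ch() runs
 * after_state_ch(os, ns, flags) outside the req_lock and completes 'done'
 * when the caller asked for CS_WAIT_COMPLETE. */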
struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = 32;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

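/* log rate limiting: at most 5 messages per 5*HZ jiffies (i.e. 5 seconds) */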
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
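/*
 * Rough picture, as built up by the code below:
 *
 *   mdev->oldest_tle --next--> ... --next--> mdev->newest_tle --> NULL
 *
 * with each epoch's ->requests holding the struct drbd_request objects
 * (chained via req->tl_requests) submitted since the previous barrier.
 */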
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
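	/* (GCC's "x ?: y" shorthand evaluates to x unless x is 0, so a wrapped counter becomes 1) */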
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch object, this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the lists head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}

/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
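	/* keep the bits outside 'mask' from the old state, take the bits inside 'mask' from 'val' */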
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform a possibly cluster wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
		  union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort:
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Do not allow network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn_sync_abort)
			*warn_sync_abort =
				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				"Online-verify" : "Resync";
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection state on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	const char *warn_sync_abort = NULL;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	if (warn_sync_abort)
		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);

	{
	char *pbp, pb[300];
	pbp = pb;
	*pbp = 0;
	if (ns.role != os.role)
		pbp += sprintf(pbp, "role( %s -> %s ) ",
			       drbd_role_str(os.role),
			       drbd_role_str(ns.role));
	if (ns.peer != os.peer)
		pbp += sprintf(pbp, "peer( %s -> %s ) ",
			       drbd_role_str(os.peer),
			       drbd_role_str(ns.peer));
	if (ns.conn != os.conn)
		pbp += sprintf(pbp, "conn( %s -> %s ) ",
			       drbd_conn_str(os.conn),
			       drbd_conn_str(ns.conn));
	if (ns.disk != os.disk)
		pbp += sprintf(pbp, "disk( %s -> %s ) ",
			       drbd_disk_str(os.disk),
			       drbd_disk_str(ns.disk));
	if (ns.pdsk != os.pdsk)
		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
			       drbd_disk_str(os.pdsk),
			       drbd_disk_str(ns.pdsk));
	if (is_susp(ns) != is_susp(os))
		pbp += sprintf(pbp, "susp( %d -> %d ) ",
			       is_susp(os),
			       is_susp(ns));
	if (ns.aftr_isp != os.aftr_isp)
		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
			       os.aftr_isp,
			       ns.aftr_isp);
	if (ns.peer_isp != os.peer_isp)
		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
			       os.peer_isp,
			       ns.peer_isp);
	if (ns.user_isp != os.user_isp)
		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
			       os.user_isp,
			       ns.user_isp);
	dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;
	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

1260static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1261{
1262 struct after_state_chg_work *ascw =
1263 container_of(w, struct after_state_chg_work, w);
1264 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1265 if (ascw->flags & CS_WAIT_COMPLETE) {
1266 D_ASSERT(ascw->done != NULL);
1267 complete(ascw->done);
1268 }
1269 kfree(ascw);
1270
1271 return 1;
1272}
1273
1274static void abw_start_sync(struct drbd_conf *mdev, int rv)
1275{
1276 if (rv) {
1277 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1278 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1279 return;
1280 }
1281
1282 switch (mdev->state.conn) {
1283 case C_STARTING_SYNC_T:
1284 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1285 break;
1286 case C_STARTING_SYNC_S:
1287 drbd_start_resync(mdev, C_SYNC_SOURCE);
1288 break;
1289 }
1290}
1291
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001292int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
1293{
1294 int rv;
1295
1296 D_ASSERT(current == mdev->worker.task);
1297
1298 /* open coded non-blocking drbd_suspend_io(mdev); */
1299 set_bit(SUSPEND_IO, &mdev->flags);
1300 if (!is_susp(mdev->state))
1301 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
1302
1303 drbd_bm_lock(mdev, why);
1304 rv = io_fn(mdev);
1305 drbd_bm_unlock(mdev);
1306
1307 drbd_resume_io(mdev);
1308
1309 return rv;
1310}
1311
Philipp Reisnerb411b362009-09-25 16:07:19 -07001312/**
1313 * after_state_ch() - Perform after state change actions that may sleep
1314 * @mdev: DRBD device.
1315 * @os: old state.
1316 * @ns: new state.
1317 * @flags: Flags
1318 */
1319static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1320 union drbd_state ns, enum chg_state_flags flags)
1321{
1322 enum drbd_fencing_p fp;
Philipp Reisner67098932010-06-24 16:24:25 +02001323 enum drbd_req_event what = nothing;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001324 union drbd_state nsm = (union drbd_state){ .i = -1 };
Philipp Reisnerb411b362009-09-25 16:07:19 -07001325
1326 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1327 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1328 if (mdev->p_uuid)
1329 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1330 }
1331
1332 fp = FP_DONT_CARE;
1333 if (get_ldev(mdev)) {
1334 fp = mdev->ldev->dc.fencing;
1335 put_ldev(mdev);
1336 }
1337
1338 /* Inform userspace about the change... */
1339 drbd_bcast_state(mdev, ns);
1340
1341 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1342 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1343 drbd_khelper(mdev, "pri-on-incon-degr");
1344
1345 /* Here we have the actions that are performed after a
1346 state change. This function might sleep */
1347
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001348 nsm.i = -1;
1349 if (ns.susp_nod) {
Philipp Reisner265be2d2010-05-31 10:14:17 +02001350 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
Philipp Reisner67098932010-06-24 16:24:25 +02001351 if (ns.conn == C_CONNECTED)
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001352 what = resend, nsm.susp_nod = 0;
Philipp Reisner67098932010-06-24 16:24:25 +02001353 else /* ns.conn > C_CONNECTED */
Lars Ellenberg418e0a92010-12-18 13:36:54 +01001354 dev_err(DEV, "Unexpected Resync going on!\n");
Philipp Reisner265be2d2010-05-31 10:14:17 +02001355 }
1356
Philipp Reisner67098932010-06-24 16:24:25 +02001357 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001358 what = restart_frozen_disk_io, nsm.susp_nod = 0;
1359
Philipp Reisner265be2d2010-05-31 10:14:17 +02001360 }
1361
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001362 if (ns.susp_fen) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001363 /* case1: The outdate peer handler is successful: */
1364 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001365 tl_clear(mdev);
Philipp Reisner43a51822010-06-11 11:26:34 +02001366 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1367 drbd_uuid_new_current(mdev);
1368 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner43a51822010-06-11 11:26:34 +02001369 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001370 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001371 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001372 spin_unlock_irq(&mdev->req_lock);
1373 }
Philipp Reisner43a51822010-06-11 11:26:34 +02001374 /* case2: The connection was established again: */
1375 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1376 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner67098932010-06-24 16:24:25 +02001377 what = resend;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001378 nsm.susp_fen = 0;
Philipp Reisner43a51822010-06-11 11:26:34 +02001379 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001380 }
Philipp Reisner67098932010-06-24 16:24:25 +02001381
1382 if (what != nothing) {
1383 spin_lock_irq(&mdev->req_lock);
1384 _tl_restart(mdev, what);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001385 nsm.i &= mdev->state.i;
1386 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
Philipp Reisner67098932010-06-24 16:24:25 +02001387 spin_unlock_irq(&mdev->req_lock);
1388 }
1389
Lars Ellenberg5a22db82010-12-17 21:14:23 +01001390 /* Became sync source. With protocol >= 96, we still need to send out
1391 * the sync uuid now. Need to do that before any drbd_send_state, or
1392 * the other side may go "paused sync" before receiving the sync uuids,
1393 * which is unexpected. */
1394 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1395 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1396 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1397 drbd_gen_and_send_sync_uuid(mdev);
1398 put_ldev(mdev);
1399 }
1400
Philipp Reisnerb411b362009-09-25 16:07:19 -07001401 /* Do not change the order of the if above and the two below... */
1402 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1403 drbd_send_uuids(mdev);
1404 drbd_send_state(mdev);
1405 }
1406 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1407 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1408
1409 /* Lost contact to peer's copy of the data */
1410 if ((os.pdsk >= D_INCONSISTENT &&
1411 os.pdsk != D_UNKNOWN &&
1412 os.pdsk != D_OUTDATED)
1413 && (ns.pdsk < D_INCONSISTENT ||
1414 ns.pdsk == D_UNKNOWN ||
1415 ns.pdsk == D_OUTDATED)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001416 if (get_ldev(mdev)) {
1417 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001418 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001419 if (is_susp(mdev->state)) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001420 set_bit(NEW_CUR_UUID, &mdev->flags);
1421 } else {
1422 drbd_uuid_new_current(mdev);
1423 drbd_send_uuids(mdev);
1424 }
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001425 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001426 put_ldev(mdev);
1427 }
1428 }
1429
1430 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001431 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001432 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001433 drbd_send_uuids(mdev);
1434 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001435
1436 /* D_DISKLESS Peer becomes secondary */
1437 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001438 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, "demote diskless peer");
1439 put_ldev(mdev);
1440 }
1441
1442 if (os.role == R_PRIMARY && ns.role == R_SECONDARY && get_ldev(mdev)) {
1443 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, "demote");
Philipp Reisnerb411b362009-09-25 16:07:19 -07001444 put_ldev(mdev);
1445 }
1446
1447 /* Last part of the attaching process ... */
1448 if (ns.conn >= C_CONNECTED &&
1449 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001450 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001451 drbd_send_uuids(mdev);
1452 drbd_send_state(mdev);
1453 }
1454
1455 /* We want to pause/continue resync, tell peer. */
1456 if (ns.conn >= C_CONNECTED &&
1457 ((os.aftr_isp != ns.aftr_isp) ||
1458 (os.user_isp != ns.user_isp)))
1459 drbd_send_state(mdev);
1460
1461 /* In case one of the isp bits got set, suspend other devices. */
1462 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1463 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1464 suspend_other_sg(mdev);
1465
1466 /* Make sure the peer gets informed about eventual state
1467 changes (ISP bits) while we were in WFReportParams. */
1468 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1469 drbd_send_state(mdev);
1470
Philipp Reisner67531712010-10-27 12:21:30 +02001471 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1472 drbd_send_state(mdev);
1473
Philipp Reisnerb411b362009-09-25 16:07:19 -07001474 /* We are in the progress to start a full sync... */
1475 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1476 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1477 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1478
1479 /* We are invalidating our self... */
1480 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1481 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1482 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1483
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001484 /* first half of local IO error, failure to attach,
1485 * or administrative detach */
1486 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1487 enum drbd_io_error_p eh;
1488 int was_io_error;
1489 /* corresponding get_ldev was in __drbd_set_state, to serialize
1490 * our cleanup here with the transition to D_DISKLESS,
1491		 * so it is safe to dereference ldev here. */
1492 eh = mdev->ldev->dc.on_io_error;
1493 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1494
1495 /* current state still has to be D_FAILED,
1496 * there is only one way out: to D_DISKLESS,
1497 * and that may only happen after our put_ldev below. */
1498 if (mdev->state.disk != D_FAILED)
1499 dev_err(DEV,
1500 "ASSERT FAILED: disk is %s during detach\n",
1501 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001502
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001503 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001504 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001505 else
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001506 dev_err(DEV, "Sending state for detaching disk failed\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001507
1508 drbd_rs_cancel_all(mdev);
1509
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001510 /* In case we want to get something to stable storage still,
1511 * this may be the last chance.
1512 * Following put_ldev may transition to D_DISKLESS. */
1513 drbd_md_sync(mdev);
1514 put_ldev(mdev);
1515
1516 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001517 drbd_khelper(mdev, "local-io-error");
1518 }
1519
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001520 /* second half of local IO error, failure to attach,
1521 * or administrative detach,
1522 * after local_cnt references have reached zero again */
1523 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1524 /* We must still be diskless,
1525 * re-attach has to be serialized with this! */
1526 if (mdev->state.disk != D_DISKLESS)
1527 dev_err(DEV,
1528 "ASSERT FAILED: disk is %s while going diskless\n",
1529 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001530
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001531 mdev->rs_total = 0;
1532 mdev->rs_failed = 0;
1533 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001534
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001535 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001536 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001537 else
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001538 dev_err(DEV, "Sending state for being diskless failed\n");
1539 /* corresponding get_ldev in __drbd_set_state
1540		 * this may finally trigger drbd_ldev_destroy. */
1541 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001542 }
1543
1544 /* Disks got bigger while they were detached */
1545 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1546 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1547 if (ns.conn == C_CONNECTED)
1548 resync_after_online_grow(mdev);
1549 }
1550
1551 /* A resync finished or aborted, wake paused devices... */
1552 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1553 (os.peer_isp && !ns.peer_isp) ||
1554 (os.user_isp && !ns.user_isp))
1555 resume_next_sg(mdev);
1556
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001557 /* sync target done with resync. Explicitly notify peer, even though
1558 * it should (at least for non-empty resyncs) already know itself. */
1559 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1560 drbd_send_state(mdev);
1561
Lars Ellenberg02851e92010-12-16 14:47:39 +01001562 if (os.conn > C_CONNECTED && ns.conn == C_CONNECTED)
1563 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
1564
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001565	/* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001566 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001567 drbd_free_tl_hash(mdev);
1568
Philipp Reisnerb411b362009-09-25 16:07:19 -07001569 /* Upon network connection, we need to start the receiver */
1570 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1571 drbd_thread_start(&mdev->receiver);
1572
1573 /* Terminate worker thread if we are unconfigured - it will be
1574 restarted as needed... */
1575 if (ns.disk == D_DISKLESS &&
1576 ns.conn == C_STANDALONE &&
1577 ns.role == R_SECONDARY) {
1578 if (os.aftr_isp != ns.aftr_isp)
1579 resume_next_sg(mdev);
1580 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1581 if (test_bit(DEVICE_DYING, &mdev->flags))
1582 drbd_thread_stop_nowait(&mdev->worker);
1583 }
1584
1585 drbd_md_sync(mdev);
1586}
1587
1588
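/**
 * drbd_thread_setup() - Common entry point of all drbd kernel threads
 * @arg:	the struct drbd_thread this thread belongs to.
 *
 * Runs thi->function() and restarts it if t_state was set to Restarting
 * in the meantime.  On final exit it completes thi->stop and drops the
 * module reference taken in drbd_thread_start().
 */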
1589static int drbd_thread_setup(void *arg)
1590{
1591 struct drbd_thread *thi = (struct drbd_thread *) arg;
1592 struct drbd_conf *mdev = thi->mdev;
1593 unsigned long flags;
1594 int retval;
1595
1596restart:
1597 retval = thi->function(thi);
1598
1599 spin_lock_irqsave(&thi->t_lock, flags);
1600
1601 /* if the receiver has been "Exiting", the last thing it did
1602 * was set the conn state to "StandAlone",
1603 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1604 * and receiver thread will be "started".
1605 * drbd_thread_start needs to set "Restarting" in that case.
1606 * t_state check and assignment needs to be within the same spinlock,
1607 * so either thread_start sees Exiting, and can remap to Restarting,
1608	 * or thread_start sees None, and can proceed as normal.
1609 */
1610
1611 if (thi->t_state == Restarting) {
1612 dev_info(DEV, "Restarting %s\n", current->comm);
1613 thi->t_state = Running;
1614 spin_unlock_irqrestore(&thi->t_lock, flags);
1615 goto restart;
1616 }
1617
1618 thi->task = NULL;
1619 thi->t_state = None;
1620 smp_mb();
1621 complete(&thi->stop);
1622 spin_unlock_irqrestore(&thi->t_lock, flags);
1623
1624 dev_info(DEV, "Terminating %s\n", current->comm);
1625
1626 /* Release mod reference taken when thread was started */
1627 module_put(THIS_MODULE);
1628 return retval;
1629}
1630
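/**
 * drbd_thread_init() - Initialize a struct drbd_thread before its first use
 * @mdev:	DRBD device.
 * @thi:	thread descriptor to initialize.
 * @func:	function this thread shall execute once started.
 */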
1631static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1632 int (*func) (struct drbd_thread *))
1633{
1634 spin_lock_init(&thi->t_lock);
1635 thi->task = NULL;
1636 thi->t_state = None;
1637 thi->function = func;
1638 thi->mdev = mdev;
1639}
1640
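/**
 * drbd_thread_start() - Start (or flag for restart) one of the drbd threads
 * @thi:	receiver, asender or worker thread of this device.
 *
 * If the thread is not running, takes a module reference and creates the
 * kernel thread; if it is currently Exiting, it is flagged Restarting
 * instead.  Returns true on success, false if the thread could not be
 * created.
 */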
1641int drbd_thread_start(struct drbd_thread *thi)
1642{
1643 struct drbd_conf *mdev = thi->mdev;
1644 struct task_struct *nt;
1645 unsigned long flags;
1646
1647 const char *me =
1648 thi == &mdev->receiver ? "receiver" :
1649 thi == &mdev->asender ? "asender" :
1650 thi == &mdev->worker ? "worker" : "NONSENSE";
1651
1652 /* is used from state engine doing drbd_thread_stop_nowait,
1653 * while holding the req lock irqsave */
1654 spin_lock_irqsave(&thi->t_lock, flags);
1655
1656 switch (thi->t_state) {
1657 case None:
1658 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1659 me, current->comm, current->pid);
1660
1661 /* Get ref on module for thread - this is released when thread exits */
1662 if (!try_module_get(THIS_MODULE)) {
1663 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1664 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001665 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001666 }
1667
1668 init_completion(&thi->stop);
1669 D_ASSERT(thi->task == NULL);
1670 thi->reset_cpu_mask = 1;
1671 thi->t_state = Running;
1672 spin_unlock_irqrestore(&thi->t_lock, flags);
1673		flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1674
1675 nt = kthread_create(drbd_thread_setup, (void *) thi,
1676 "drbd%d_%s", mdev_to_minor(mdev), me);
1677
1678 if (IS_ERR(nt)) {
1679 dev_err(DEV, "Couldn't start thread\n");
1680
1681 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001682 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001683 }
1684 spin_lock_irqsave(&thi->t_lock, flags);
1685 thi->task = nt;
1686 thi->t_state = Running;
1687 spin_unlock_irqrestore(&thi->t_lock, flags);
1688 wake_up_process(nt);
1689 break;
1690 case Exiting:
1691 thi->t_state = Restarting;
1692 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1693 me, current->comm, current->pid);
1694 /* fall through */
1695 case Running:
1696 case Restarting:
1697 default:
1698 spin_unlock_irqrestore(&thi->t_lock, flags);
1699 break;
1700 }
1701
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001702 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001703}
1704
1705
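/**
 * _drbd_thread_stop() - Ask a drbd thread to exit or to restart
 * @thi:	thread to stop.
 * @restart:	request a restart instead of a plain exit.
 * @wait:	wait for the thread to complete thi->stop before returning.
 */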
1706void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1707{
1708 unsigned long flags;
1709
1710 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1711
1712 /* may be called from state engine, holding the req lock irqsave */
1713 spin_lock_irqsave(&thi->t_lock, flags);
1714
1715 if (thi->t_state == None) {
1716 spin_unlock_irqrestore(&thi->t_lock, flags);
1717 if (restart)
1718 drbd_thread_start(thi);
1719 return;
1720 }
1721
1722 if (thi->t_state != ns) {
1723 if (thi->task == NULL) {
1724 spin_unlock_irqrestore(&thi->t_lock, flags);
1725 return;
1726 }
1727
1728 thi->t_state = ns;
1729 smp_mb();
1730 init_completion(&thi->stop);
1731 if (thi->task != current)
1732 force_sig(DRBD_SIGKILL, thi->task);
1733
1734 }
1735
1736 spin_unlock_irqrestore(&thi->t_lock, flags);
1737
1738 if (wait)
1739 wait_for_completion(&thi->stop);
1740}
1741
1742#ifdef CONFIG_SMP
1743/**
1744 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1745 * @mdev: DRBD device.
1746 *
1747 * Forces all threads of a device onto the same CPU. This is beneficial for
1748 * DRBD's performance. May be overridden by the user's configuration.
1749 */
1750void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1751{
1752 int ord, cpu;
1753
1754 /* user override. */
1755 if (cpumask_weight(mdev->cpu_mask))
1756 return;
1757
1758 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1759 for_each_online_cpu(cpu) {
1760 if (ord-- == 0) {
1761 cpumask_set_cpu(cpu, mdev->cpu_mask);
1762 return;
1763 }
1764 }
1765 /* should not be reached */
1766 cpumask_setall(mdev->cpu_mask);
1767}
1768
1769/**
1770 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1771 * @mdev: DRBD device.
1772 *
1773 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1774 * prematurely.
1775 */
1776void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1777{
1778 struct task_struct *p = current;
1779 struct drbd_thread *thi =
1780 p == mdev->asender.task ? &mdev->asender :
1781 p == mdev->receiver.task ? &mdev->receiver :
1782 p == mdev->worker.task ? &mdev->worker :
1783 NULL;
1784 ERR_IF(thi == NULL)
1785 return;
1786 if (!thi->reset_cpu_mask)
1787 return;
1788 thi->reset_cpu_mask = 0;
1789 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1790}
1791#endif
1792
1793/* the appropriate socket mutex must be held already */
1794int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001795 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001796 size_t size, unsigned msg_flags)
1797{
1798 int sent, ok;
1799
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001800 ERR_IF(!h) return false;
1801 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001802
1803 h->magic = BE_DRBD_MAGIC;
1804 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001805 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001806
Philipp Reisnerb411b362009-09-25 16:07:19 -07001807 sent = drbd_send(mdev, sock, h, size, msg_flags);
1808
1809 ok = (sent == size);
1810 if (!ok)
1811 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1812 cmdname(cmd), (int)size, sent);
1813 return ok;
1814}
1815
1816/* don't pass the socket. we may only look at it
1817 * when we hold the appropriate socket mutex.
1818 */
1819int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001820 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001821{
1822 int ok = 0;
1823 struct socket *sock;
1824
1825 if (use_data_socket) {
1826 mutex_lock(&mdev->data.mutex);
1827 sock = mdev->data.socket;
1828 } else {
1829 mutex_lock(&mdev->meta.mutex);
1830 sock = mdev->meta.socket;
1831 }
1832
1833 /* drbd_disconnect() could have called drbd_free_sock()
1834 * while we were waiting in down()... */
1835 if (likely(sock != NULL))
1836 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1837
1838 if (use_data_socket)
1839 mutex_unlock(&mdev->data.mutex);
1840 else
1841 mutex_unlock(&mdev->meta.mutex);
1842 return ok;
1843}
1844
1845int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1846 size_t size)
1847{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001848 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001849 int ok;
1850
1851 h.magic = BE_DRBD_MAGIC;
1852 h.command = cpu_to_be16(cmd);
1853 h.length = cpu_to_be16(size);
1854
1855 if (!drbd_get_data_sock(mdev))
1856 return 0;
1857
Philipp Reisnerb411b362009-09-25 16:07:19 -07001858 ok = (sizeof(h) ==
1859 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1860 ok = ok && (size ==
1861 drbd_send(mdev, mdev->data.socket, data, size, 0));
1862
1863 drbd_put_data_sock(mdev);
1864
1865 return ok;
1866}
1867
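/**
 * drbd_send_sync_param() - Send the syncer parameters to the peer
 * @mdev:	DRBD device.
 * @sc:	syncer configuration to transmit.
 *
 * The packet layout depends on the agreed protocol version: verify_alg is
 * only sent for apv >= 88, csums_alg for apv >= 89, and the resync
 * controller settings for apv >= 95.
 */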
1868int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1869{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001870 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001871 struct socket *sock;
1872 int size, rv;
1873 const int apv = mdev->agreed_pro_version;
1874
1875 size = apv <= 87 ? sizeof(struct p_rs_param)
1876 : apv == 88 ? sizeof(struct p_rs_param)
1877 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001878 : apv <= 94 ? sizeof(struct p_rs_param_89)
1879 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001880
1881 /* used from admin command context and receiver/worker context.
1882 * to avoid kmalloc, grab the socket right here,
1883 * then use the pre-allocated sbuf there */
1884 mutex_lock(&mdev->data.mutex);
1885 sock = mdev->data.socket;
1886
1887 if (likely(sock != NULL)) {
1888 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1889
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001890 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001891
1892 /* initialize verify_alg and csums_alg */
1893 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1894
1895 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001896 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1897 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1898 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1899 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001900
1901 if (apv >= 88)
1902 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1903 if (apv >= 89)
1904 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1905
1906 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1907 } else
1908 rv = 0; /* not ok */
1909
1910 mutex_unlock(&mdev->data.mutex);
1911
1912 return rv;
1913}
1914
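/**
 * drbd_send_protocol() - Send our connection settings (P_PROTOCOL) to the peer
 * @mdev:	DRBD device.
 *
 * Returns 0 if the buffer could not be allocated, if the peer does not
 * support --dry-run, or if sending failed; non-zero otherwise.
 */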
1915int drbd_send_protocol(struct drbd_conf *mdev)
1916{
1917 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001918 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001919
1920 size = sizeof(struct p_protocol);
1921
1922 if (mdev->agreed_pro_version >= 87)
1923 size += strlen(mdev->net_conf->integrity_alg) + 1;
1924
1925 /* we must not recurse into our own queue,
1926 * as that is blocked during handshake */
1927 p = kmalloc(size, GFP_NOIO);
1928 if (p == NULL)
1929 return 0;
1930
1931 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1932 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1933 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1934 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001935 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1936
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001937 cf = 0;
1938 if (mdev->net_conf->want_lose)
1939 cf |= CF_WANT_LOSE;
1940 if (mdev->net_conf->dry_run) {
1941 if (mdev->agreed_pro_version >= 92)
1942 cf |= CF_DRY_RUN;
1943 else {
1944 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001945 kfree(p);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001946 return 0;
1947 }
1948 }
1949 p->conn_flags = cpu_to_be32(cf);
1950
Philipp Reisnerb411b362009-09-25 16:07:19 -07001951 if (mdev->agreed_pro_version >= 87)
1952 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1953
1954 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001955 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001956 kfree(p);
1957 return rv;
1958}
1959
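/**
 * _drbd_send_uuids() - Send our UUIDs (P_UUIDS) to the peer
 * @mdev:	DRBD device.
 * @uuid_flags:	additional flags to merge into the UI_FLAGS slot.
 *
 * Also transmits the current number of bits set in the bitmap
 * (comm_bm_set).  If we have no local disk, it returns without sending
 * anything.
 */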
1960int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1961{
1962 struct p_uuids p;
1963 int i;
1964
1965 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1966 return 1;
1967
1968 for (i = UI_CURRENT; i < UI_SIZE; i++)
1969 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1970
1971 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1972 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1973 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1974 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1975 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1976 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1977
1978 put_ldev(mdev);
1979
1980 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001981 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001982}
1983
1984int drbd_send_uuids(struct drbd_conf *mdev)
1985{
1986 return _drbd_send_uuids(mdev, 0);
1987}
1988
1989int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1990{
1991 return _drbd_send_uuids(mdev, 8);
1992}
1993
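/**
 * drbd_gen_and_send_sync_uuid() - Generate a new bitmap UUID and send it to the peer
 * @mdev:	DRBD device.
 *
 * Sets UI_BITMAP to a random value, syncs the meta data and sends the
 * new UUID in a P_SYNC_UUID packet.
 */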
Lars Ellenberg5a22db82010-12-17 21:14:23 +01001994int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001995{
1996 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01001997 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001998
Lars Ellenberg5a22db82010-12-17 21:14:23 +01001999 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2000
2001 get_random_bytes(&uuid, sizeof(u64));
2002 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2003 drbd_md_sync(mdev);
2004 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002005
2006 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002007 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002008}
2009
Philipp Reisnere89b5912010-03-24 17:11:33 +01002010int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002011{
2012 struct p_sizes p;
2013 sector_t d_size, u_size;
2014 int q_order_type;
2015 int ok;
2016
2017 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2018 D_ASSERT(mdev->ldev->backing_bdev);
2019 d_size = drbd_get_max_capacity(mdev->ldev);
2020 u_size = mdev->ldev->dc.disk_size;
2021 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002022 put_ldev(mdev);
2023 } else {
2024 d_size = 0;
2025 u_size = 0;
2026 q_order_type = QUEUE_ORDERED_NONE;
2027 }
2028
2029 p.d_size = cpu_to_be64(d_size);
2030 p.u_size = cpu_to_be64(u_size);
2031 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01002032 p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002033 p.queue_order_type = cpu_to_be16(q_order_type);
2034 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002035
2036 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002037 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002038 return ok;
2039}
2040
2041/**
2042 * drbd_send_state() - Sends the drbd state to the peer
2043 * @mdev: DRBD device.
2044 */
2045int drbd_send_state(struct drbd_conf *mdev)
2046{
2047 struct socket *sock;
2048 struct p_state p;
2049 int ok = 0;
2050
2051	/* Grab state lock so we won't send state if we're in the middle
2052 * of a cluster wide state change on another thread */
2053 drbd_state_lock(mdev);
2054
2055 mutex_lock(&mdev->data.mutex);
2056
2057 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2058 sock = mdev->data.socket;
2059
2060 if (likely(sock != NULL)) {
2061 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002062 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002063 }
2064
2065 mutex_unlock(&mdev->data.mutex);
2066
2067 drbd_state_unlock(mdev);
2068 return ok;
2069}
2070
2071int drbd_send_state_req(struct drbd_conf *mdev,
2072 union drbd_state mask, union drbd_state val)
2073{
2074 struct p_req_state p;
2075
2076 p.mask = cpu_to_be32(mask.i);
2077 p.val = cpu_to_be32(val.i);
2078
2079 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002080 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002081}
2082
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002083int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002084{
2085 struct p_req_state_reply p;
2086
2087 p.retcode = cpu_to_be32(retcode);
2088
2089 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002090 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002091}
2092
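/**
 * fill_bitmap_rle_bits() - RLE+VLI compress part of the bitmap into a packet
 * @mdev:	DRBD device.
 * @p:	compressed bitmap packet to fill.
 * @c:	bitmap transfer context, tracks the current bit/word offset.
 *
 * Returns the number of code bytes produced, 0 if the bitmap chunk was
 * not compressed, or -1 on error.
 */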
2093int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2094 struct p_compressed_bm *p,
2095 struct bm_xfer_ctx *c)
2096{
2097 struct bitstream bs;
2098 unsigned long plain_bits;
2099 unsigned long tmp;
2100 unsigned long rl;
2101 unsigned len;
2102 unsigned toggle;
2103 int bits;
2104
2105 /* may we use this feature? */
2106 if ((mdev->sync_conf.use_rle == 0) ||
2107 (mdev->agreed_pro_version < 90))
2108 return 0;
2109
2110 if (c->bit_offset >= c->bm_bits)
2111 return 0; /* nothing to do. */
2112
2113	/* use at most this many bytes */
2114 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2115 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2116 /* plain bits covered in this code string */
2117 plain_bits = 0;
2118
2119 /* p->encoding & 0x80 stores whether the first run length is set.
2120 * bit offset is implicit.
2121 * start with toggle == 2 to be able to tell the first iteration */
2122 toggle = 2;
2123
2124	/* see how many plain bits we can stuff into one packet
2125 * using RLE and VLI. */
2126 do {
2127 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2128 : _drbd_bm_find_next(mdev, c->bit_offset);
2129 if (tmp == -1UL)
2130 tmp = c->bm_bits;
2131 rl = tmp - c->bit_offset;
2132
2133 if (toggle == 2) { /* first iteration */
2134 if (rl == 0) {
2135 /* the first checked bit was set,
2136 * store start value, */
2137 DCBP_set_start(p, 1);
2138 /* but skip encoding of zero run length */
2139 toggle = !toggle;
2140 continue;
2141 }
2142 DCBP_set_start(p, 0);
2143 }
2144
2145 /* paranoia: catch zero runlength.
2146 * can only happen if bitmap is modified while we scan it. */
2147 if (rl == 0) {
2148 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2149 "t:%u bo:%lu\n", toggle, c->bit_offset);
2150 return -1;
2151 }
2152
2153 bits = vli_encode_bits(&bs, rl);
2154 if (bits == -ENOBUFS) /* buffer full */
2155 break;
2156 if (bits <= 0) {
2157 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2158 return 0;
2159 }
2160
2161 toggle = !toggle;
2162 plain_bits += rl;
2163 c->bit_offset = tmp;
2164 } while (c->bit_offset < c->bm_bits);
2165
2166 len = bs.cur.b - p->code + !!bs.cur.bit;
2167
2168 if (plain_bits < (len << 3)) {
2169 /* incompressible with this method.
2170 * we need to rewind both word and bit position. */
2171 c->bit_offset -= plain_bits;
2172 bm_xfer_ctx_bit_to_word_offset(c);
2173 c->bit_offset = c->word_offset * BITS_PER_LONG;
2174 return 0;
2175 }
2176
2177 /* RLE + VLI was able to compress it just fine.
2178 * update c->word_offset. */
2179 bm_xfer_ctx_bit_to_word_offset(c);
2180
2181 /* store pad_bits */
2182 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2183
2184 return len;
2185}
2186
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002187/**
2188 * send_bitmap_rle_or_plain
2189 *
2190 * Return 0 when done, 1 when another iteration is needed, and a negative error
2191 * code upon failure.
2192 */
2193static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002194send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002195 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002196{
2197 struct p_compressed_bm *p = (void*)h;
2198 unsigned long num_words;
2199 int len;
2200 int ok;
2201
2202 len = fill_bitmap_rle_bits(mdev, p, c);
2203
2204 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002205 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002206
2207 if (len) {
2208 DCBP_set_code(p, RLE_VLI_Bits);
2209 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2210 sizeof(*p) + len, 0);
2211
2212 c->packets[0]++;
2213 c->bytes[0] += sizeof(*p) + len;
2214
2215 if (c->bit_offset >= c->bm_bits)
2216 len = 0; /* DONE */
2217 } else {
2218 /* was not compressible.
2219 * send a buffer full of plain text bits instead. */
2220 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2221 len = num_words * sizeof(long);
2222 if (len)
2223 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2224 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002225 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002226 c->word_offset += num_words;
2227 c->bit_offset = c->word_offset * BITS_PER_LONG;
2228
2229 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002230 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002231
2232 if (c->bit_offset > c->bm_bits)
2233 c->bit_offset = c->bm_bits;
2234 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002235 if (ok) {
2236 if (len == 0) {
2237 INFO_bm_xfer_stats(mdev, "send", c);
2238 return 0;
2239 } else
2240 return 1;
2241 }
2242 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002243}
2244
2245/* See the comment at receive_bitmap() */
2246int _drbd_send_bitmap(struct drbd_conf *mdev)
2247{
2248 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002249 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002250 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002251
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002252 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002253
2254 /* maybe we should use some per thread scratch page,
2255 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002256 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002257 if (!p) {
2258 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002259 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002260 }
2261
2262 if (get_ldev(mdev)) {
2263 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2264 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2265 drbd_bm_set_all(mdev);
2266 if (drbd_bm_write(mdev)) {
2267 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2268 * but otherwise process as per normal - need to tell other
2269 * side that a full resync is required! */
2270 dev_err(DEV, "Failed to write bitmap to disk!\n");
2271 } else {
2272 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2273 drbd_md_sync(mdev);
2274 }
2275 }
2276 put_ldev(mdev);
2277 }
2278
2279 c = (struct bm_xfer_ctx) {
2280 .bm_bits = drbd_bm_bits(mdev),
2281 .bm_words = drbd_bm_words(mdev),
2282 };
2283
2284 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002285 err = send_bitmap_rle_or_plain(mdev, p, &c);
2286 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002287
2288 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002289 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002290}
2291
2292int drbd_send_bitmap(struct drbd_conf *mdev)
2293{
2294 int err;
2295
2296 if (!drbd_get_data_sock(mdev))
2297 return -1;
2298 err = !_drbd_send_bitmap(mdev);
2299 drbd_put_data_sock(mdev);
2300 return err;
2301}
2302
2303int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2304{
2305 int ok;
2306 struct p_barrier_ack p;
2307
2308 p.barrier = barrier_nr;
2309 p.set_size = cpu_to_be32(set_size);
2310
2311 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002312 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002313 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002314 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002315 return ok;
2316}
2317
2318/**
2319 * _drbd_send_ack() - Sends an ack packet
2320 * @mdev: DRBD device.
2321 * @cmd: Packet command code.
2322 * @sector: sector, needs to be in big endian byte order
2323 * @blksize: size in byte, needs to be in big endian byte order
2324 * @block_id: Id, big endian byte order
2325 */
2326static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2327 u64 sector,
2328 u32 blksize,
2329 u64 block_id)
2330{
2331 int ok;
2332 struct p_block_ack p;
2333
2334 p.sector = sector;
2335 p.block_id = block_id;
2336 p.blksize = blksize;
2337 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2338
2339 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002340 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002341 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002342 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002343 return ok;
2344}
2345
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002346/* dp->sector and dp->block_id already/still in network byte order,
2347 * data_size is payload size according to dp->head,
2348 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002349int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002350 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002351{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002352 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2353 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002354 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2355 dp->block_id);
2356}
2357
2358int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2359 struct p_block_req *rp)
2360{
2361 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2362}
2363
2364/**
2365 * drbd_send_ack() - Sends an ack packet
2366 * @mdev: DRBD device.
2367 * @cmd: Packet command code.
2368 * @e: Epoch entry.
2369 */
2370int drbd_send_ack(struct drbd_conf *mdev,
2371 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2372{
2373 return _drbd_send_ack(mdev, cmd,
2374 cpu_to_be64(e->sector),
2375 cpu_to_be32(e->size),
2376 e->block_id);
2377}
2378
2379/* This function misuses the block_id field to signal if the blocks
2380 * are in sync or not. */
2381int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2382 sector_t sector, int blksize, u64 block_id)
2383{
2384 return _drbd_send_ack(mdev, cmd,
2385 cpu_to_be64(sector),
2386 cpu_to_be32(blksize),
2387 cpu_to_be64(block_id));
2388}
2389
2390int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2391 sector_t sector, int size, u64 block_id)
2392{
2393 int ok;
2394 struct p_block_req p;
2395
2396 p.sector = cpu_to_be64(sector);
2397 p.block_id = block_id;
2398 p.blksize = cpu_to_be32(size);
2399
2400 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002401 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002402 return ok;
2403}
2404
2405int drbd_send_drequest_csum(struct drbd_conf *mdev,
2406 sector_t sector, int size,
2407 void *digest, int digest_size,
2408 enum drbd_packets cmd)
2409{
2410 int ok;
2411 struct p_block_req p;
2412
2413 p.sector = cpu_to_be64(sector);
2414 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2415 p.blksize = cpu_to_be32(size);
2416
2417 p.head.magic = BE_DRBD_MAGIC;
2418 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002419 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002420
2421 mutex_lock(&mdev->data.mutex);
2422
2423 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2424 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2425
2426 mutex_unlock(&mdev->data.mutex);
2427
2428 return ok;
2429}
2430
2431int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2432{
2433 int ok;
2434 struct p_block_req p;
2435
2436 p.sector = cpu_to_be64(sector);
2437 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2438 p.blksize = cpu_to_be32(size);
2439
2440 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002441 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002442 return ok;
2443}
2444
2445/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002446 * returns false if we should retry,
2447 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002448 */
2449static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2450{
2451 int drop_it;
2452 /* long elapsed = (long)(jiffies - mdev->last_received); */
2453
2454 drop_it = mdev->meta.socket == sock
2455 || !mdev->asender.task
2456 || get_t_state(&mdev->asender) != Running
2457 || mdev->state.conn < C_CONNECTED;
2458
2459 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002460 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002461
2462 drop_it = !--mdev->ko_count;
2463 if (!drop_it) {
2464 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2465 current->comm, current->pid, mdev->ko_count);
2466 request_ping(mdev);
2467 }
2468
2469 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2470}
2471
2472/* The idea of sendpage seems to be to put some kind of reference
2473 * to the page into the skb, and to hand it over to the NIC. In
2474 * this process get_page() gets called.
2475 *
2476 * As soon as the page was really sent over the network put_page()
2477 * gets called by some part of the network layer. [ NIC driver? ]
2478 *
2479 * [ get_page() / put_page() increment/decrement the count. If count
2480 * reaches 0 the page will be freed. ]
2481 *
2482 * This works nicely with pages from FSs.
2483 * But this means that in protocol A we might signal IO completion too early!
2484 *
2485 * In order not to corrupt data during a resync we must make sure
2486 * that we do not reuse our own buffer pages (EEs) too early, therefore
2487 * we have the net_ee list.
2488 *
2489 * XFS seems to have problems, still, it submits pages with page_count == 0!
2490 * As a workaround, we disable sendpage on pages
2491 * with page_count == 0 or PageSlab.
2492 */
2493static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002494 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002495{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002496 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002497 kunmap(page);
2498 if (sent == size)
2499 mdev->send_cnt += size>>9;
2500 return sent == size;
2501}
2502
2503static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002504 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002505{
2506 mm_segment_t oldfs = get_fs();
2507 int sent, ok;
2508 int len = size;
2509
2510 /* e.g. XFS meta- & log-data is in slab pages, which have a
2511 * page_count of 0 and/or have PageSlab() set.
2512 * we cannot use send_page for those, as that does get_page();
2513 * put_page(); and would cause either a VM_BUG directly, or
2514 * __page_cache_release a page that would actually still be referenced
2515 * by someone, leading to some obscure delayed Oops somewhere else. */
2516 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002517 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002518
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002519 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002520 drbd_update_congested(mdev);
2521 set_fs(KERNEL_DS);
2522 do {
2523 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2524 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002525 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002526 if (sent == -EAGAIN) {
2527 if (we_should_drop_the_connection(mdev,
2528 mdev->data.socket))
2529 break;
2530 else
2531 continue;
2532 }
2533 if (sent <= 0) {
2534 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2535 __func__, (int)size, len, sent);
2536 break;
2537 }
2538 len -= sent;
2539 offset += sent;
2540 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2541 set_fs(oldfs);
2542 clear_bit(NET_CONGESTED, &mdev->flags);
2543
2544 ok = (len == 0);
2545 if (likely(ok))
2546 mdev->send_cnt += size>>9;
2547 return ok;
2548}
2549
2550static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2551{
2552 struct bio_vec *bvec;
2553 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002554 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002555 __bio_for_each_segment(bvec, bio, i, 0) {
2556 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002557 bvec->bv_offset, bvec->bv_len,
2558 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002559 return 0;
2560 }
2561 return 1;
2562}
2563
2564static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2565{
2566 struct bio_vec *bvec;
2567 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002568 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002569 __bio_for_each_segment(bvec, bio, i, 0) {
2570 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002571 bvec->bv_offset, bvec->bv_len,
2572 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002573 return 0;
2574 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002575 return 1;
2576}
2577
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002578static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2579{
2580 struct page *page = e->pages;
2581 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002582 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002583 page_chain_for_each(page) {
2584 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002585 if (!_drbd_send_page(mdev, page, 0, l,
2586 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002587 return 0;
2588 len -= l;
2589 }
2590 return 1;
2591}
2592
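/* Translate the bio rw flags (REQ_SYNC, REQ_FUA, REQ_FLUSH, REQ_DISCARD)
 * into the DP_* flags used on the wire.  Peers with an agreed protocol
 * version below 95 only get DP_RW_SYNC. */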
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002593static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2594{
2595 if (mdev->agreed_pro_version >= 95)
2596 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002597 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2598 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2599 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2600 else
Jens Axboe721a9602011-03-09 11:56:30 +01002601 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002602}
2603
Philipp Reisnerb411b362009-09-25 16:07:19 -07002604/* Used to send write requests
2605 * R_PRIMARY -> Peer (P_DATA)
2606 */
2607int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2608{
2609 int ok = 1;
2610 struct p_data p;
2611 unsigned int dp_flags = 0;
2612 void *dgb;
2613 int dgs;
2614
2615 if (!drbd_get_data_sock(mdev))
2616 return 0;
2617
2618 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2619 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2620
Philipp Reisnerd5373382010-08-23 15:18:33 +02002621 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002622 p.head.h80.magic = BE_DRBD_MAGIC;
2623 p.head.h80.command = cpu_to_be16(P_DATA);
2624 p.head.h80.length =
2625 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2626 } else {
2627 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2628 p.head.h95.command = cpu_to_be16(P_DATA);
2629 p.head.h95.length =
2630 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2631 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002632
2633 p.sector = cpu_to_be64(req->sector);
2634 p.block_id = (unsigned long)req;
2635 p.seq_num = cpu_to_be32(req->seq_num =
2636 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002637
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002638 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2639
Philipp Reisnerb411b362009-09-25 16:07:19 -07002640 if (mdev->state.conn >= C_SYNC_SOURCE &&
2641 mdev->state.conn <= C_PAUSED_SYNC_T)
2642 dp_flags |= DP_MAY_SET_IN_SYNC;
2643
2644 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002645 set_bit(UNPLUG_REMOTE, &mdev->flags);
2646 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002647 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002648 if (ok && dgs) {
2649 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002650 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002651 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002652 }
2653 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002654 /* For protocol A, we have to memcpy the payload into
2655 * socket buffers, as we may complete right away
2656 * as soon as we handed it over to tcp, at which point the data
2657 * pages may become invalid.
2658 *
2659 * For data-integrity enabled, we copy it as well, so we can be
2660 * sure that even if the bio pages may still be modified, it
2661 * won't change the data on the wire, thus if the digest checks
2662 * out ok after sending on this side, but does not fit on the
2663 * receiving side, we sure have detected corruption elsewhere.
2664 */
2665 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002666 ok = _drbd_send_bio(mdev, req->master_bio);
2667 else
2668 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002669
2670 /* double check digest, sometimes buffers have been modified in flight. */
2671 if (dgs > 0 && dgs <= 64) {
2672			/* 64 byte, 512 bit, is the largest digest size
2673 * currently supported in kernel crypto. */
2674 unsigned char digest[64];
2675 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2676 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2677 dev_warn(DEV,
2678 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2679 (unsigned long long)req->sector, req->size);
2680 }
2681 } /* else if (dgs > 64) {
2682 ... Be noisy about digest too large ...
2683 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002684 }
2685
2686 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002687
Philipp Reisnerb411b362009-09-25 16:07:19 -07002688 return ok;
2689}
2690
2691/* answer packet, used to send data back for read requests:
2692 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2693 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2694 */
2695int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2696 struct drbd_epoch_entry *e)
2697{
2698 int ok;
2699 struct p_data p;
2700 void *dgb;
2701 int dgs;
2702
2703 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2704 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2705
Philipp Reisnerd5373382010-08-23 15:18:33 +02002706 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002707 p.head.h80.magic = BE_DRBD_MAGIC;
2708 p.head.h80.command = cpu_to_be16(cmd);
2709 p.head.h80.length =
2710 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2711 } else {
2712 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2713 p.head.h95.command = cpu_to_be16(cmd);
2714 p.head.h95.length =
2715 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2716 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002717
2718 p.sector = cpu_to_be64(e->sector);
2719 p.block_id = e->block_id;
2720 /* p.seq_num = 0; No sequence numbers here.. */
2721
2722 /* Only called by our kernel thread.
2723 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2724 * in response to admin command or module unload.
2725 */
2726 if (!drbd_get_data_sock(mdev))
2727 return 0;
2728
Philipp Reisner0b70a132010-08-20 13:36:10 +02002729 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002730 if (ok && dgs) {
2731 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002732 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002733 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002734 }
2735 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002736 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002737
2738 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002739
Philipp Reisnerb411b362009-09-25 16:07:19 -07002740 return ok;
2741}
2742
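/* Send a P_OUT_OF_SYNC notification describing the sector and size of @req
 * on the data socket. */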
Philipp Reisner73a01a12010-10-27 14:33:00 +02002743int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2744{
2745 struct p_block_desc p;
2746
2747 p.sector = cpu_to_be64(req->sector);
2748 p.blksize = cpu_to_be32(req->size);
2749
2750 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2751}
2752
Philipp Reisnerb411b362009-09-25 16:07:19 -07002753/*
2754 drbd_send distinguishes two cases:
2755
2756 Packets sent via the data socket "sock"
2757 and packets sent via the meta data socket "msock"
2758
2759 sock msock
2760 -----------------+-------------------------+------------------------------
2761 timeout conf.timeout / 2 conf.timeout / 2
2762 timeout action send a ping via msock Abort communication
2763 and close all sockets
2764*/
2765
2766/*
2767 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2768 */
2769int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2770 void *buf, size_t size, unsigned msg_flags)
2771{
2772 struct kvec iov;
2773 struct msghdr msg;
2774 int rv, sent = 0;
2775
2776 if (!sock)
2777 return -1000;
2778
2779 /* THINK if (signal_pending) return ... ? */
2780
2781 iov.iov_base = buf;
2782 iov.iov_len = size;
2783
2784 msg.msg_name = NULL;
2785 msg.msg_namelen = 0;
2786 msg.msg_control = NULL;
2787 msg.msg_controllen = 0;
2788 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2789
2790 if (sock == mdev->data.socket) {
2791 mdev->ko_count = mdev->net_conf->ko_count;
2792 drbd_update_congested(mdev);
2793 }
2794 do {
2795 /* STRANGE
2796 * tcp_sendmsg does _not_ use its size parameter at all ?
2797 *
2798 * -EAGAIN on timeout, -EINTR on signal.
2799 */
2800/* THINK
2801 * do we need to block DRBD_SIG if sock == &meta.socket ??
2802 * otherwise wake_asender() might interrupt some send_*Ack !
2803 */
2804 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2805 if (rv == -EAGAIN) {
2806 if (we_should_drop_the_connection(mdev, sock))
2807 break;
2808 else
2809 continue;
2810 }
2811 D_ASSERT(rv != 0);
2812 if (rv == -EINTR) {
2813 flush_signals(current);
2814 rv = 0;
2815 }
2816 if (rv < 0)
2817 break;
2818 sent += rv;
2819 iov.iov_base += rv;
2820 iov.iov_len -= rv;
2821 } while (sent < size);
2822
2823 if (sock == mdev->data.socket)
2824 clear_bit(NET_CONGESTED, &mdev->flags);
2825
2826 if (rv <= 0) {
2827 if (rv != -EAGAIN) {
2828 dev_err(DEV, "%s_sendmsg returned %d\n",
2829 sock == mdev->meta.socket ? "msock" : "sock",
2830 rv);
2831 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2832 } else
2833 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2834 }
2835
2836 return sent;
2837}
2838
2839static int drbd_open(struct block_device *bdev, fmode_t mode)
2840{
2841 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2842 unsigned long flags;
2843 int rv = 0;
2844
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002845 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002846 spin_lock_irqsave(&mdev->req_lock, flags);
2847 /* to have a stable mdev->state.role
2848 * and no race with updating open_cnt */
2849
2850 if (mdev->state.role != R_PRIMARY) {
2851 if (mode & FMODE_WRITE)
2852 rv = -EROFS;
2853 else if (!allow_oos)
2854 rv = -EMEDIUMTYPE;
2855 }
2856
2857 if (!rv)
2858 mdev->open_cnt++;
2859 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002860 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002861
2862 return rv;
2863}
2864
2865static int drbd_release(struct gendisk *gd, fmode_t mode)
2866{
2867 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002868 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002869 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002870 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002871 return 0;
2872}
2873
Philipp Reisnerb411b362009-09-25 16:07:19 -07002874static void drbd_set_defaults(struct drbd_conf *mdev)
2875{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002876 /* This way we get a compile error when sync_conf grows,
2877	   and we forget to initialize it here */
2878 mdev->sync_conf = (struct syncer_conf) {
2879 /* .rate = */ DRBD_RATE_DEF,
2880 /* .after = */ DRBD_AFTER_DEF,
2881 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002882 /* .verify_alg = */ {}, 0,
2883 /* .cpu_mask = */ {}, 0,
2884 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002885 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002886 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2887 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2888 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2889 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002890 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2891 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002892 };
2893
2894	/* Have to do it this way, because the layout differs between
2895 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002896 mdev->state = (union drbd_state) {
2897 { .role = R_SECONDARY,
2898 .peer = R_UNKNOWN,
2899 .conn = C_STANDALONE,
2900 .disk = D_DISKLESS,
2901 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002902 .susp = 0,
2903 .susp_nod = 0,
2904 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002905 } };
2906}
2907
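/**
 * drbd_init_set_defaults() - Initialize all members of a freshly allocated drbd_conf
 * @mdev:	DRBD device.
 */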
2908void drbd_init_set_defaults(struct drbd_conf *mdev)
2909{
2910 /* the memset(,0,) did most of this.
2911 * note: only assignments, no allocation in here */
2912
2913 drbd_set_defaults(mdev);
2914
Philipp Reisnerb411b362009-09-25 16:07:19 -07002915 atomic_set(&mdev->ap_bio_cnt, 0);
2916 atomic_set(&mdev->ap_pending_cnt, 0);
2917 atomic_set(&mdev->rs_pending_cnt, 0);
2918 atomic_set(&mdev->unacked_cnt, 0);
2919 atomic_set(&mdev->local_cnt, 0);
2920 atomic_set(&mdev->net_cnt, 0);
2921 atomic_set(&mdev->packet_seq, 0);
2922 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02002923 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02002924 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002925 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02002926 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002927
2928 mutex_init(&mdev->md_io_mutex);
2929 mutex_init(&mdev->data.mutex);
2930 mutex_init(&mdev->meta.mutex);
2931 sema_init(&mdev->data.work.s, 0);
2932 sema_init(&mdev->meta.work.s, 0);
2933 mutex_init(&mdev->state_mutex);
2934
2935 spin_lock_init(&mdev->data.work.q_lock);
2936 spin_lock_init(&mdev->meta.work.q_lock);
2937
2938 spin_lock_init(&mdev->al_lock);
2939 spin_lock_init(&mdev->req_lock);
2940 spin_lock_init(&mdev->peer_seq_lock);
2941 spin_lock_init(&mdev->epoch_lock);
2942
2943 INIT_LIST_HEAD(&mdev->active_ee);
2944 INIT_LIST_HEAD(&mdev->sync_ee);
2945 INIT_LIST_HEAD(&mdev->done_ee);
2946 INIT_LIST_HEAD(&mdev->read_ee);
2947 INIT_LIST_HEAD(&mdev->net_ee);
2948 INIT_LIST_HEAD(&mdev->resync_reads);
2949 INIT_LIST_HEAD(&mdev->data.work.q);
2950 INIT_LIST_HEAD(&mdev->meta.work.q);
2951 INIT_LIST_HEAD(&mdev->resync_work.list);
2952 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002953 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002954 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02002955 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002956 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02002957
Philipp Reisnerb411b362009-09-25 16:07:19 -07002958 mdev->resync_work.cb = w_resync_inactive;
2959 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002960 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002961 mdev->md_sync_work.cb = w_md_sync;
2962 mdev->bm_io_work.w.cb = w_bitmap_io;
2963 init_timer(&mdev->resync_timer);
2964 init_timer(&mdev->md_sync_timer);
2965 mdev->resync_timer.function = resync_timer_fn;
2966 mdev->resync_timer.data = (unsigned long) mdev;
2967 mdev->md_sync_timer.function = md_sync_timer_fn;
2968 mdev->md_sync_timer.data = (unsigned long) mdev;
2969
2970 init_waitqueue_head(&mdev->misc_wait);
2971 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02002972 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002973 init_waitqueue_head(&mdev->ee_wait);
2974 init_waitqueue_head(&mdev->al_wait);
2975 init_waitqueue_head(&mdev->seq_wait);
2976
2977 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2978 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2979 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2980
2981 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02002982 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002983 mdev->resync_wenr = LC_FREE;
2984}
2985
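/**
 * drbd_mdev_cleanup() - Reset an unconfigured device back to a pristine state
 * @mdev:	DRBD device.
 *
 * Expects the device's threads to have stopped already (it warns if the
 * receiver is still running), resets the counters, releases the bitmap
 * and other resources, and restores the defaults.
 */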
2986void drbd_mdev_cleanup(struct drbd_conf *mdev)
2987{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002988 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002989 if (mdev->receiver.t_state != None)
2990 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2991 mdev->receiver.t_state);
2992
2993 /* no need to lock it, I'm the only thread alive */
2994 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2995 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2996 mdev->al_writ_cnt =
2997 mdev->bm_writ_cnt =
2998 mdev->read_cnt =
2999 mdev->recv_cnt =
3000 mdev->send_cnt =
3001 mdev->writ_cnt =
3002 mdev->p_size =
3003 mdev->rs_start =
3004 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003005 mdev->rs_failed = 0;
3006 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003007 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003008 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3009 mdev->rs_mark_left[i] = 0;
3010 mdev->rs_mark_time[i] = 0;
3011 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003012 D_ASSERT(mdev->net_conf == NULL);
3013
3014 drbd_set_my_capacity(mdev, 0);
3015 if (mdev->bitmap) {
3016 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003017 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003018 drbd_bm_cleanup(mdev);
3019 }
3020
3021 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003022 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003023
3024 /*
3025	 * currently we call drbd_init_ee only on module load, so
3026	 * we may call drbd_release_ee only on module unload!
3027 */
3028 D_ASSERT(list_empty(&mdev->active_ee));
3029 D_ASSERT(list_empty(&mdev->sync_ee));
3030 D_ASSERT(list_empty(&mdev->done_ee));
3031 D_ASSERT(list_empty(&mdev->read_ee));
3032 D_ASSERT(list_empty(&mdev->net_ee));
3033 D_ASSERT(list_empty(&mdev->resync_reads));
3034 D_ASSERT(list_empty(&mdev->data.work.q));
3035 D_ASSERT(list_empty(&mdev->meta.work.q));
3036 D_ASSERT(list_empty(&mdev->resync_work.list));
3037 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003038 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003039
3040 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003041}
3042
3043
3044static void drbd_destroy_mempools(void)
3045{
3046 struct page *page;
3047
3048 while (drbd_pp_pool) {
3049 page = drbd_pp_pool;
3050 drbd_pp_pool = (struct page *)page_private(page);
3051 __free_page(page);
3052 drbd_pp_vacant--;
3053 }
3054
3055 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3056
3057 if (drbd_ee_mempool)
3058 mempool_destroy(drbd_ee_mempool);
3059 if (drbd_request_mempool)
3060 mempool_destroy(drbd_request_mempool);
3061 if (drbd_ee_cache)
3062 kmem_cache_destroy(drbd_ee_cache);
3063 if (drbd_request_cache)
3064 kmem_cache_destroy(drbd_request_cache);
3065 if (drbd_bm_ext_cache)
3066 kmem_cache_destroy(drbd_bm_ext_cache);
3067 if (drbd_al_ext_cache)
3068 kmem_cache_destroy(drbd_al_ext_cache);
3069
3070 drbd_ee_mempool = NULL;
3071 drbd_request_mempool = NULL;
3072 drbd_ee_cache = NULL;
3073 drbd_request_cache = NULL;
3074 drbd_bm_ext_cache = NULL;
3075 drbd_al_ext_cache = NULL;
3076
3077 return;
3078}
3079
3080static int drbd_create_mempools(void)
3081{
3082 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003083 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003084 int i;
3085
3086 /* prepare our caches and mempools */
3087 drbd_request_mempool = NULL;
3088 drbd_ee_cache = NULL;
3089 drbd_request_cache = NULL;
3090 drbd_bm_ext_cache = NULL;
3091 drbd_al_ext_cache = NULL;
3092 drbd_pp_pool = NULL;
3093
3094 /* caches */
3095 drbd_request_cache = kmem_cache_create(
3096 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3097 if (drbd_request_cache == NULL)
3098 goto Enomem;
3099
3100 drbd_ee_cache = kmem_cache_create(
3101 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3102 if (drbd_ee_cache == NULL)
3103 goto Enomem;
3104
3105 drbd_bm_ext_cache = kmem_cache_create(
3106 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3107 if (drbd_bm_ext_cache == NULL)
3108 goto Enomem;
3109
3110 drbd_al_ext_cache = kmem_cache_create(
3111 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3112 if (drbd_al_ext_cache == NULL)
3113 goto Enomem;
3114
3115 /* mempools */
3116 drbd_request_mempool = mempool_create(number,
3117 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3118 if (drbd_request_mempool == NULL)
3119 goto Enomem;
3120
3121 drbd_ee_mempool = mempool_create(number,
3122 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003123 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003124 goto Enomem;
3125
3126 /* drbd's page pool */
3127 spin_lock_init(&drbd_pp_lock);
3128
3129 for (i = 0; i < number; i++) {
3130 page = alloc_page(GFP_HIGHUSER);
3131 if (!page)
3132 goto Enomem;
3133 set_page_private(page, (unsigned long)drbd_pp_pool);
3134 drbd_pp_pool = page;
3135 }
3136 drbd_pp_vacant = number;
3137
3138 return 0;
3139
3140Enomem:
3141 drbd_destroy_mempools(); /* in case we allocated some */
3142 return -ENOMEM;
3143}
3144
3145static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3146 void *unused)
3147{
3148 /* just so we have it. you never know what interesting things we
3149 * might want to do here some day...
3150 */
3151
3152 return NOTIFY_DONE;
3153}
3154
3155static struct notifier_block drbd_notifier = {
3156 .notifier_call = drbd_notify_sys,
3157};
3158
3159static void drbd_release_ee_lists(struct drbd_conf *mdev)
3160{
3161 int rr;
3162
3163 rr = drbd_release_ee(mdev, &mdev->active_ee);
3164 if (rr)
3165 dev_err(DEV, "%d EEs in active list found!\n", rr);
3166
3167 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3168 if (rr)
3169 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3170
3171 rr = drbd_release_ee(mdev, &mdev->read_ee);
3172 if (rr)
3173 dev_err(DEV, "%d EEs in read list found!\n", rr);
3174
3175 rr = drbd_release_ee(mdev, &mdev->done_ee);
3176 if (rr)
3177 dev_err(DEV, "%d EEs in done list found!\n", rr);
3178
3179 rr = drbd_release_ee(mdev, &mdev->net_ee);
3180 if (rr)
3181 dev_err(DEV, "%d EEs in net list found!\n", rr);
3182}
3183
3184/* caution. no locking.
3185 * currently only used from module cleanup code. */
3186static void drbd_delete_device(unsigned int minor)
3187{
3188 struct drbd_conf *mdev = minor_to_mdev(minor);
3189
3190 if (!mdev)
3191 return;
3192
3193 /* paranoia asserts */
3194 if (mdev->open_cnt != 0)
3195 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3196 __FILE__ , __LINE__);
3197
3198 ERR_IF (!list_empty(&mdev->data.work.q)) {
3199 struct list_head *lp;
3200 list_for_each(lp, &mdev->data.work.q) {
3201 dev_err(DEV, "lp = %p\n", lp);
3202 }
3203 };
3204 /* end paranoia asserts */
3205
3206 del_gendisk(mdev->vdisk);
3207
3208 /* cleanup stuff that may have been allocated during
3209 * device (re-)configuration or state changes */
3210
3211 if (mdev->this_bdev)
3212 bdput(mdev->this_bdev);
3213
3214 drbd_free_resources(mdev);
3215
3216 drbd_release_ee_lists(mdev);
3217
3218	/* should be freed on disconnect? */
3219 kfree(mdev->ee_hash);
3220 /*
3221 mdev->ee_hash_s = 0;
3222 mdev->ee_hash = NULL;
3223 */
3224
3225 lc_destroy(mdev->act_log);
3226 lc_destroy(mdev->resync);
3227
3228 kfree(mdev->p_uuid);
3229 /* mdev->p_uuid = NULL; */
3230
3231 kfree(mdev->int_dig_out);
3232 kfree(mdev->int_dig_in);
3233 kfree(mdev->int_dig_vv);
3234
3235 /* cleanup the rest that has been
3236 * allocated from drbd_new_device
3237 * and actually free the mdev itself */
3238 drbd_free_mdev(mdev);
3239}
3240
3241static void drbd_cleanup(void)
3242{
3243 unsigned int i;
3244
3245 unregister_reboot_notifier(&drbd_notifier);
3246
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003247 /* first remove proc,
3248	 * drbdsetup uses its presence to detect
3249 * whether DRBD is loaded.
3250	 * If we got stuck in proc removal
3251	 * while netlink was already deregistered,
3252	 * some drbdsetup commands could wait forever
3253 * for an answer.
3254 */
3255 if (drbd_proc)
3256 remove_proc_entry("drbd", NULL);
3257
Philipp Reisnerb411b362009-09-25 16:07:19 -07003258 drbd_nl_cleanup();
3259
3260 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003261 i = minor_count;
3262 while (i--)
3263 drbd_delete_device(i);
3264 drbd_destroy_mempools();
3265 }
3266
3267 kfree(minor_table);
3268
3269 unregister_blkdev(DRBD_MAJOR, "drbd");
3270
3271 printk(KERN_INFO "drbd: module cleanup done.\n");
3272}
3273
3274/**
3275 * drbd_congested() - Callback for pdflush
3276 * @congested_data: User data
3277 * @bdi_bits: Bits pdflush is currently interested in
3278 *
3279 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3280 */
3281static int drbd_congested(void *congested_data, int bdi_bits)
3282{
3283 struct drbd_conf *mdev = congested_data;
3284 struct request_queue *q;
3285 char reason = '-';
3286 int r = 0;
3287
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003288 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003289 /* DRBD has frozen IO */
3290 r = bdi_bits;
3291 reason = 'd';
3292 goto out;
3293 }
3294
3295 if (get_ldev(mdev)) {
3296 q = bdev_get_queue(mdev->ldev->backing_bdev);
3297 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3298 put_ldev(mdev);
3299 if (r)
3300 reason = 'b';
3301 }
3302
3303 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3304 r |= (1 << BDI_async_congested);
3305 reason = reason == 'b' ? 'a' : 'n';
3306 }
3307
3308out:
3309 mdev->congestion_reason = reason;
3310 return r;
3311}
3312
3313struct drbd_conf *drbd_new_device(unsigned int minor)
3314{
3315 struct drbd_conf *mdev;
3316 struct gendisk *disk;
3317 struct request_queue *q;
3318
3319 /* GFP_KERNEL, we are outside of all write-out paths */
3320 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3321 if (!mdev)
3322 return NULL;
3323 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3324 goto out_no_cpumask;
3325
3326 mdev->minor = minor;
3327
3328 drbd_init_set_defaults(mdev);
3329
3330 q = blk_alloc_queue(GFP_KERNEL);
3331 if (!q)
3332 goto out_no_q;
3333 mdev->rq_queue = q;
3334 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003335
3336 disk = alloc_disk(1);
3337 if (!disk)
3338 goto out_no_disk;
3339 mdev->vdisk = disk;
3340
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003341 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003342
3343 disk->queue = q;
3344 disk->major = DRBD_MAJOR;
3345 disk->first_minor = minor;
3346 disk->fops = &drbd_ops;
3347 sprintf(disk->disk_name, "drbd%d", minor);
3348 disk->private_data = mdev;
3349
3350 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3351 /* we have no partitions. we contain only ourselves. */
3352 mdev->this_bdev->bd_contains = mdev->this_bdev;
3353
3354 q->backing_dev_info.congested_fn = drbd_congested;
3355 q->backing_dev_info.congested_data = mdev;
3356
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003357 blk_queue_make_request(q, drbd_make_request);
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003358 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003359 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3360 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003361 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003362
3363 mdev->md_io_page = alloc_page(GFP_KERNEL);
3364 if (!mdev->md_io_page)
3365 goto out_no_io_page;
3366
3367 if (drbd_bm_init(mdev))
3368 goto out_no_bitmap;
3369 /* no need to lock access, we are still initializing this minor device. */
3370 if (!tl_init(mdev))
3371 goto out_no_tl;
3372
3373 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3374 if (!mdev->app_reads_hash)
3375 goto out_no_app_reads;
3376
3377 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3378 if (!mdev->current_epoch)
3379 goto out_no_epoch;
3380
3381 INIT_LIST_HEAD(&mdev->current_epoch->list);
3382 mdev->epochs = 1;
3383
3384 return mdev;
3385
3386/* out_whatever_else:
3387 kfree(mdev->current_epoch); */
3388out_no_epoch:
3389 kfree(mdev->app_reads_hash);
3390out_no_app_reads:
3391 tl_cleanup(mdev);
3392out_no_tl:
3393 drbd_bm_cleanup(mdev);
3394out_no_bitmap:
3395 __free_page(mdev->md_io_page);
3396out_no_io_page:
3397 put_disk(disk);
3398out_no_disk:
3399 blk_cleanup_queue(q);
3400out_no_q:
3401 free_cpumask_var(mdev->cpu_mask);
3402out_no_cpumask:
3403 kfree(mdev);
3404 return NULL;
3405}
3406
3407/* counterpart of drbd_new_device.
3408 * last part of drbd_delete_device. */
3409void drbd_free_mdev(struct drbd_conf *mdev)
3410{
3411 kfree(mdev->current_epoch);
3412 kfree(mdev->app_reads_hash);
3413 tl_cleanup(mdev);
3414 if (mdev->bitmap) /* should no longer be there. */
3415 drbd_bm_cleanup(mdev);
3416 __free_page(mdev->md_io_page);
3417 put_disk(mdev->vdisk);
3418 blk_cleanup_queue(mdev->rq_queue);
3419 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003420 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003421 kfree(mdev);
3422}
3423
3424
3425int __init drbd_init(void)
3426{
3427 int err;
3428
3429 if (sizeof(struct p_handshake) != 80) {
3430 printk(KERN_ERR
3431 "drbd: never change the size or layout "
3432 "of the HandShake packet.\n");
3433 return -EINVAL;
3434 }
3435
3436 if (1 > minor_count || minor_count > 255) {
3437 printk(KERN_ERR
3438 "drbd: invalid minor_count (%d)\n", minor_count);
3439#ifdef MODULE
3440 return -EINVAL;
3441#else
3442 minor_count = 8;
3443#endif
3444 }
3445
3446 err = drbd_nl_init();
3447 if (err)
3448 return err;
3449
3450 err = register_blkdev(DRBD_MAJOR, "drbd");
3451 if (err) {
3452 printk(KERN_ERR
3453 "drbd: unable to register block device major %d\n",
3454 DRBD_MAJOR);
3455 return err;
3456 }
3457
3458 register_reboot_notifier(&drbd_notifier);
3459
3460 /*
3461 * allocate all necessary structs
3462 */
3463 err = -ENOMEM;
3464
3465 init_waitqueue_head(&drbd_pp_wait);
3466
3467 drbd_proc = NULL; /* play safe for drbd_cleanup */
3468 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3469 GFP_KERNEL);
3470 if (!minor_table)
3471 goto Enomem;
3472
3473 err = drbd_create_mempools();
3474 if (err)
3475 goto Enomem;
3476
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003477 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003478 if (!drbd_proc) {
3479 printk(KERN_ERR "drbd: unable to register proc file\n");
3480 goto Enomem;
3481 }
3482
3483 rwlock_init(&global_state_lock);
3484
3485 printk(KERN_INFO "drbd: initialized. "
3486 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3487 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3488 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3489 printk(KERN_INFO "drbd: registered as block device major %d\n",
3490 DRBD_MAJOR);
3491 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3492
3493 return 0; /* Success! */
3494
3495Enomem:
3496 drbd_cleanup();
3497 if (err == -ENOMEM)
3498 /* currently always the case */
3499 printk(KERN_ERR "drbd: ran out of memory\n");
3500 else
3501 printk(KERN_ERR "drbd: initialization failure\n");
3502 return err;
3503}
3504
3505void drbd_free_bc(struct drbd_backing_dev *ldev)
3506{
3507 if (ldev == NULL)
3508 return;
3509
Tejun Heoe525fd82010-11-13 11:55:17 +01003510 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3511 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003512
3513 kfree(ldev);
3514}
3515
3516void drbd_free_sock(struct drbd_conf *mdev)
3517{
3518 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003519 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003520 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3521 sock_release(mdev->data.socket);
3522 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003523 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003524 }
3525 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003526 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003527 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3528 sock_release(mdev->meta.socket);
3529 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003530 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003531 }
3532}
3533
3534
3535void drbd_free_resources(struct drbd_conf *mdev)
3536{
3537 crypto_free_hash(mdev->csums_tfm);
3538 mdev->csums_tfm = NULL;
3539 crypto_free_hash(mdev->verify_tfm);
3540 mdev->verify_tfm = NULL;
3541 crypto_free_hash(mdev->cram_hmac_tfm);
3542 mdev->cram_hmac_tfm = NULL;
3543 crypto_free_hash(mdev->integrity_w_tfm);
3544 mdev->integrity_w_tfm = NULL;
3545 crypto_free_hash(mdev->integrity_r_tfm);
3546 mdev->integrity_r_tfm = NULL;
3547
3548 drbd_free_sock(mdev);
3549
3550 __no_warn(local,
3551 drbd_free_bc(mdev->ldev);
3552 mdev->ldev = NULL;);
3553}
3554
3555/* meta data management */
3556
3557struct meta_data_on_disk {
3558 u64 la_size; /* last agreed size. */
3559 u64 uuid[UI_SIZE]; /* UUIDs. */
3560 u64 device_uuid;
3561 u64 reserved_u64_1;
3562 u32 flags; /* MDF */
3563 u32 magic;
3564 u32 md_size_sect;
3565 u32 al_offset; /* offset to this block */
3566 u32 al_nr_extents; /* important for restoring the AL */
3567 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3568 u32 bm_offset; /* offset to the bitmap, from here */
3569 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3570 u32 reserved_u32[4];
3571
3572} __packed;
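
/* All multi-byte fields above are stored big-endian on disk; note the
 * cpu_to_be32()/cpu_to_be64() conversions in drbd_md_sync() and the
 * matching be32_to_cpu()/be64_to_cpu() calls in drbd_md_read() below. */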
3573
3574/**
3575 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3576 * @mdev: DRBD device.
3577 */
3578void drbd_md_sync(struct drbd_conf *mdev)
3579{
3580 struct meta_data_on_disk *buffer;
3581 sector_t sector;
3582 int i;
3583
Lars Ellenbergee15b032010-09-03 10:00:09 +02003584 del_timer(&mdev->md_sync_timer);
3585 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003586 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3587 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003588
3589	/* We use D_FAILED here, and not D_ATTACHING, because we try to write
3590	 * metadata even if we detach due to a disk failure! */
3591 if (!get_ldev_if_state(mdev, D_FAILED))
3592 return;
3593
Philipp Reisnerb411b362009-09-25 16:07:19 -07003594 mutex_lock(&mdev->md_io_mutex);
3595 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3596 memset(buffer, 0, 512);
3597
3598 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3599 for (i = UI_CURRENT; i < UI_SIZE; i++)
3600 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3601 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3602 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3603
3604 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3605 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3606 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3607 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3608 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3609
3610 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3611
3612 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3613 sector = mdev->ldev->md.md_offset;
3614
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003615 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003616 /* this was a try anyways ... */
3617 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003618 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003619 }
3620
3621 /* Update mdev->ldev->md.la_size_sect,
3622	 * since we just updated it in the on-disk meta data. */
3623 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3624
3625 mutex_unlock(&mdev->md_io_mutex);
3626 put_ldev(mdev);
3627}
3628
3629/**
3630 * drbd_md_read() - Reads in the meta data super block
3631 * @mdev: DRBD device.
3632 * @bdev: Device from which the meta data should be read in.
3633 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003634 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003635 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3636 */
3637int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3638{
3639 struct meta_data_on_disk *buffer;
3640 int i, rv = NO_ERROR;
3641
3642 if (!get_ldev_if_state(mdev, D_ATTACHING))
3643 return ERR_IO_MD_DISK;
3644
Philipp Reisnerb411b362009-09-25 16:07:19 -07003645 mutex_lock(&mdev->md_io_mutex);
3646 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3647
3648 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3649		/* NOTE: can't do normal error processing here, as this is
3650		   called BEFORE the disk is attached */
3651 dev_err(DEV, "Error while reading metadata.\n");
3652 rv = ERR_IO_MD_DISK;
3653 goto err;
3654 }
3655
3656 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3657 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3658 rv = ERR_MD_INVALID;
3659 goto err;
3660 }
3661 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3662 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3663 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3664 rv = ERR_MD_INVALID;
3665 goto err;
3666 }
3667 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3668 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3669 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3670 rv = ERR_MD_INVALID;
3671 goto err;
3672 }
3673 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3674 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3675 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3676 rv = ERR_MD_INVALID;
3677 goto err;
3678 }
3679
3680 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3681 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3682 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3683 rv = ERR_MD_INVALID;
3684 goto err;
3685 }
3686
3687 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3688 for (i = UI_CURRENT; i < UI_SIZE; i++)
3689 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3690 bdev->md.flags = be32_to_cpu(buffer->flags);
3691 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3692 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3693
3694 if (mdev->sync_conf.al_extents < 7)
3695 mdev->sync_conf.al_extents = 127;
3696
3697 err:
3698 mutex_unlock(&mdev->md_io_mutex);
3699 put_ldev(mdev);
3700
3701 return rv;
3702}
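
/*
 * Illustrative sketch, not part of the driver: a caller on the attach
 * path would read the super block roughly like this and refuse to
 * continue with anything but NO_ERROR.  The helper name is made up
 * for this example.
 */
static inline int example_read_md_on_attach(struct drbd_conf *mdev,
					    struct drbd_backing_dev *bdev)
{
	int rv = drbd_md_read(mdev, bdev);

	if (rv != NO_ERROR)
		dev_err(DEV, "example: reading meta data failed (%d)\n", rv);
	return rv;
}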
3703
Lars Ellenbergac724122010-10-07 15:18:08 +02003704static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3705{
3706 static char *uuid_str[UI_EXTENDED_SIZE] = {
3707 [UI_CURRENT] = "CURRENT",
3708 [UI_BITMAP] = "BITMAP",
3709 [UI_HISTORY_START] = "HISTORY_START",
3710 [UI_HISTORY_END] = "HISTORY_END",
3711 [UI_SIZE] = "SIZE",
3712 [UI_FLAGS] = "FLAGS",
3713 };
3714
3715 if (index >= UI_EXTENDED_SIZE) {
3716 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3717 return;
3718 }
3719
3720 dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3721 uuid_str[index],
3722 (unsigned long long)mdev->ldev->md.uuid[index]);
3723}
3724
3725
Philipp Reisnerb411b362009-09-25 16:07:19 -07003726/**
3727 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3728 * @mdev: DRBD device.
3729 *
3730 * Call this function if you change anything that should be written to
3731 * the meta-data super block. This function sets MD_DIRTY, and starts a
3732 * timer that ensures drbd_md_sync() gets called within five seconds.
3733 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003734#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003735void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3736{
3737 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3738 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3739 mdev->last_md_mark_dirty.line = line;
3740 mdev->last_md_mark_dirty.func = func;
3741 }
3742}
3743#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003744void drbd_md_mark_dirty(struct drbd_conf *mdev)
3745{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003746 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003747 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003748}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003749#endif
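
/*
 * Illustrative sketch, not part of the driver: the usual pattern is to
 * change some on-disk state while holding a local-disk reference and
 * then mark the super block dirty; the md_sync_timer armed above makes
 * sure drbd_md_sync() runs soon afterwards.  The helper name and the
 * device_uuid update are made up for this example.
 */
static inline void example_change_md(struct drbd_conf *mdev, u64 uuid)
{
	if (!get_ldev_if_state(mdev, D_FAILED))
		return;
	mdev->ldev->md.device_uuid = uuid;	/* some metadata change */
	drbd_md_mark_dirty(mdev);		/* flushed out by the timer */
	put_ldev(mdev);
}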
Philipp Reisnerb411b362009-09-25 16:07:19 -07003750
3751static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3752{
3753 int i;
3754
Lars Ellenbergac724122010-10-07 15:18:08 +02003755 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003756 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Lars Ellenbergac724122010-10-07 15:18:08 +02003757 debug_drbd_uuid(mdev, i+1);
3758 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003759}
3760
3761void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3762{
3763 if (idx == UI_CURRENT) {
3764 if (mdev->state.role == R_PRIMARY)
3765 val |= 1;
3766 else
3767 val &= ~((u64)1);
3768
3769 drbd_set_ed_uuid(mdev, val);
3770 }
3771
3772 mdev->ldev->md.uuid[idx] = val;
Lars Ellenbergac724122010-10-07 15:18:08 +02003773 debug_drbd_uuid(mdev, idx);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003774 drbd_md_mark_dirty(mdev);
3775}
3776
3777
3778void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3779{
3780 if (mdev->ldev->md.uuid[idx]) {
3781 drbd_uuid_move_history(mdev);
3782 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Lars Ellenbergac724122010-10-07 15:18:08 +02003783 debug_drbd_uuid(mdev, UI_HISTORY_START);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003784 }
3785 _drbd_uuid_set(mdev, idx, val);
3786}
3787
3788/**
3789 * drbd_uuid_new_current() - Creates a new current UUID
3790 * @mdev: DRBD device.
3791 *
3792 * Creates a new current UUID, and rotates the old current UUID into
3793 * the bitmap slot. Causes an incremental resync upon next connect.
3794 */
3795void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3796{
3797 u64 val;
3798
3799 dev_info(DEV, "Creating new current UUID\n");
3800 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3801 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Lars Ellenbergac724122010-10-07 15:18:08 +02003802 debug_drbd_uuid(mdev, UI_BITMAP);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003803
3804 get_random_bytes(&val, sizeof(u64));
3805 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003806 /* get it to stable storage _now_ */
3807 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003808}
3809
3810void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3811{
3812 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3813 return;
3814
3815 if (val == 0) {
3816 drbd_uuid_move_history(mdev);
3817 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3818 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Lars Ellenbergac724122010-10-07 15:18:08 +02003819 debug_drbd_uuid(mdev, UI_HISTORY_START);
3820 debug_drbd_uuid(mdev, UI_BITMAP);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003821 } else {
3822 if (mdev->ldev->md.uuid[UI_BITMAP])
3823 dev_warn(DEV, "bm UUID already set");
3824
3825 mdev->ldev->md.uuid[UI_BITMAP] = val;
3826 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3827
Lars Ellenbergac724122010-10-07 15:18:08 +02003828 debug_drbd_uuid(mdev, UI_BITMAP);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003829 }
3830 drbd_md_mark_dirty(mdev);
3831}
3832
3833/**
3834 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3835 * @mdev: DRBD device.
3836 *
3837 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3838 */
3839int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3840{
3841 int rv = -EIO;
3842
3843 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3844 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3845 drbd_md_sync(mdev);
3846 drbd_bm_set_all(mdev);
3847
3848 rv = drbd_bm_write(mdev);
3849
3850 if (!rv) {
3851 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3852 drbd_md_sync(mdev);
3853 }
3854
3855 put_ldev(mdev);
3856 }
3857
3858 return rv;
3859}
3860
3861/**
3862 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3863 * @mdev: DRBD device.
3864 *
3865 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3866 */
3867int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3868{
3869 int rv = -EIO;
3870
Philipp Reisner07782862010-08-31 12:00:50 +02003871 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003872 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3873 drbd_bm_clear_all(mdev);
3874 rv = drbd_bm_write(mdev);
3875 put_ldev(mdev);
3876 }
3877
3878 return rv;
3879}
3880
3881static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3882{
3883 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003884 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003885
3886 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3887
Lars Ellenberg02851e92010-12-16 14:47:39 +01003888 if (get_ldev(mdev)) {
3889 drbd_bm_lock(mdev, work->why);
3890 rv = work->io_fn(mdev);
3891 drbd_bm_unlock(mdev);
3892 put_ldev(mdev);
3893 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003894
3895 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003896 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07003897 wake_up(&mdev->misc_wait);
3898
3899 if (work->done)
3900 work->done(mdev, rv);
3901
3902 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3903 work->why = NULL;
3904
3905 return 1;
3906}
3907
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003908void drbd_ldev_destroy(struct drbd_conf *mdev)
3909{
3910 lc_destroy(mdev->resync);
3911 mdev->resync = NULL;
3912 lc_destroy(mdev->act_log);
3913 mdev->act_log = NULL;
3914 __no_warn(local,
3915 drbd_free_bc(mdev->ldev);
3916 mdev->ldev = NULL;);
3917
3918 if (mdev->md_io_tmpp) {
3919 __free_page(mdev->md_io_tmpp);
3920 mdev->md_io_tmpp = NULL;
3921 }
3922 clear_bit(GO_DISKLESS, &mdev->flags);
3923}
3924
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003925static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3926{
3927 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02003928 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3929 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003930 * the protected members anymore, though, so once put_ldev reaches zero
3931 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003932 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003933 return 1;
3934}
3935
3936void drbd_go_diskless(struct drbd_conf *mdev)
3937{
3938 D_ASSERT(mdev->state.disk == D_FAILED);
3939 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02003940 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003941}
3942
Philipp Reisnerb411b362009-09-25 16:07:19 -07003943/**
3944 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3945 * @mdev: DRBD device.
3946 * @io_fn: IO callback to be called when bitmap IO is possible
3947 * @done: callback to be called after the bitmap IO was performed
3948 * @why: Descriptive text of the reason for doing the IO
3949 *
3950 * While IO on the bitmap happens, application IO is frozen; this ensures
3951 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3952 * called from worker context. It MUST NOT be used while a previous such
3953 * work is still pending!
3954 */
3955void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3956 int (*io_fn)(struct drbd_conf *),
3957 void (*done)(struct drbd_conf *, int),
3958 char *why)
3959{
3960 D_ASSERT(current == mdev->worker.task);
3961
3962 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3963 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3964 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3965 if (mdev->bm_io_work.why)
3966 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3967 why, mdev->bm_io_work.why);
3968
3969 mdev->bm_io_work.io_fn = io_fn;
3970 mdev->bm_io_work.done = done;
3971 mdev->bm_io_work.why = why;
3972
Philipp Reisner22afd7e2010-11-16 15:30:44 +01003973 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003974 set_bit(BITMAP_IO, &mdev->flags);
3975 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01003976 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07003977 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003978 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01003979 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003980}
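
/*
 * Illustrative sketch, not part of the driver: from worker context a
 * full "set all bits and write out the bitmap" pass can be queued like
 * this.  Instead of NULL, a done callback may be passed; it is invoked
 * later with the io_fn result.  Helper name and why-string are made up
 * for this example.
 */
static inline void example_queue_full_bitmap_write(struct drbd_conf *mdev)
{
	/* only valid from worker context, see the D_ASSERT above */
	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
			     "example: set_n_write");
}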
3981
3982/**
3983 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3984 * @mdev: DRBD device.
3985 * @io_fn: IO callback to be called when bitmap IO is possible
3986 * @why: Descriptive text of the reason for doing the IO
3987 *
3988 * Freezes application IO while the actual IO operation runs. This
3989 * function MAY NOT be called from worker context.
3990 */
3991int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3992{
3993 int rv;
3994
3995 D_ASSERT(current != mdev->worker.task);
3996
3997 drbd_suspend_io(mdev);
3998
3999 drbd_bm_lock(mdev, why);
4000 rv = io_fn(mdev);
4001 drbd_bm_unlock(mdev);
4002
4003 drbd_resume_io(mdev);
4004
4005 return rv;
4006}
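
/*
 * Illustrative sketch, not part of the driver: the synchronous variant
 * for callers outside of worker context.  Application IO is suspended,
 * the io_fn runs under the bitmap lock, and its result is returned.
 * Helper name and why-string are made up for this example.
 */
static inline int example_clear_bitmap_now(struct drbd_conf *mdev)
{
	return drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
			      "example: clear_n_write");
}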
4007
4008void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4009{
4010 if ((mdev->ldev->md.flags & flag) != flag) {
4011 drbd_md_mark_dirty(mdev);
4012 mdev->ldev->md.flags |= flag;
4013 }
4014}
4015
4016void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4017{
4018 if ((mdev->ldev->md.flags & flag) != 0) {
4019 drbd_md_mark_dirty(mdev);
4020 mdev->ldev->md.flags &= ~flag;
4021 }
4022}
4023int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4024{
4025 return (bdev->md.flags & flag) != 0;
4026}
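
/*
 * Illustrative sketch, not part of the driver: querying a persistent
 * MDF_* flag, e.g. to see whether a full sync was flagged by
 * drbd_bmio_set_n_write() above.  The caller must hold a local-disk
 * reference (get_ldev) so that mdev->ldev is valid; the helper name is
 * made up for this example.
 */
static inline int example_full_sync_flagged(struct drbd_conf *mdev)
{
	return drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC);
}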
4027
4028static void md_sync_timer_fn(unsigned long data)
4029{
4030 struct drbd_conf *mdev = (struct drbd_conf *) data;
4031
4032 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4033}
4034
4035static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4036{
4037 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004038#ifdef DEBUG
4039 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4040 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4041#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004042 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004043 return 1;
4044}
4045
4046#ifdef CONFIG_DRBD_FAULT_INJECTION
4047/* Fault insertion support including random number generator shamelessly
4048 * stolen from kernel/rcutorture.c */
4049struct fault_random_state {
4050 unsigned long state;
4051 unsigned long count;
4052};
4053
4054#define FAULT_RANDOM_MULT 39916801 /* prime */
4055#define FAULT_RANDOM_ADD 479001701 /* prime */
4056#define FAULT_RANDOM_REFRESH 10000
4057
4058/*
4059 * Crude but fast random-number generator. Uses a linear congruential
4060 * generator, with occasional help from get_random_bytes().
4061 */
4062static unsigned long
4063_drbd_fault_random(struct fault_random_state *rsp)
4064{
4065 long refresh;
4066
Roel Kluin49829ea2009-12-15 22:55:44 +01004067 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004068 get_random_bytes(&refresh, sizeof(refresh));
4069 rsp->state += refresh;
4070 rsp->count = FAULT_RANDOM_REFRESH;
4071 }
4072 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4073 return swahw32(rsp->state);
4074}
4075
4076static char *
4077_drbd_fault_str(unsigned int type) {
4078 static char *_faults[] = {
4079 [DRBD_FAULT_MD_WR] = "Meta-data write",
4080 [DRBD_FAULT_MD_RD] = "Meta-data read",
4081 [DRBD_FAULT_RS_WR] = "Resync write",
4082 [DRBD_FAULT_RS_RD] = "Resync read",
4083 [DRBD_FAULT_DT_WR] = "Data write",
4084 [DRBD_FAULT_DT_RD] = "Data read",
4085 [DRBD_FAULT_DT_RA] = "Data read ahead",
4086 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004087 [DRBD_FAULT_AL_EE] = "EE allocation",
4088 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004089 };
4090
4091 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4092}
4093
4094unsigned int
4095_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4096{
4097 static struct fault_random_state rrs = {0, 0};
4098
4099 unsigned int ret = (
4100 (fault_devs == 0 ||
4101 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4102 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4103
4104 if (ret) {
4105 fault_count++;
4106
Lars Ellenberg73835062010-05-27 11:51:56 +02004107 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004108 dev_warn(DEV, "***Simulating %s failure\n",
4109 _drbd_fault_str(type));
4110 }
4111
4112 return ret;
4113}
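
/*
 * Illustrative sketch, not part of the driver: an IO submission path
 * could gate a simulated meta-data write failure like this.  Whether
 * the fault actually triggers depends on the fault_rate and fault_devs
 * parameters checked in _drbd_insert_fault() above; the helper name is
 * made up for this example.
 */
static inline int example_simulate_md_write_fault(struct drbd_conf *mdev)
{
	return _drbd_insert_fault(mdev, DRBD_FAULT_MD_WR) != 0;
}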
4114#endif
4115
4116const char *drbd_buildtag(void)
4117{
4118	/* DRBD built from external sources carries a reference to the
4119	   git hash of its source code here. */
4120
4121 static char buildtag[38] = "\0uilt-in";
4122
4123 if (buildtag[0] == 0) {
4124#ifdef CONFIG_MODULES
4125 if (THIS_MODULE != NULL)
4126 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4127 else
4128#endif
4129 buildtag[0] = 'b';
4130 }
4131
4132 return buildtag;
4133}
4134
4135module_init(drbd_init)
4136module_exit(drbd_cleanup)
4137
Philipp Reisnerb411b362009-09-25 16:07:19 -07004138EXPORT_SYMBOL(drbd_conn_str);
4139EXPORT_SYMBOL(drbd_role_str);
4140EXPORT_SYMBOL(drbd_disk_str);
4141EXPORT_SYMBOL(drbd_set_st_err_str);