/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = 32;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

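/* illustrative usage: "modprobe drbd minor_count=8 disable_sendpage=1",
 * or "drbd.minor_count=8" on the kernel command line when drbd is built in */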
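/* rate-limit state for DRBD's noisier messages:
 * at most 5 messages (burst) per 5*HZ jiffies, i.e. per 5 seconds */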
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		       unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruption of
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violate write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}

/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		      union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	int rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *,
				     union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
					      union drbd_state mask, union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	int rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = 0; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform a possibly cluster-wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static int drbd_req_state(struct drbd_conf *mdev,
			  union drbd_state mask, union drbd_state val,
			  enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	int rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
			union drbd_state val, enum chg_state_flags f)
{
	int rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev,
	union drbd_state os, union drbd_state ns, int err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str

#define drbd_susp_str(A)     ((A) ? "1" : "0")
#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
#define drbd_user_isp_str(A) ((A) ? "1" : "0")

#define PSC(A) \
	({ if (ns.A != os.A) { \
		pbp += sprintf(pbp, #A "( %s -> %s ) ", \
			      drbd_##A##_str(os.A), \
			      drbd_##A##_str(ns.A)); \
	} })

/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	int rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static int is_valid_state_transition(struct drbd_conf *mdev,
				     union drbd_state ns, union drbd_state os)
{
	int rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	return rv;
}

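/* e.g. PSC(role) appends "role( Secondary -> Primary ) " to the state-change
 * summary line that __drbd_set_state() prints below */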
/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort:
 *
 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
815static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
Lars Ellenberg02bc7172010-09-06 12:13:20 +0200816 union drbd_state ns, const char **warn_sync_abort)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700817{
818 enum drbd_fencing_p fp;
819
820 fp = FP_DONT_CARE;
821 if (get_ldev(mdev)) {
822 fp = mdev->ldev->dc.fencing;
823 put_ldev(mdev);
824 }
825
826 /* Disallow Network errors to configure a device's network part */
827 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
828 os.conn <= C_DISCONNECTING)
829 ns.conn = os.conn;
830
Lars Ellenbergf2906e12010-07-21 17:04:32 +0200831 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
832 * If you try to go into some Sync* state, that shall fail (elsewhere). */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700833 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
Lars Ellenbergf2906e12010-07-21 17:04:32 +0200834 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700835 ns.conn = os.conn;
836
837 /* After C_DISCONNECTING only C_STANDALONE may follow */
838 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
839 ns.conn = os.conn;
840
841 if (ns.conn < C_CONNECTED) {
842 ns.peer_isp = 0;
843 ns.peer = R_UNKNOWN;
844 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
845 ns.pdsk = D_UNKNOWN;
846 }
847
848 /* Clear the aftr_isp when becoming unconfigured */
849 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
850 ns.aftr_isp = 0;
851
Philipp Reisnerb411b362009-09-25 16:07:19 -0700852 /* Abort resync if a disk fails/detaches */
853 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
854 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
855 if (warn_sync_abort)
Lars Ellenberg02bc7172010-09-06 12:13:20 +0200856 *warn_sync_abort =
857 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
858 "Online-verify" : "Resync";
Philipp Reisnerb411b362009-09-25 16:07:19 -0700859 ns.conn = C_CONNECTED;
860 }
861
862 if (ns.conn >= C_CONNECTED &&
863 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
864 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
865 switch (ns.conn) {
866 case C_WF_BITMAP_T:
867 case C_PAUSED_SYNC_T:
868 ns.disk = D_OUTDATED;
869 break;
870 case C_CONNECTED:
871 case C_WF_BITMAP_S:
872 case C_SYNC_SOURCE:
873 case C_PAUSED_SYNC_S:
874 ns.disk = D_UP_TO_DATE;
875 break;
876 case C_SYNC_TARGET:
877 ns.disk = D_INCONSISTENT;
878 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
879 break;
880 }
881 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
882 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
883 }
884
885 if (ns.conn >= C_CONNECTED &&
886 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
887 switch (ns.conn) {
888 case C_CONNECTED:
889 case C_WF_BITMAP_T:
890 case C_PAUSED_SYNC_T:
891 case C_SYNC_TARGET:
892 ns.pdsk = D_UP_TO_DATE;
893 break;
894 case C_WF_BITMAP_S:
895 case C_PAUSED_SYNC_S:
Lars Ellenberge0f83012010-04-01 15:13:19 +0200896 /* remap any consistent state to D_OUTDATED,
897 * but disallow "upgrade" of not even consistent states.
898 */
899 ns.pdsk =
900 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
901 ? os.pdsk : D_OUTDATED;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700902 break;
903 case C_SYNC_SOURCE:
904 ns.pdsk = D_INCONSISTENT;
905 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
906 break;
907 }
908 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
909 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
910 }
911
912 /* Connection breaks down before we finished "Negotiating" */
913 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
914 get_ldev_if_state(mdev, D_NEGOTIATING)) {
915 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
916 ns.disk = mdev->new_state_tmp.disk;
917 ns.pdsk = mdev->new_state_tmp.pdsk;
918 } else {
919 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
920 ns.disk = D_DISKLESS;
921 ns.pdsk = D_UNKNOWN;
922 }
923 put_ldev(mdev);
924 }
925
926 if (fp == FP_STONITH &&
Philipp Reisner0a492162009-10-21 13:08:29 +0200927 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
928 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
Philipp Reisnerfb22c402010-09-08 23:20:21 +0200929 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
Philipp Reisner265be2d2010-05-31 10:14:17 +0200930
931 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
932 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
933 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
Philipp Reisnerfb22c402010-09-08 23:20:21 +0200934 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700935
936 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
937 if (ns.conn == C_SYNC_SOURCE)
938 ns.conn = C_PAUSED_SYNC_S;
939 if (ns.conn == C_SYNC_TARGET)
940 ns.conn = C_PAUSED_SYNC_T;
941 } else {
942 if (ns.conn == C_PAUSED_SYNC_S)
943 ns.conn = C_SYNC_SOURCE;
944 if (ns.conn == C_PAUSED_SYNC_T)
945 ns.conn = C_SYNC_TARGET;
946 }
947
948 return ns;
949}
950
951/* helper for __drbd_set_state */
952static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
953{
954 if (cs == C_VERIFY_T) {
955 /* starting online verify from an arbitrary position
956 * does not fit well into the existing protocol.
957 * on C_VERIFY_T, we initialize ov_left and friends
958 * implicitly in receive_DataRequest once the
959 * first P_OV_REQUEST is received */
960 mdev->ov_start_sector = ~(sector_t)0;
961 } else {
962 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
963 if (bit >= mdev->rs_total)
964 mdev->ov_start_sector =
965 BM_BIT_TO_SECT(mdev->rs_total - 1);
966 mdev->ov_position = mdev->ov_start_sector;
967 }
968}
969
Philipp Reisner07782862010-08-31 12:00:50 +0200970static void drbd_resume_al(struct drbd_conf *mdev)
971{
972 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
973 dev_info(DEV, "Resumed AL updates\n");
974}
975
Philipp Reisnerb411b362009-09-25 16:07:19 -0700976/**
977 * __drbd_set_state() - Set a new DRBD state
978 * @mdev: DRBD device.
979 * @ns: new state.
980 * @flags: Flags
981 * @done: Optional completion, that will get completed after the after_state_ch() finished
982 *
983 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
984 */
985int __drbd_set_state(struct drbd_conf *mdev,
986 union drbd_state ns, enum chg_state_flags flags,
987 struct completion *done)
988{
989 union drbd_state os;
990 int rv = SS_SUCCESS;
Lars Ellenberg02bc7172010-09-06 12:13:20 +0200991 const char *warn_sync_abort = NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700992 struct after_state_chg_work *ascw;
993
994 os = mdev->state;
995
996 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
997
998 if (ns.i == os.i)
999 return SS_NOTHING_TO_DO;
1000
1001 if (!(flags & CS_HARD)) {
1002 /* pre-state-change checks ; only look at ns */
1003 /* See drbd_state_sw_errors in drbd_strings.c */
1004
1005 rv = is_valid_state(mdev, ns);
1006 if (rv < SS_SUCCESS) {
1007 /* If the old state was illegal as well, then let
1008 this happen...*/
1009
Philipp Reisner1616a252010-06-10 16:55:15 +02001010 if (is_valid_state(mdev, os) == rv)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001011 rv = is_valid_state_transition(mdev, ns, os);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001012 } else
1013 rv = is_valid_state_transition(mdev, ns, os);
1014 }
1015
1016 if (rv < SS_SUCCESS) {
1017 if (flags & CS_VERBOSE)
1018 print_st_err(mdev, os, ns, rv);
1019 return rv;
1020 }
1021
1022 if (warn_sync_abort)
Lars Ellenberg02bc7172010-09-06 12:13:20 +02001023 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001024
1025 {
1026 char *pbp, pb[300];
1027 pbp = pb;
1028 *pbp = 0;
1029 PSC(role);
1030 PSC(peer);
1031 PSC(conn);
1032 PSC(disk);
1033 PSC(pdsk);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001034 if (is_susp(ns) != is_susp(os))
1035 pbp += sprintf(pbp, "susp( %s -> %s ) ",
1036 drbd_susp_str(is_susp(os)),
1037 drbd_susp_str(is_susp(ns)));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001038 PSC(aftr_isp);
1039 PSC(peer_isp);
1040 PSC(user_isp);
1041 dev_info(DEV, "%s\n", pb);
1042 }
1043
1044 /* solve the race between becoming unconfigured,
1045 * worker doing the cleanup, and
1046 * admin reconfiguring us:
1047 * on (re)configure, first set CONFIG_PENDING,
1048 * then wait for a potentially exiting worker,
1049 * start the worker, and schedule one no_op.
1050 * then proceed with configuration.
1051 */
1052 if (ns.disk == D_DISKLESS &&
1053 ns.conn == C_STANDALONE &&
1054 ns.role == R_SECONDARY &&
1055 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1056 set_bit(DEVICE_DYING, &mdev->flags);
1057
1058 mdev->state.i = ns.i;
1059 wake_up(&mdev->misc_wait);
1060 wake_up(&mdev->state_wait);
1061
Philipp Reisnerb411b362009-09-25 16:07:19 -07001062 /* aborted verify run. log the last position */
1063 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1064 ns.conn < C_CONNECTED) {
1065 mdev->ov_start_sector =
1066 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1067 dev_info(DEV, "Online Verify reached sector %llu\n",
1068 (unsigned long long)mdev->ov_start_sector);
1069 }
1070
1071 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1072 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1073 dev_info(DEV, "Syncer continues.\n");
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001074 mdev->rs_paused += (long)jiffies
1075 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
Philipp Reisner63106d32010-09-01 15:47:15 +02001076 if (ns.conn == C_SYNC_TARGET)
1077 mod_timer(&mdev->resync_timer, jiffies);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001078 }
1079
1080 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1081 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1082 dev_info(DEV, "Resync suspended\n");
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001083 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001084 }
1085
1086 if (os.conn == C_CONNECTED &&
1087 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001088 unsigned long now = jiffies;
1089 int i;
1090
Philipp Reisnerb411b362009-09-25 16:07:19 -07001091 mdev->ov_position = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001092 mdev->rs_total = drbd_bm_bits(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001093 if (mdev->agreed_pro_version >= 90)
1094 set_ov_position(mdev, ns.conn);
1095 else
1096 mdev->ov_start_sector = 0;
1097 mdev->ov_left = mdev->rs_total
1098 - BM_SECT_TO_BIT(mdev->ov_position);
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001099 mdev->rs_start = now;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001100 mdev->rs_last_events = 0;
1101 mdev->rs_last_sect_ev = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001102 mdev->ov_last_oos_size = 0;
1103 mdev->ov_last_oos_start = 0;
1104
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001105 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1106 mdev->rs_mark_left[i] = mdev->rs_total;
1107 mdev->rs_mark_time[i] = now;
1108 }
1109
Philipp Reisnerb411b362009-09-25 16:07:19 -07001110 if (ns.conn == C_VERIFY_S) {
1111 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1112 (unsigned long long)mdev->ov_position);
1113 mod_timer(&mdev->resync_timer, jiffies);
1114 }
1115 }
1116
1117 if (get_ldev(mdev)) {
1118 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1119 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1120 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1121
1122 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1123 mdf |= MDF_CRASHED_PRIMARY;
1124 if (mdev->state.role == R_PRIMARY ||
1125 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1126 mdf |= MDF_PRIMARY_IND;
1127 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1128 mdf |= MDF_CONNECTED_IND;
1129 if (mdev->state.disk > D_INCONSISTENT)
1130 mdf |= MDF_CONSISTENT;
1131 if (mdev->state.disk > D_OUTDATED)
1132 mdf |= MDF_WAS_UP_TO_DATE;
1133 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1134 mdf |= MDF_PEER_OUT_DATED;
1135 if (mdf != mdev->ldev->md.flags) {
1136 mdev->ldev->md.flags = mdf;
1137 drbd_md_mark_dirty(mdev);
1138 }
1139 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1140 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1141 put_ldev(mdev);
1142 }
1143
1144 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1145 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1146 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1147 set_bit(CONSIDER_RESYNC, &mdev->flags);
1148
1149 /* Receiver should clean up itself */
1150 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1151 drbd_thread_stop_nowait(&mdev->receiver);
1152
1153 /* Now the receiver finished cleaning up itself, it should die */
1154 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1155 drbd_thread_stop_nowait(&mdev->receiver);
1156
1157 /* Upon network failure, we need to restart the receiver. */
1158 if (os.conn > C_TEAR_DOWN &&
1159 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1160 drbd_thread_restart_nowait(&mdev->receiver);
1161
Philipp Reisner07782862010-08-31 12:00:50 +02001162 /* Resume AL writing if we get a connection */
1163 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1164 drbd_resume_al(mdev);
1165
Philipp Reisnerb411b362009-09-25 16:07:19 -07001166 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1167 if (ascw) {
1168 ascw->os = os;
1169 ascw->ns = ns;
1170 ascw->flags = flags;
1171 ascw->w.cb = w_after_state_ch;
1172 ascw->done = done;
1173 drbd_queue_work(&mdev->data.work, &ascw->w);
1174 } else {
1175 dev_warn(DEV, "Could not kmalloc an ascw\n");
1176 }
1177
1178 return rv;
1179}
1180
1181static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1182{
1183 struct after_state_chg_work *ascw =
1184 container_of(w, struct after_state_chg_work, w);
1185 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1186 if (ascw->flags & CS_WAIT_COMPLETE) {
1187 D_ASSERT(ascw->done != NULL);
1188 complete(ascw->done);
1189 }
1190 kfree(ascw);
1191
1192 return 1;
1193}
1194
1195static void abw_start_sync(struct drbd_conf *mdev, int rv)
1196{
1197 if (rv) {
1198 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1199 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1200 return;
1201 }
1202
1203 switch (mdev->state.conn) {
1204 case C_STARTING_SYNC_T:
1205 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1206 break;
1207 case C_STARTING_SYNC_S:
1208 drbd_start_resync(mdev, C_SYNC_SOURCE);
1209 break;
1210 }
1211}
1212
1213/**
1214 * after_state_ch() - Perform after state change actions that may sleep
1215 * @mdev: DRBD device.
1216 * @os: old state.
1217 * @ns: new state.
1218 * @flags: Flags
1219 */
1220static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1221 union drbd_state ns, enum chg_state_flags flags)
1222{
1223 enum drbd_fencing_p fp;
Philipp Reisner67098932010-06-24 16:24:25 +02001224 enum drbd_req_event what = nothing;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001225 union drbd_state nsm = (union drbd_state){ .i = -1 };
Philipp Reisnerb411b362009-09-25 16:07:19 -07001226
1227 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1228 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1229 if (mdev->p_uuid)
1230 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1231 }
1232
1233 fp = FP_DONT_CARE;
1234 if (get_ldev(mdev)) {
1235 fp = mdev->ldev->dc.fencing;
1236 put_ldev(mdev);
1237 }
1238
1239 /* Inform userspace about the change... */
1240 drbd_bcast_state(mdev, ns);
1241
1242 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1243 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1244 drbd_khelper(mdev, "pri-on-incon-degr");
1245
1246 /* Here we have the actions that are performed after a
1247 state change. This function might sleep */
1248
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001249 nsm.i = -1;
1250 if (ns.susp_nod) {
Philipp Reisner265be2d2010-05-31 10:14:17 +02001251 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
Philipp Reisner67098932010-06-24 16:24:25 +02001252 if (ns.conn == C_CONNECTED)
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001253 what = resend, nsm.susp_nod = 0;
Philipp Reisner67098932010-06-24 16:24:25 +02001254 else /* ns.conn > C_CONNECTED */
				dev_err(DEV, "Unexpected Resync going on!\n");
1256 }
1257
Philipp Reisner67098932010-06-24 16:24:25 +02001258 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001259 what = restart_frozen_disk_io, nsm.susp_nod = 0;
1260
Philipp Reisner265be2d2010-05-31 10:14:17 +02001261 }
1262
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001263 if (ns.susp_fen) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001264 /* case1: The outdate peer handler is successful: */
1265 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001266 tl_clear(mdev);
Philipp Reisner43a51822010-06-11 11:26:34 +02001267 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1268 drbd_uuid_new_current(mdev);
1269 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner43a51822010-06-11 11:26:34 +02001270 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001271 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001272 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001273 spin_unlock_irq(&mdev->req_lock);
1274 }
Philipp Reisner43a51822010-06-11 11:26:34 +02001275 /* case2: The connection was established again: */
1276 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1277 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner67098932010-06-24 16:24:25 +02001278 what = resend;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001279 nsm.susp_fen = 0;
Philipp Reisner43a51822010-06-11 11:26:34 +02001280 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001281 }
Philipp Reisner67098932010-06-24 16:24:25 +02001282
1283 if (what != nothing) {
1284 spin_lock_irq(&mdev->req_lock);
1285 _tl_restart(mdev, what);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001286 nsm.i &= mdev->state.i;
1287 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
Philipp Reisner67098932010-06-24 16:24:25 +02001288 spin_unlock_irq(&mdev->req_lock);
1289 }
1290
Philipp Reisnerb411b362009-09-25 16:07:19 -07001291 /* Do not change the order of the if above and the two below... */
1292 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1293 drbd_send_uuids(mdev);
1294 drbd_send_state(mdev);
1295 }
1296 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1297 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1298
1299 /* Lost contact to peer's copy of the data */
1300 if ((os.pdsk >= D_INCONSISTENT &&
1301 os.pdsk != D_UNKNOWN &&
1302 os.pdsk != D_OUTDATED)
1303 && (ns.pdsk < D_INCONSISTENT ||
1304 ns.pdsk == D_UNKNOWN ||
1305 ns.pdsk == D_OUTDATED)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001306 if (get_ldev(mdev)) {
1307 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001308 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001309 if (is_susp(mdev->state)) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001310 set_bit(NEW_CUR_UUID, &mdev->flags);
1311 } else {
1312 drbd_uuid_new_current(mdev);
1313 drbd_send_uuids(mdev);
1314 }
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001315 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001316 put_ldev(mdev);
1317 }
1318 }
1319
1320 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001321 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001322 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001323 drbd_send_uuids(mdev);
1324 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001325
1326 /* D_DISKLESS Peer becomes secondary */
1327 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1328 drbd_al_to_on_disk_bm(mdev);
1329 put_ldev(mdev);
1330 }
1331
1332 /* Last part of the attaching process ... */
1333 if (ns.conn >= C_CONNECTED &&
1334 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001335 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001336 drbd_send_uuids(mdev);
1337 drbd_send_state(mdev);
1338 }
1339
1340 /* We want to pause/continue resync, tell peer. */
1341 if (ns.conn >= C_CONNECTED &&
1342 ((os.aftr_isp != ns.aftr_isp) ||
1343 (os.user_isp != ns.user_isp)))
1344 drbd_send_state(mdev);
1345
1346 /* In case one of the isp bits got set, suspend other devices. */
1347 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1348 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1349 suspend_other_sg(mdev);
1350
1351 /* Make sure the peer gets informed about eventual state
1352 changes (ISP bits) while we were in WFReportParams. */
1353 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1354 drbd_send_state(mdev);
1355
1356 /* We are in the progress to start a full sync... */
1357 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1358 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1359 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1360
1361 /* We are invalidating our self... */
1362 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1363 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1364 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1365
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001366 /* first half of local IO error */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001367 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001368 enum drbd_io_error_p eh = EP_PASS_ON;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001369
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001370 if (drbd_send_state(mdev))
1371 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1372 else
1373 dev_err(DEV, "Sending state for drbd_io_error() failed\n");
1374
1375 drbd_rs_cancel_all(mdev);
1376
Philipp Reisnerb411b362009-09-25 16:07:19 -07001377 if (get_ldev_if_state(mdev, D_FAILED)) {
1378 eh = mdev->ldev->dc.on_io_error;
1379 put_ldev(mdev);
1380 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001381 if (eh == EP_CALL_HELPER)
1382 drbd_khelper(mdev, "local-io-error");
1383 }
1384
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001385
1386 /* second half of local IO error handling,
1387 * after local_cnt references have reached zero: */
1388 if (os.disk == D_FAILED && ns.disk == D_DISKLESS) {
1389 mdev->rs_total = 0;
1390 mdev->rs_failed = 0;
1391 atomic_set(&mdev->rs_pending_cnt, 0);
1392 }
1393
Philipp Reisnerb411b362009-09-25 16:07:19 -07001394 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
Lars Ellenberg9d282872010-10-14 13:57:07 +02001395 /* We must still be diskless,
1396 * re-attach has to be serialized with this! */
1397 if (mdev->state.disk != D_DISKLESS)
1398 dev_err(DEV,
1399 "ASSERT FAILED: disk is %s while going diskless\n",
1400 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001401
Lars Ellenberg9d282872010-10-14 13:57:07 +02001402 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state
1403 * will inc/dec it frequently. Since we became D_DISKLESS, no
1404 * one has touched the protected members anymore, though, so we
1405 * are safe to free them here. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001406 if (drbd_send_state(mdev))
1407 dev_warn(DEV, "Notified peer that I detached my disk.\n");
1408 else
1409 dev_err(DEV, "Sending state for detach failed\n");
1410
Philipp Reisnerb411b362009-09-25 16:07:19 -07001411 lc_destroy(mdev->resync);
1412 mdev->resync = NULL;
1413 lc_destroy(mdev->act_log);
1414 mdev->act_log = NULL;
1415 __no_warn(local,
1416 drbd_free_bc(mdev->ldev);
1417 mdev->ldev = NULL;);
1418
Lars Ellenbergf65363c2010-09-14 20:14:09 +02001419 if (mdev->md_io_tmpp) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001420 __free_page(mdev->md_io_tmpp);
Lars Ellenbergf65363c2010-09-14 20:14:09 +02001421 mdev->md_io_tmpp = NULL;
1422 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001423 }
1424
1425 /* Disks got bigger while they were detached */
1426 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1427 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1428 if (ns.conn == C_CONNECTED)
1429 resync_after_online_grow(mdev);
1430 }
1431
1432 /* A resync finished or aborted, wake paused devices... */
1433 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1434 (os.peer_isp && !ns.peer_isp) ||
1435 (os.user_isp && !ns.user_isp))
1436 resume_next_sg(mdev);
1437
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001438 /* sync target done with resync. Explicitly notify peer, even though
1439 * it should (at least for non-empty resyncs) already know itself. */
1440 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1441 drbd_send_state(mdev);
1442
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001443 /* free tl_hash if we Got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001444 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001445 drbd_free_tl_hash(mdev);
1446
Philipp Reisnerb411b362009-09-25 16:07:19 -07001447 /* Upon network connection, we need to start the receiver */
1448 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1449 drbd_thread_start(&mdev->receiver);
1450
1451 /* Terminate worker thread if we are unconfigured - it will be
1452 restarted as needed... */
1453 if (ns.disk == D_DISKLESS &&
1454 ns.conn == C_STANDALONE &&
1455 ns.role == R_SECONDARY) {
1456 if (os.aftr_isp != ns.aftr_isp)
1457 resume_next_sg(mdev);
1458 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1459 if (test_bit(DEVICE_DYING, &mdev->flags))
1460 drbd_thread_stop_nowait(&mdev->worker);
1461 }
1462
1463 drbd_md_sync(mdev);
1464}
1465
1466
1467static int drbd_thread_setup(void *arg)
1468{
1469 struct drbd_thread *thi = (struct drbd_thread *) arg;
1470 struct drbd_conf *mdev = thi->mdev;
1471 unsigned long flags;
1472 int retval;
1473
1474restart:
1475 retval = thi->function(thi);
1476
1477 spin_lock_irqsave(&thi->t_lock, flags);
1478
1479 /* if the receiver has been "Exiting", the last thing it did
1480 * was set the conn state to "StandAlone",
1481 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1482 * and receiver thread will be "started".
1483 * drbd_thread_start needs to set "Restarting" in that case.
1484 * t_state check and assignment needs to be within the same spinlock,
1485 * so either thread_start sees Exiting, and can remap to Restarting,
1486 * or thread_start see None, and can proceed as normal.
1487 */
1488
1489 if (thi->t_state == Restarting) {
1490 dev_info(DEV, "Restarting %s\n", current->comm);
1491 thi->t_state = Running;
1492 spin_unlock_irqrestore(&thi->t_lock, flags);
1493 goto restart;
1494 }
1495
1496 thi->task = NULL;
1497 thi->t_state = None;
1498 smp_mb();
1499 complete(&thi->stop);
1500 spin_unlock_irqrestore(&thi->t_lock, flags);
1501
1502 dev_info(DEV, "Terminating %s\n", current->comm);
1503
1504 /* Release mod reference taken when thread was started */
1505 module_put(THIS_MODULE);
1506 return retval;
1507}
1508
1509static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1510 int (*func) (struct drbd_thread *))
1511{
1512 spin_lock_init(&thi->t_lock);
1513 thi->task = NULL;
1514 thi->t_state = None;
1515 thi->function = func;
1516 thi->mdev = mdev;
1517}
1518
1519int drbd_thread_start(struct drbd_thread *thi)
1520{
1521 struct drbd_conf *mdev = thi->mdev;
1522 struct task_struct *nt;
1523 unsigned long flags;
1524
1525 const char *me =
1526 thi == &mdev->receiver ? "receiver" :
1527 thi == &mdev->asender ? "asender" :
1528 thi == &mdev->worker ? "worker" : "NONSENSE";
1529
1530 /* is used from state engine doing drbd_thread_stop_nowait,
1531 * while holding the req lock irqsave */
1532 spin_lock_irqsave(&thi->t_lock, flags);
1533
1534 switch (thi->t_state) {
1535 case None:
1536 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1537 me, current->comm, current->pid);
1538
1539 /* Get ref on module for thread - this is released when thread exits */
1540 if (!try_module_get(THIS_MODULE)) {
1541 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1542 spin_unlock_irqrestore(&thi->t_lock, flags);
1543 return FALSE;
1544 }
1545
1546 init_completion(&thi->stop);
1547 D_ASSERT(thi->task == NULL);
1548 thi->reset_cpu_mask = 1;
1549 thi->t_state = Running;
1550 spin_unlock_irqrestore(&thi->t_lock, flags);
1551 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1552
1553 nt = kthread_create(drbd_thread_setup, (void *) thi,
1554 "drbd%d_%s", mdev_to_minor(mdev), me);
1555
1556 if (IS_ERR(nt)) {
1557 dev_err(DEV, "Couldn't start thread\n");
1558
1559 module_put(THIS_MODULE);
1560 return FALSE;
1561 }
1562 spin_lock_irqsave(&thi->t_lock, flags);
1563 thi->task = nt;
1564 thi->t_state = Running;
1565 spin_unlock_irqrestore(&thi->t_lock, flags);
1566 wake_up_process(nt);
1567 break;
1568 case Exiting:
1569 thi->t_state = Restarting;
1570 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1571 me, current->comm, current->pid);
1572 /* fall through */
1573 case Running:
1574 case Restarting:
1575 default:
1576 spin_unlock_irqrestore(&thi->t_lock, flags);
1577 break;
1578 }
1579
1580 return TRUE;
1581}
1582
1583
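/**
 * _drbd_thread_stop() - Ask a drbd thread to terminate or restart
 * @thi:	drbd_thread object to stop.
 * @restart:	request a restart (Restarting) instead of a plain exit (Exiting).
 * @wait:	block until the thread has actually terminated.
 *
 * If the thread is not running and @restart is set, it is simply started.
 * Otherwise the thread is signalled with DRBD_SIGKILL (unless called from
 * the thread itself) and, with @wait, we wait on thi->stop for completion.
 */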
void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
{
1586 unsigned long flags;
1587
1588 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1589
1590 /* may be called from state engine, holding the req lock irqsave */
1591 spin_lock_irqsave(&thi->t_lock, flags);
1592
1593 if (thi->t_state == None) {
1594 spin_unlock_irqrestore(&thi->t_lock, flags);
1595 if (restart)
1596 drbd_thread_start(thi);
1597 return;
1598 }
1599
1600 if (thi->t_state != ns) {
1601 if (thi->task == NULL) {
1602 spin_unlock_irqrestore(&thi->t_lock, flags);
1603 return;
1604 }
1605
1606 thi->t_state = ns;
1607 smp_mb();
1608 init_completion(&thi->stop);
1609 if (thi->task != current)
1610 force_sig(DRBD_SIGKILL, thi->task);
1611
1612 }
1613
1614 spin_unlock_irqrestore(&thi->t_lock, flags);
1615
1616 if (wait)
1617 wait_for_completion(&thi->stop);
1618}
1619
1620#ifdef CONFIG_SMP
1621/**
1622 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1623 * @mdev: DRBD device.
1624 *
 * Forces all threads of a device onto the same CPU. This is beneficial for
 * DRBD's performance. May be overridden by the user's configuration.
1627 */
1628void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1629{
1630 int ord, cpu;
1631
1632 /* user override. */
1633 if (cpumask_weight(mdev->cpu_mask))
1634 return;
1635
1636 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1637 for_each_online_cpu(cpu) {
1638 if (ord-- == 0) {
1639 cpumask_set_cpu(cpu, mdev->cpu_mask);
1640 return;
1641 }
1642 }
1643 /* should not be reached */
1644 cpumask_setall(mdev->cpu_mask);
1645}
1646
1647/**
1648 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1649 * @mdev: DRBD device.
1650 *
 * Call this from the "main loop" of _all_ threads; no mutex is needed,
 * since current won't die prematurely.
 */
1654void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1655{
1656 struct task_struct *p = current;
1657 struct drbd_thread *thi =
1658 p == mdev->asender.task ? &mdev->asender :
1659 p == mdev->receiver.task ? &mdev->receiver :
1660 p == mdev->worker.task ? &mdev->worker :
1661 NULL;
1662 ERR_IF(thi == NULL)
1663 return;
1664 if (!thi->reset_cpu_mask)
1665 return;
1666 thi->reset_cpu_mask = 0;
1667 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1668}
1669#endif
1670
1671/* the appropriate socket mutex must be held already */
int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
		   enum drbd_packets cmd, struct p_header80 *h,
		   size_t size, unsigned msg_flags)
{
1676 int sent, ok;
1677
1678 ERR_IF(!h) return FALSE;
1679 ERR_IF(!size) return FALSE;
1680
1681 h->magic = BE_DRBD_MAGIC;
1682 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001683 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001684
Philipp Reisnerb411b362009-09-25 16:07:19 -07001685 sent = drbd_send(mdev, sock, h, size, msg_flags);
1686
1687 ok = (sent == size);
1688 if (!ok)
1689 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1690 cmdname(cmd), (int)size, sent);
1691 return ok;
1692}
1693
1694/* don't pass the socket. we may only look at it
1695 * when we hold the appropriate socket mutex.
1696 */
int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
		  enum drbd_packets cmd, struct p_header80 *h, size_t size)
{
1700 int ok = 0;
1701 struct socket *sock;
1702
1703 if (use_data_socket) {
1704 mutex_lock(&mdev->data.mutex);
1705 sock = mdev->data.socket;
1706 } else {
1707 mutex_lock(&mdev->meta.mutex);
1708 sock = mdev->meta.socket;
1709 }
1710
1711 /* drbd_disconnect() could have called drbd_free_sock()
1712 * while we were waiting in down()... */
1713 if (likely(sock != NULL))
1714 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1715
1716 if (use_data_socket)
1717 mutex_unlock(&mdev->data.mutex);
1718 else
1719 mutex_unlock(&mdev->meta.mutex);
1720 return ok;
1721}
1722
1723int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1724 size_t size)
1725{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001726 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001727 int ok;
1728
1729 h.magic = BE_DRBD_MAGIC;
1730 h.command = cpu_to_be16(cmd);
1731 h.length = cpu_to_be16(size);
1732
1733 if (!drbd_get_data_sock(mdev))
1734 return 0;
1735
Philipp Reisnerb411b362009-09-25 16:07:19 -07001736 ok = (sizeof(h) ==
1737 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1738 ok = ok && (size ==
1739 drbd_send(mdev, mdev->data.socket, data, size, 0));
1740
1741 drbd_put_data_sock(mdev);
1742
1743 return ok;
1744}
1745
1746int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1747{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001748 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001749 struct socket *sock;
1750 int size, rv;
1751 const int apv = mdev->agreed_pro_version;
1752
1753 size = apv <= 87 ? sizeof(struct p_rs_param)
1754 : apv == 88 ? sizeof(struct p_rs_param)
1755 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001756 : apv <= 94 ? sizeof(struct p_rs_param_89)
1757 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001758
1759 /* used from admin command context and receiver/worker context.
1760 * to avoid kmalloc, grab the socket right here,
1761 * then use the pre-allocated sbuf there */
1762 mutex_lock(&mdev->data.mutex);
1763 sock = mdev->data.socket;
1764
1765 if (likely(sock != NULL)) {
1766 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1767
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001768 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001769
1770 /* initialize verify_alg and csums_alg */
1771 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1772
1773 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001774 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1775 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1776 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1777 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001778
1779 if (apv >= 88)
1780 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1781 if (apv >= 89)
1782 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1783
1784 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1785 } else
1786 rv = 0; /* not ok */
1787
1788 mutex_unlock(&mdev->data.mutex);
1789
1790 return rv;
1791}
1792
1793int drbd_send_protocol(struct drbd_conf *mdev)
1794{
1795 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001796 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001797
1798 size = sizeof(struct p_protocol);
1799
1800 if (mdev->agreed_pro_version >= 87)
1801 size += strlen(mdev->net_conf->integrity_alg) + 1;
1802
1803 /* we must not recurse into our own queue,
1804 * as that is blocked during handshake */
1805 p = kmalloc(size, GFP_NOIO);
1806 if (p == NULL)
1807 return 0;
1808
1809 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1810 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1811 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1812 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001813 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1814
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001815 cf = 0;
1816 if (mdev->net_conf->want_lose)
1817 cf |= CF_WANT_LOSE;
1818 if (mdev->net_conf->dry_run) {
1819 if (mdev->agreed_pro_version >= 92)
1820 cf |= CF_DRY_RUN;
1821 else {
1822 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001823 kfree(p);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001824 return 0;
1825 }
1826 }
1827 p->conn_flags = cpu_to_be32(cf);
1828
Philipp Reisnerb411b362009-09-25 16:07:19 -07001829 if (mdev->agreed_pro_version >= 87)
1830 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1831
1832 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001833 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001834 kfree(p);
1835 return rv;
1836}
1837
1838int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1839{
1840 struct p_uuids p;
1841 int i;
1842
1843 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1844 return 1;
1845
1846 for (i = UI_CURRENT; i < UI_SIZE; i++)
1847 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1848
1849 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1850 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1851 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1852 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1853 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1854 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1855
1856 put_ldev(mdev);
1857
1858 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001859 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001860}
1861
1862int drbd_send_uuids(struct drbd_conf *mdev)
1863{
1864 return _drbd_send_uuids(mdev, 0);
1865}
1866
1867int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1868{
1869 return _drbd_send_uuids(mdev, 8);
1870}
1871
1872
1873int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1874{
1875 struct p_rs_uuid p;
1876
1877 p.uuid = cpu_to_be64(val);
1878
1879 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001880 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001881}
1882
Philipp Reisnere89b5912010-03-24 17:11:33 +01001883int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001884{
1885 struct p_sizes p;
1886 sector_t d_size, u_size;
1887 int q_order_type;
1888 int ok;
1889
1890 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1891 D_ASSERT(mdev->ldev->backing_bdev);
1892 d_size = drbd_get_max_capacity(mdev->ldev);
1893 u_size = mdev->ldev->dc.disk_size;
1894 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001895 put_ldev(mdev);
1896 } else {
1897 d_size = 0;
1898 u_size = 0;
1899 q_order_type = QUEUE_ORDERED_NONE;
1900 }
1901
1902 p.d_size = cpu_to_be64(d_size);
1903 p.u_size = cpu_to_be64(u_size);
1904 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1905 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
Philipp Reisnere89b5912010-03-24 17:11:33 +01001906 p.queue_order_type = cpu_to_be16(q_order_type);
1907 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001908
1909 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001910 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001911 return ok;
1912}
1913
1914/**
1915 * drbd_send_state() - Sends the drbd state to the peer
1916 * @mdev: DRBD device.
1917 */
1918int drbd_send_state(struct drbd_conf *mdev)
1919{
1920 struct socket *sock;
1921 struct p_state p;
1922 int ok = 0;
1923
	/* Grab the state lock so we won't send state if we're in the middle
	 * of a cluster-wide state change on another thread */
1926 drbd_state_lock(mdev);
1927
1928 mutex_lock(&mdev->data.mutex);
1929
1930 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1931 sock = mdev->data.socket;
1932
1933 if (likely(sock != NULL)) {
1934 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001935 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001936 }
1937
1938 mutex_unlock(&mdev->data.mutex);
1939
1940 drbd_state_unlock(mdev);
1941 return ok;
1942}
1943
1944int drbd_send_state_req(struct drbd_conf *mdev,
1945 union drbd_state mask, union drbd_state val)
1946{
1947 struct p_req_state p;
1948
1949 p.mask = cpu_to_be32(mask.i);
1950 p.val = cpu_to_be32(val.i);
1951
1952 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001953 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001954}
1955
1956int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1957{
1958 struct p_req_state_reply p;
1959
1960 p.retcode = cpu_to_be32(retcode);
1961
1962 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001963 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001964}
1965
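/*
 * fill_bitmap_rle_bits() - RLE+VLI encode part of the bitmap into @p
 *
 * Returns the number of code bytes written into p->code,
 * 0 if RLE is not to be used or this chunk did not compress
 * (the caller then falls back to sending plain bitmap words),
 * or -1 if a zero run length was encountered, i.e. the bitmap
 * changed while being scanned.
 */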
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
			 struct p_compressed_bm *p,
			 struct bm_xfer_ctx *c)
{
1970 struct bitstream bs;
1971 unsigned long plain_bits;
1972 unsigned long tmp;
1973 unsigned long rl;
1974 unsigned len;
1975 unsigned toggle;
1976 int bits;
1977
1978 /* may we use this feature? */
1979 if ((mdev->sync_conf.use_rle == 0) ||
1980 (mdev->agreed_pro_version < 90))
1981 return 0;
1982
1983 if (c->bit_offset >= c->bm_bits)
1984 return 0; /* nothing to do. */
1985
	/* use at most this many bytes */
1987 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1988 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1989 /* plain bits covered in this code string */
1990 plain_bits = 0;
1991
1992 /* p->encoding & 0x80 stores whether the first run length is set.
1993 * bit offset is implicit.
1994 * start with toggle == 2 to be able to tell the first iteration */
1995 toggle = 2;
1996
	/* see how many plain bits we can stuff into one packet
	 * using RLE and VLI. */
1999 do {
2000 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2001 : _drbd_bm_find_next(mdev, c->bit_offset);
2002 if (tmp == -1UL)
2003 tmp = c->bm_bits;
2004 rl = tmp - c->bit_offset;
2005
2006 if (toggle == 2) { /* first iteration */
2007 if (rl == 0) {
2008 /* the first checked bit was set,
2009 * store start value, */
2010 DCBP_set_start(p, 1);
2011 /* but skip encoding of zero run length */
2012 toggle = !toggle;
2013 continue;
2014 }
2015 DCBP_set_start(p, 0);
2016 }
2017
2018 /* paranoia: catch zero runlength.
2019 * can only happen if bitmap is modified while we scan it. */
2020 if (rl == 0) {
2021 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2022 "t:%u bo:%lu\n", toggle, c->bit_offset);
2023 return -1;
2024 }
2025
2026 bits = vli_encode_bits(&bs, rl);
2027 if (bits == -ENOBUFS) /* buffer full */
2028 break;
2029 if (bits <= 0) {
2030 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2031 return 0;
2032 }
2033
2034 toggle = !toggle;
2035 plain_bits += rl;
2036 c->bit_offset = tmp;
2037 } while (c->bit_offset < c->bm_bits);
2038
2039 len = bs.cur.b - p->code + !!bs.cur.bit;
2040
2041 if (plain_bits < (len << 3)) {
2042 /* incompressible with this method.
2043 * we need to rewind both word and bit position. */
2044 c->bit_offset -= plain_bits;
2045 bm_xfer_ctx_bit_to_word_offset(c);
2046 c->bit_offset = c->word_offset * BITS_PER_LONG;
2047 return 0;
2048 }
2049
2050 /* RLE + VLI was able to compress it just fine.
2051 * update c->word_offset. */
2052 bm_xfer_ctx_bit_to_word_offset(c);
2053
2054 /* store pad_bits */
2055 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2056
2057 return len;
2058}
2059
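/*
 * send_bitmap_rle_or_plain() - send one chunk of the bitmap, compressed if possible
 *
 * Tries the RLE+VLI compressed encoding first and falls back to a buffer
 * full of plain bitmap words. Returns OK while there is more to send,
 * DONE when the whole bitmap has been transferred, FAILED on send error.
 */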
enum { OK, FAILED, DONE }
send_bitmap_rle_or_plain(struct drbd_conf *mdev,
	struct p_header80 *h, struct bm_xfer_ctx *c)
{
2064 struct p_compressed_bm *p = (void*)h;
2065 unsigned long num_words;
2066 int len;
2067 int ok;
2068
2069 len = fill_bitmap_rle_bits(mdev, p, c);
2070
2071 if (len < 0)
2072 return FAILED;
2073
2074 if (len) {
2075 DCBP_set_code(p, RLE_VLI_Bits);
2076 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2077 sizeof(*p) + len, 0);
2078
2079 c->packets[0]++;
2080 c->bytes[0] += sizeof(*p) + len;
2081
2082 if (c->bit_offset >= c->bm_bits)
2083 len = 0; /* DONE */
2084 } else {
2085 /* was not compressible.
2086 * send a buffer full of plain text bits instead. */
2087 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2088 len = num_words * sizeof(long);
2089 if (len)
2090 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2091 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002092 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002093 c->word_offset += num_words;
2094 c->bit_offset = c->word_offset * BITS_PER_LONG;
2095
2096 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002097 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002098
2099 if (c->bit_offset > c->bm_bits)
2100 c->bit_offset = c->bm_bits;
2101 }
2102 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2103
2104 if (ok == DONE)
2105 INFO_bm_xfer_stats(mdev, "send", c);
2106 return ok;
2107}
2108
2109/* See the comment at receive_bitmap() */
2110int _drbd_send_bitmap(struct drbd_conf *mdev)
2111{
2112 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002113 struct p_header80 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002114 int ret;
2115
2116 ERR_IF(!mdev->bitmap) return FALSE;
2117
2118 /* maybe we should use some per thread scratch page,
2119 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002120 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002121 if (!p) {
2122 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2123 return FALSE;
2124 }
2125
2126 if (get_ldev(mdev)) {
2127 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2128 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2129 drbd_bm_set_all(mdev);
2130 if (drbd_bm_write(mdev)) {
2131 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2132 * but otherwise process as per normal - need to tell other
2133 * side that a full resync is required! */
2134 dev_err(DEV, "Failed to write bitmap to disk!\n");
2135 } else {
2136 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2137 drbd_md_sync(mdev);
2138 }
2139 }
2140 put_ldev(mdev);
2141 }
2142
2143 c = (struct bm_xfer_ctx) {
2144 .bm_bits = drbd_bm_bits(mdev),
2145 .bm_words = drbd_bm_words(mdev),
2146 };
2147
2148 do {
2149 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2150 } while (ret == OK);
2151
2152 free_page((unsigned long) p);
2153 return (ret == DONE);
2154}
2155
2156int drbd_send_bitmap(struct drbd_conf *mdev)
2157{
2158 int err;
2159
2160 if (!drbd_get_data_sock(mdev))
2161 return -1;
2162 err = !_drbd_send_bitmap(mdev);
2163 drbd_put_data_sock(mdev);
2164 return err;
2165}
2166
2167int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2168{
2169 int ok;
2170 struct p_barrier_ack p;
2171
2172 p.barrier = barrier_nr;
2173 p.set_size = cpu_to_be32(set_size);
2174
2175 if (mdev->state.conn < C_CONNECTED)
2176 return FALSE;
2177 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002178 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002179 return ok;
2180}
2181
2182/**
2183 * _drbd_send_ack() - Sends an ack packet
2184 * @mdev: DRBD device.
2185 * @cmd: Packet command code.
2186 * @sector: sector, needs to be in big endian byte order
2187 * @blksize: size in byte, needs to be in big endian byte order
2188 * @block_id: Id, big endian byte order
2189 */
2190static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2191 u64 sector,
2192 u32 blksize,
2193 u64 block_id)
2194{
2195 int ok;
2196 struct p_block_ack p;
2197
2198 p.sector = sector;
2199 p.block_id = block_id;
2200 p.blksize = blksize;
2201 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2202
2203 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2204 return FALSE;
2205 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002206 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002207 return ok;
2208}
2209
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002210/* dp->sector and dp->block_id already/still in network byte order,
2211 * data_size is payload size according to dp->head,
2212 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002213int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002214 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002215{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002216 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2217 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002218 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2219 dp->block_id);
2220}
2221
2222int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2223 struct p_block_req *rp)
2224{
2225 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2226}
2227
2228/**
2229 * drbd_send_ack() - Sends an ack packet
2230 * @mdev: DRBD device.
2231 * @cmd: Packet command code.
2232 * @e: Epoch entry.
2233 */
2234int drbd_send_ack(struct drbd_conf *mdev,
2235 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2236{
2237 return _drbd_send_ack(mdev, cmd,
2238 cpu_to_be64(e->sector),
2239 cpu_to_be32(e->size),
2240 e->block_id);
2241}
2242
/* This function misuses the block_id field to signal if the blocks
 * are in sync or not. */
2245int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2246 sector_t sector, int blksize, u64 block_id)
2247{
2248 return _drbd_send_ack(mdev, cmd,
2249 cpu_to_be64(sector),
2250 cpu_to_be32(blksize),
2251 cpu_to_be64(block_id));
2252}
2253
2254int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2255 sector_t sector, int size, u64 block_id)
2256{
2257 int ok;
2258 struct p_block_req p;
2259
2260 p.sector = cpu_to_be64(sector);
2261 p.block_id = block_id;
2262 p.blksize = cpu_to_be32(size);
2263
2264 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002265 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002266 return ok;
2267}
2268
2269int drbd_send_drequest_csum(struct drbd_conf *mdev,
2270 sector_t sector, int size,
2271 void *digest, int digest_size,
2272 enum drbd_packets cmd)
2273{
2274 int ok;
2275 struct p_block_req p;
2276
2277 p.sector = cpu_to_be64(sector);
2278 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2279 p.blksize = cpu_to_be32(size);
2280
2281 p.head.magic = BE_DRBD_MAGIC;
2282 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002283 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002284
2285 mutex_lock(&mdev->data.mutex);
2286
2287 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2288 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2289
2290 mutex_unlock(&mdev->data.mutex);
2291
2292 return ok;
2293}
2294
2295int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2296{
2297 int ok;
2298 struct p_block_req p;
2299
2300 p.sector = cpu_to_be64(sector);
2301 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2302 p.blksize = cpu_to_be32(size);
2303
2304 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002305 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002306 return ok;
2307}
2308
/* called on sndtimeo
 * returns FALSE if we should retry,
 * TRUE if we think the connection is dead
 */
2313static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2314{
2315 int drop_it;
2316 /* long elapsed = (long)(jiffies - mdev->last_received); */
2317
2318 drop_it = mdev->meta.socket == sock
2319 || !mdev->asender.task
2320 || get_t_state(&mdev->asender) != Running
2321 || mdev->state.conn < C_CONNECTED;
2322
2323 if (drop_it)
2324 return TRUE;
2325
2326 drop_it = !--mdev->ko_count;
2327 if (!drop_it) {
2328 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2329 current->comm, current->pid, mdev->ko_count);
2330 request_ping(mdev);
2331 }
2332
2333 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2334}
2335
2336/* The idea of sendpage seems to be to put some kind of reference
2337 * to the page into the skb, and to hand it over to the NIC. In
2338 * this process get_page() gets called.
2339 *
2340 * As soon as the page was really sent over the network put_page()
2341 * gets called by some part of the network layer. [ NIC driver? ]
2342 *
2343 * [ get_page() / put_page() increment/decrement the count. If count
2344 * reaches 0 the page will be freed. ]
2345 *
2346 * This works nicely with pages from FSs.
2347 * But this means that in protocol A we might signal IO completion too early!
2348 *
 * In order not to corrupt data during a resync we must make sure
 * that we do not reuse our own buffer pages (EEs) too early, therefore
 * we have the net_ee list.
2352 *
2353 * XFS seems to have problems, still, it submits pages with page_count == 0!
2354 * As a workaround, we disable sendpage on pages
2355 * with page_count == 0 or PageSlab.
2356 */
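/* Two send paths below: _drbd_no_send_page() copies the page contents
 * through kmap()/drbd_send(), while _drbd_send_page() hands the page to
 * the socket's ->sendpage() (zero copy), falling back to the copying
 * variant for slab pages, pages without a reference, or when sendpage
 * is disabled via the module parameter. */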
2357static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002358 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002359{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002360 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002361 kunmap(page);
2362 if (sent == size)
2363 mdev->send_cnt += size>>9;
2364 return sent == size;
2365}
2366
2367static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002368 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002369{
2370 mm_segment_t oldfs = get_fs();
2371 int sent, ok;
2372 int len = size;
2373
2374 /* e.g. XFS meta- & log-data is in slab pages, which have a
2375 * page_count of 0 and/or have PageSlab() set.
2376 * we cannot use send_page for those, as that does get_page();
2377 * put_page(); and would cause either a VM_BUG directly, or
2378 * __page_cache_release a page that would actually still be referenced
2379 * by someone, leading to some obscure delayed Oops somewhere else. */
2380 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002381 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002382
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002383 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002384 drbd_update_congested(mdev);
2385 set_fs(KERNEL_DS);
2386 do {
2387 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2388 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002389 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002390 if (sent == -EAGAIN) {
2391 if (we_should_drop_the_connection(mdev,
2392 mdev->data.socket))
2393 break;
2394 else
2395 continue;
2396 }
2397 if (sent <= 0) {
2398 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2399 __func__, (int)size, len, sent);
2400 break;
2401 }
2402 len -= sent;
2403 offset += sent;
2404 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2405 set_fs(oldfs);
2406 clear_bit(NET_CONGESTED, &mdev->flags);
2407
2408 ok = (len == 0);
2409 if (likely(ok))
2410 mdev->send_cnt += size>>9;
2411 return ok;
2412}
2413
2414static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2415{
2416 struct bio_vec *bvec;
2417 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002418 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002419 __bio_for_each_segment(bvec, bio, i, 0) {
2420 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002421 bvec->bv_offset, bvec->bv_len,
2422 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002423 return 0;
2424 }
2425 return 1;
2426}
2427
2428static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2429{
2430 struct bio_vec *bvec;
2431 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002432 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002433 __bio_for_each_segment(bvec, bio, i, 0) {
2434 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002435 bvec->bv_offset, bvec->bv_len,
2436 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002437 return 0;
2438 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002439 return 1;
2440}
2441
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002442static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2443{
2444 struct page *page = e->pages;
2445 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002446 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002447 page_chain_for_each(page) {
2448 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002449 if (!_drbd_send_page(mdev, page, 0, l,
2450 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002451 return 0;
2452 len -= l;
2453 }
2454 return 1;
2455}
2456
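/* Map the rw flags of the master bio to the DP_* flags that go over the
 * wire. Peers with an agreed protocol version < 95 only understand
 * DP_RW_SYNC. */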
static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
{
2459 if (mdev->agreed_pro_version >= 95)
2460 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2461 (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
2462 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2463 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2464 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2465 else
2466 return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
2467}
2468
Philipp Reisnerb411b362009-09-25 16:07:19 -07002469/* Used to send write requests
2470 * R_PRIMARY -> Peer (P_DATA)
2471 */
2472int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2473{
2474 int ok = 1;
2475 struct p_data p;
2476 unsigned int dp_flags = 0;
2477 void *dgb;
2478 int dgs;
2479
2480 if (!drbd_get_data_sock(mdev))
2481 return 0;
2482
2483 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2484 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2485
Philipp Reisnerd5373382010-08-23 15:18:33 +02002486 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002487 p.head.h80.magic = BE_DRBD_MAGIC;
2488 p.head.h80.command = cpu_to_be16(P_DATA);
2489 p.head.h80.length =
2490 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2491 } else {
2492 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2493 p.head.h95.command = cpu_to_be16(P_DATA);
2494 p.head.h95.length =
2495 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2496 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002497
2498 p.sector = cpu_to_be64(req->sector);
2499 p.block_id = (unsigned long)req;
2500 p.seq_num = cpu_to_be32(req->seq_num =
2501 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002502
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002503 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2504
Philipp Reisnerb411b362009-09-25 16:07:19 -07002505 if (mdev->state.conn >= C_SYNC_SOURCE &&
2506 mdev->state.conn <= C_PAUSED_SYNC_T)
2507 dp_flags |= DP_MAY_SET_IN_SYNC;
2508
2509 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002510 set_bit(UNPLUG_REMOTE, &mdev->flags);
2511 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002512 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002513 if (ok && dgs) {
2514 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002515 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002516 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002517 }
2518 if (ok) {
2519 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2520 ok = _drbd_send_bio(mdev, req->master_bio);
2521 else
2522 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2523 }
2524
2525 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002526
Philipp Reisnerb411b362009-09-25 16:07:19 -07002527 return ok;
2528}
2529
2530/* answer packet, used to send data back for read requests:
2531 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2532 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2533 */
2534int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2535 struct drbd_epoch_entry *e)
2536{
2537 int ok;
2538 struct p_data p;
2539 void *dgb;
2540 int dgs;
2541
2542 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2543 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2544
Philipp Reisnerd5373382010-08-23 15:18:33 +02002545 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002546 p.head.h80.magic = BE_DRBD_MAGIC;
2547 p.head.h80.command = cpu_to_be16(cmd);
2548 p.head.h80.length =
2549 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2550 } else {
2551 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2552 p.head.h95.command = cpu_to_be16(cmd);
2553 p.head.h95.length =
2554 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2555 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002556
2557 p.sector = cpu_to_be64(e->sector);
2558 p.block_id = e->block_id;
2559 /* p.seq_num = 0; No sequence numbers here.. */
2560
2561 /* Only called by our kernel thread.
2562 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2563 * in response to admin command or module unload.
2564 */
2565 if (!drbd_get_data_sock(mdev))
2566 return 0;
2567
Philipp Reisner0b70a132010-08-20 13:36:10 +02002568 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002569 if (ok && dgs) {
2570 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002571 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002572 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002573 }
2574 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002575 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002576
2577 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002578
Philipp Reisnerb411b362009-09-25 16:07:19 -07002579 return ok;
2580}
2581
2582/*
2583 drbd_send distinguishes two cases:
2584
2585 Packets sent via the data socket "sock"
2586 and packets sent via the meta data socket "msock"
2587
                     sock                      msock
   -----------------+-------------------------+------------------------------
   timeout           conf.timeout / 2          conf.timeout / 2
   timeout action    send a ping via msock     Abort communication
                                               and close all sockets
*/
2594
2595/*
2596 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2597 */
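/* Returns the number of bytes actually sent; this may be less than @size
 * if sending failed (the connection is then forced to C_BROKEN_PIPE or
 * C_TIMEOUT), or -1000 if no socket was given. */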
int drbd_send(struct drbd_conf *mdev, struct socket *sock,
	      void *buf, size_t size, unsigned msg_flags)
{
2601 struct kvec iov;
2602 struct msghdr msg;
2603 int rv, sent = 0;
2604
2605 if (!sock)
2606 return -1000;
2607
2608 /* THINK if (signal_pending) return ... ? */
2609
2610 iov.iov_base = buf;
2611 iov.iov_len = size;
2612
2613 msg.msg_name = NULL;
2614 msg.msg_namelen = 0;
2615 msg.msg_control = NULL;
2616 msg.msg_controllen = 0;
2617 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2618
2619 if (sock == mdev->data.socket) {
2620 mdev->ko_count = mdev->net_conf->ko_count;
2621 drbd_update_congested(mdev);
2622 }
2623 do {
2624 /* STRANGE
2625 * tcp_sendmsg does _not_ use its size parameter at all ?
2626 *
2627 * -EAGAIN on timeout, -EINTR on signal.
2628 */
2629/* THINK
2630 * do we need to block DRBD_SIG if sock == &meta.socket ??
2631 * otherwise wake_asender() might interrupt some send_*Ack !
2632 */
2633 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2634 if (rv == -EAGAIN) {
2635 if (we_should_drop_the_connection(mdev, sock))
2636 break;
2637 else
2638 continue;
2639 }
2640 D_ASSERT(rv != 0);
2641 if (rv == -EINTR) {
2642 flush_signals(current);
2643 rv = 0;
2644 }
2645 if (rv < 0)
2646 break;
2647 sent += rv;
2648 iov.iov_base += rv;
2649 iov.iov_len -= rv;
2650 } while (sent < size);
2651
2652 if (sock == mdev->data.socket)
2653 clear_bit(NET_CONGESTED, &mdev->flags);
2654
2655 if (rv <= 0) {
2656 if (rv != -EAGAIN) {
2657 dev_err(DEV, "%s_sendmsg returned %d\n",
2658 sock == mdev->meta.socket ? "msock" : "sock",
2659 rv);
2660 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2661 } else
2662 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2663 }
2664
2665 return sent;
2666}
2667
2668static int drbd_open(struct block_device *bdev, fmode_t mode)
2669{
2670 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2671 unsigned long flags;
2672 int rv = 0;
2673
Arnd Bergmann6e9624b2010-08-07 18:25:34 +02002674 lock_kernel();
Philipp Reisnerb411b362009-09-25 16:07:19 -07002675 spin_lock_irqsave(&mdev->req_lock, flags);
2676 /* to have a stable mdev->state.role
2677 * and no race with updating open_cnt */
2678
2679 if (mdev->state.role != R_PRIMARY) {
2680 if (mode & FMODE_WRITE)
2681 rv = -EROFS;
2682 else if (!allow_oos)
2683 rv = -EMEDIUMTYPE;
2684 }
2685
2686 if (!rv)
2687 mdev->open_cnt++;
2688 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann6e9624b2010-08-07 18:25:34 +02002689 unlock_kernel();
Philipp Reisnerb411b362009-09-25 16:07:19 -07002690
2691 return rv;
2692}
2693
2694static int drbd_release(struct gendisk *gd, fmode_t mode)
2695{
2696 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann6e9624b2010-08-07 18:25:34 +02002697 lock_kernel();
Philipp Reisnerb411b362009-09-25 16:07:19 -07002698 mdev->open_cnt--;
Arnd Bergmann6e9624b2010-08-07 18:25:34 +02002699 unlock_kernel();
Philipp Reisnerb411b362009-09-25 16:07:19 -07002700 return 0;
2701}
2702
2703static void drbd_unplug_fn(struct request_queue *q)
2704{
2705 struct drbd_conf *mdev = q->queuedata;
2706
Philipp Reisnerb411b362009-09-25 16:07:19 -07002707 /* unplug FIRST */
2708 spin_lock_irq(q->queue_lock);
2709 blk_remove_plug(q);
2710 spin_unlock_irq(q->queue_lock);
2711
2712 /* only if connected */
2713 spin_lock_irq(&mdev->req_lock);
2714 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2715 D_ASSERT(mdev->state.role == R_PRIMARY);
2716 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2717 /* add to the data.work queue,
2718 * unless already queued.
2719 * XXX this might be a good addition to drbd_queue_work
2720 * anyways, to detect "double queuing" ... */
2721 if (list_empty(&mdev->unplug_work.list))
2722 drbd_queue_work(&mdev->data.work,
2723 &mdev->unplug_work);
2724 }
2725 }
2726 spin_unlock_irq(&mdev->req_lock);
2727
2728 if (mdev->state.disk >= D_INCONSISTENT)
2729 drbd_kick_lo(mdev);
2730}
2731
2732static void drbd_set_defaults(struct drbd_conf *mdev)
2733{
	/* This way we get a compile error when sync_conf grows
	   and we forget to initialize it here */
2736 mdev->sync_conf = (struct syncer_conf) {
2737 /* .rate = */ DRBD_RATE_DEF,
2738 /* .after = */ DRBD_AFTER_DEF,
2739 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002740 /* .verify_alg = */ {}, 0,
2741 /* .cpu_mask = */ {}, 0,
2742 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002743 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002744 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2745 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2746 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2747 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002748 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2749 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002750 };
2751
	/* Have to do it this way, because the layout differs between
	   big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002754 mdev->state = (union drbd_state) {
2755 { .role = R_SECONDARY,
2756 .peer = R_UNKNOWN,
2757 .conn = C_STANDALONE,
2758 .disk = D_DISKLESS,
2759 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002760 .susp = 0,
2761 .susp_nod = 0,
2762 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002763 } };
2764}
2765
2766void drbd_init_set_defaults(struct drbd_conf *mdev)
2767{
2768 /* the memset(,0,) did most of this.
2769 * note: only assignments, no allocation in here */
2770
2771 drbd_set_defaults(mdev);
2772
2773 /* for now, we do NOT yet support it,
2774 * even though we start some framework
2775 * to eventually support barriers */
2776 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2777
2778 atomic_set(&mdev->ap_bio_cnt, 0);
2779 atomic_set(&mdev->ap_pending_cnt, 0);
2780 atomic_set(&mdev->rs_pending_cnt, 0);
2781 atomic_set(&mdev->unacked_cnt, 0);
2782 atomic_set(&mdev->local_cnt, 0);
2783 atomic_set(&mdev->net_cnt, 0);
2784 atomic_set(&mdev->packet_seq, 0);
2785 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02002786 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02002787 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002788 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002789
2790 mutex_init(&mdev->md_io_mutex);
2791 mutex_init(&mdev->data.mutex);
2792 mutex_init(&mdev->meta.mutex);
2793 sema_init(&mdev->data.work.s, 0);
2794 sema_init(&mdev->meta.work.s, 0);
2795 mutex_init(&mdev->state_mutex);
2796
2797 spin_lock_init(&mdev->data.work.q_lock);
2798 spin_lock_init(&mdev->meta.work.q_lock);
2799
2800 spin_lock_init(&mdev->al_lock);
2801 spin_lock_init(&mdev->req_lock);
2802 spin_lock_init(&mdev->peer_seq_lock);
2803 spin_lock_init(&mdev->epoch_lock);
2804
2805 INIT_LIST_HEAD(&mdev->active_ee);
2806 INIT_LIST_HEAD(&mdev->sync_ee);
2807 INIT_LIST_HEAD(&mdev->done_ee);
2808 INIT_LIST_HEAD(&mdev->read_ee);
2809 INIT_LIST_HEAD(&mdev->net_ee);
2810 INIT_LIST_HEAD(&mdev->resync_reads);
2811 INIT_LIST_HEAD(&mdev->data.work.q);
2812 INIT_LIST_HEAD(&mdev->meta.work.q);
2813 INIT_LIST_HEAD(&mdev->resync_work.list);
2814 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002815 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002816 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2817 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02002818
Philipp Reisnerb411b362009-09-25 16:07:19 -07002819 mdev->resync_work.cb = w_resync_inactive;
2820 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002821 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002822 mdev->md_sync_work.cb = w_md_sync;
2823 mdev->bm_io_work.w.cb = w_bitmap_io;
2824 init_timer(&mdev->resync_timer);
2825 init_timer(&mdev->md_sync_timer);
2826 mdev->resync_timer.function = resync_timer_fn;
2827 mdev->resync_timer.data = (unsigned long) mdev;
2828 mdev->md_sync_timer.function = md_sync_timer_fn;
2829 mdev->md_sync_timer.data = (unsigned long) mdev;
2830
2831 init_waitqueue_head(&mdev->misc_wait);
2832 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02002833 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002834 init_waitqueue_head(&mdev->ee_wait);
2835 init_waitqueue_head(&mdev->al_wait);
2836 init_waitqueue_head(&mdev->seq_wait);
2837
2838 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2839 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2840 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2841
2842 mdev->agreed_pro_version = PRO_VERSION_MAX;
2843 mdev->write_ordering = WO_bio_barrier;
2844 mdev->resync_wenr = LC_FREE;
2845}
2846
2847void drbd_mdev_cleanup(struct drbd_conf *mdev)
2848{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002849 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002850 if (mdev->receiver.t_state != None)
2851 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2852 mdev->receiver.t_state);
2853
2854 /* no need to lock it, I'm the only thread alive */
2855 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2856 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2857 mdev->al_writ_cnt =
2858 mdev->bm_writ_cnt =
2859 mdev->read_cnt =
2860 mdev->recv_cnt =
2861 mdev->send_cnt =
2862 mdev->writ_cnt =
2863 mdev->p_size =
2864 mdev->rs_start =
2865 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002866 mdev->rs_failed = 0;
2867 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002868 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002869 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2870 mdev->rs_mark_left[i] = 0;
2871 mdev->rs_mark_time[i] = 0;
2872 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002873 D_ASSERT(mdev->net_conf == NULL);
2874
2875 drbd_set_my_capacity(mdev, 0);
2876 if (mdev->bitmap) {
2877 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01002878 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002879 drbd_bm_cleanup(mdev);
2880 }
2881
2882 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02002883 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002884
2885 /*
2886 * currently we drbd_init_ee only on module load, so
2887 * we may do drbd_release_ee only on module unload!
2888 */
2889 D_ASSERT(list_empty(&mdev->active_ee));
2890 D_ASSERT(list_empty(&mdev->sync_ee));
2891 D_ASSERT(list_empty(&mdev->done_ee));
2892 D_ASSERT(list_empty(&mdev->read_ee));
2893 D_ASSERT(list_empty(&mdev->net_ee));
2894 D_ASSERT(list_empty(&mdev->resync_reads));
2895 D_ASSERT(list_empty(&mdev->data.work.q));
2896 D_ASSERT(list_empty(&mdev->meta.work.q));
2897 D_ASSERT(list_empty(&mdev->resync_work.list));
2898 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002899 D_ASSERT(list_empty(&mdev->go_diskless.list));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002900
2901}
2902
2903
2904static void drbd_destroy_mempools(void)
2905{
2906 struct page *page;
2907
2908 while (drbd_pp_pool) {
2909 page = drbd_pp_pool;
2910 drbd_pp_pool = (struct page *)page_private(page);
2911 __free_page(page);
2912 drbd_pp_vacant--;
2913 }
2914
2915 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2916
2917 if (drbd_ee_mempool)
2918 mempool_destroy(drbd_ee_mempool);
2919 if (drbd_request_mempool)
2920 mempool_destroy(drbd_request_mempool);
2921 if (drbd_ee_cache)
2922 kmem_cache_destroy(drbd_ee_cache);
2923 if (drbd_request_cache)
2924 kmem_cache_destroy(drbd_request_cache);
2925 if (drbd_bm_ext_cache)
2926 kmem_cache_destroy(drbd_bm_ext_cache);
2927 if (drbd_al_ext_cache)
2928 kmem_cache_destroy(drbd_al_ext_cache);
2929
2930 drbd_ee_mempool = NULL;
2931 drbd_request_mempool = NULL;
2932 drbd_ee_cache = NULL;
2933 drbd_request_cache = NULL;
2934 drbd_bm_ext_cache = NULL;
2935 drbd_al_ext_cache = NULL;
2936
2937 return;
2938}
2939
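/* Set up the slab caches, mempools and the page pool used for DRBD's
 * requests, epoch entries and bitmap/activity-log extents. The pools are
 * sized proportionally to minor_count. Returns 0 or -ENOMEM. */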
static int drbd_create_mempools(void)
{
2942 struct page *page;
2943 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2944 int i;
2945
2946 /* prepare our caches and mempools */
2947 drbd_request_mempool = NULL;
2948 drbd_ee_cache = NULL;
2949 drbd_request_cache = NULL;
2950 drbd_bm_ext_cache = NULL;
2951 drbd_al_ext_cache = NULL;
2952 drbd_pp_pool = NULL;
2953
2954 /* caches */
2955 drbd_request_cache = kmem_cache_create(
2956 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2957 if (drbd_request_cache == NULL)
2958 goto Enomem;
2959
2960 drbd_ee_cache = kmem_cache_create(
2961 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2962 if (drbd_ee_cache == NULL)
2963 goto Enomem;
2964
2965 drbd_bm_ext_cache = kmem_cache_create(
2966 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2967 if (drbd_bm_ext_cache == NULL)
2968 goto Enomem;
2969
2970 drbd_al_ext_cache = kmem_cache_create(
2971 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2972 if (drbd_al_ext_cache == NULL)
2973 goto Enomem;
2974
2975 /* mempools */
2976 drbd_request_mempool = mempool_create(number,
2977 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2978 if (drbd_request_mempool == NULL)
2979 goto Enomem;
2980
	drbd_ee_mempool = mempool_create(number,
		mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
	if (drbd_ee_mempool == NULL)
		goto Enomem;
2985
2986 /* drbd's page pool */
2987 spin_lock_init(&drbd_pp_lock);
2988
2989 for (i = 0; i < number; i++) {
2990 page = alloc_page(GFP_HIGHUSER);
2991 if (!page)
2992 goto Enomem;
2993 set_page_private(page, (unsigned long)drbd_pp_pool);
2994 drbd_pp_pool = page;
2995 }
2996 drbd_pp_vacant = number;
2997
2998 return 0;
2999
3000Enomem:
3001 drbd_destroy_mempools(); /* in case we allocated some */
3002 return -ENOMEM;
3003}
3004
3005static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3006 void *unused)
3007{
3008 /* just so we have it. you never know what interesting things we
3009 * might want to do here some day...
3010 */
3011
3012 return NOTIFY_DONE;
3013}
3014
3015static struct notifier_block drbd_notifier = {
3016 .notifier_call = drbd_notify_sys,
3017};
3018
3019static void drbd_release_ee_lists(struct drbd_conf *mdev)
3020{
3021 int rr;
3022
3023 rr = drbd_release_ee(mdev, &mdev->active_ee);
3024 if (rr)
3025 dev_err(DEV, "%d EEs in active list found!\n", rr);
3026
3027 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3028 if (rr)
3029 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3030
3031 rr = drbd_release_ee(mdev, &mdev->read_ee);
3032 if (rr)
3033 dev_err(DEV, "%d EEs in read list found!\n", rr);
3034
3035 rr = drbd_release_ee(mdev, &mdev->done_ee);
3036 if (rr)
3037 dev_err(DEV, "%d EEs in done list found!\n", rr);
3038
3039 rr = drbd_release_ee(mdev, &mdev->net_ee);
3040 if (rr)
3041 dev_err(DEV, "%d EEs in net list found!\n", rr);
3042}
3043
3044/* caution. no locking.
3045 * currently only used from module cleanup code. */
3046static void drbd_delete_device(unsigned int minor)
3047{
3048 struct drbd_conf *mdev = minor_to_mdev(minor);
3049
3050 if (!mdev)
3051 return;
3052
3053 /* paranoia asserts */
3054 if (mdev->open_cnt != 0)
3055 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3056 __FILE__ , __LINE__);
3057
3058 ERR_IF (!list_empty(&mdev->data.work.q)) {
3059 struct list_head *lp;
3060 list_for_each(lp, &mdev->data.work.q) {
3061 dev_err(DEV, "lp = %p\n", lp);
3062 }
3063 };
3064 /* end paranoia asserts */
3065
3066 del_gendisk(mdev->vdisk);
3067
3068 /* cleanup stuff that may have been allocated during
3069 * device (re-)configuration or state changes */
3070
3071 if (mdev->this_bdev)
3072 bdput(mdev->this_bdev);
3073
3074 drbd_free_resources(mdev);
3075
3076 drbd_release_ee_lists(mdev);
3077
3078 /* should be free'd on disconnect? */
3079 kfree(mdev->ee_hash);
3080 /*
3081 mdev->ee_hash_s = 0;
3082 mdev->ee_hash = NULL;
3083 */
3084
3085 lc_destroy(mdev->act_log);
3086 lc_destroy(mdev->resync);
3087
3088 kfree(mdev->p_uuid);
3089 /* mdev->p_uuid = NULL; */
3090
3091 kfree(mdev->int_dig_out);
3092 kfree(mdev->int_dig_in);
3093 kfree(mdev->int_dig_vv);
3094
3095 /* cleanup the rest that has been
3096 * allocated from drbd_new_device
3097 * and actually free the mdev itself */
3098 drbd_free_mdev(mdev);
3099}
3100
3101static void drbd_cleanup(void)
3102{
3103 unsigned int i;
3104
3105 unregister_reboot_notifier(&drbd_notifier);
3106
3107 drbd_nl_cleanup();
3108
3109 if (minor_table) {
3110 if (drbd_proc)
3111 remove_proc_entry("drbd", NULL);
3112 i = minor_count;
3113 while (i--)
3114 drbd_delete_device(i);
3115 drbd_destroy_mempools();
3116 }
3117
3118 kfree(minor_table);
3119
3120 unregister_blkdev(DRBD_MAJOR, "drbd");
3121
3122 printk(KERN_INFO "drbd: module cleanup done.\n");
3123}
3124
3125/**
3126 * drbd_congested() - Callback for pdflush
3127 * @congested_data: User data
3128 * @bdi_bits: Bits pdflush is currently interested in
3129 *
3130 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3131 */
3132static int drbd_congested(void *congested_data, int bdi_bits)
3133{
3134 struct drbd_conf *mdev = congested_data;
3135 struct request_queue *q;
3136 char reason = '-';
3137 int r = 0;
3138
3139 if (!__inc_ap_bio_cond(mdev)) {
3140 /* DRBD has frozen IO */
3141 r = bdi_bits;
3142 reason = 'd';
3143 goto out;
3144 }
3145
3146 if (get_ldev(mdev)) {
3147 q = bdev_get_queue(mdev->ldev->backing_bdev);
3148 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3149 put_ldev(mdev);
3150 if (r)
3151 reason = 'b';
3152 }
3153
3154 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3155 r |= (1 << BDI_async_congested);
3156 reason = reason == 'b' ? 'a' : 'n';
3157 }
3158
3159out:
3160 mdev->congestion_reason = reason;
3161 return r;
3162}
3163
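/* Allocate and initialize the per-device drbd_conf for @minor, including
 * its request queue, gendisk, meta-data IO page, bitmap and transfer log.
 * Returns the new device, or NULL if any allocation fails. */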
struct drbd_conf *drbd_new_device(unsigned int minor)
{
3166 struct drbd_conf *mdev;
3167 struct gendisk *disk;
3168 struct request_queue *q;
3169
3170 /* GFP_KERNEL, we are outside of all write-out paths */
3171 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3172 if (!mdev)
3173 return NULL;
3174 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3175 goto out_no_cpumask;
3176
3177 mdev->minor = minor;
3178
3179 drbd_init_set_defaults(mdev);
3180
3181 q = blk_alloc_queue(GFP_KERNEL);
3182 if (!q)
3183 goto out_no_q;
3184 mdev->rq_queue = q;
3185 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003186
3187 disk = alloc_disk(1);
3188 if (!disk)
3189 goto out_no_disk;
3190 mdev->vdisk = disk;
3191
3192 set_disk_ro(disk, TRUE);
3193
3194 disk->queue = q;
3195 disk->major = DRBD_MAJOR;
3196 disk->first_minor = minor;
3197 disk->fops = &drbd_ops;
3198 sprintf(disk->disk_name, "drbd%d", minor);
3199 disk->private_data = mdev;
3200
3201 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3202 /* we have no partitions. we contain only ourselves. */
3203 mdev->this_bdev->bd_contains = mdev->this_bdev;
3204
3205 q->backing_dev_info.congested_fn = drbd_congested;
3206 q->backing_dev_info.congested_data = mdev;
3207
3208 blk_queue_make_request(q, drbd_make_request_26);
Lars Ellenberg98ec2862010-01-21 19:33:14 +01003209 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003210 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3211 blk_queue_merge_bvec(q, drbd_merge_bvec);
3212 q->queue_lock = &mdev->req_lock; /* needed since we use */
3213 /* plugging on a queue, that actually has no requests! */
3214 q->unplug_fn = drbd_unplug_fn;
3215
3216 mdev->md_io_page = alloc_page(GFP_KERNEL);
3217 if (!mdev->md_io_page)
3218 goto out_no_io_page;
3219
3220 if (drbd_bm_init(mdev))
3221 goto out_no_bitmap;
3222 /* no need to lock access, we are still initializing this minor device. */
3223 if (!tl_init(mdev))
3224 goto out_no_tl;
3225
3226 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3227 if (!mdev->app_reads_hash)
3228 goto out_no_app_reads;
3229
3230 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3231 if (!mdev->current_epoch)
3232 goto out_no_epoch;
3233
3234 INIT_LIST_HEAD(&mdev->current_epoch->list);
3235 mdev->epochs = 1;
3236
3237 return mdev;
3238
3239/* out_whatever_else:
3240 kfree(mdev->current_epoch); */
3241out_no_epoch:
3242 kfree(mdev->app_reads_hash);
3243out_no_app_reads:
3244 tl_cleanup(mdev);
3245out_no_tl:
3246 drbd_bm_cleanup(mdev);
3247out_no_bitmap:
3248 __free_page(mdev->md_io_page);
3249out_no_io_page:
3250 put_disk(disk);
3251out_no_disk:
3252 blk_cleanup_queue(q);
3253out_no_q:
3254 free_cpumask_var(mdev->cpu_mask);
3255out_no_cpumask:
3256 kfree(mdev);
3257 return NULL;
3258}
3259
3260/* counterpart of drbd_new_device.
3261 * last part of drbd_delete_device. */
3262void drbd_free_mdev(struct drbd_conf *mdev)
3263{
3264 kfree(mdev->current_epoch);
3265 kfree(mdev->app_reads_hash);
3266 tl_cleanup(mdev);
3267 if (mdev->bitmap) /* should no longer be there. */
3268 drbd_bm_cleanup(mdev);
3269 __free_page(mdev->md_io_page);
3270 put_disk(mdev->vdisk);
3271 blk_cleanup_queue(mdev->rq_queue);
3272 free_cpumask_var(mdev->cpu_mask);
3273 kfree(mdev);
3274}
3275
3276
3277int __init drbd_init(void)
3278{
3279 int err;
3280
3281 if (sizeof(struct p_handshake) != 80) {
3282 printk(KERN_ERR
3283 "drbd: never change the size or layout "
3284 "of the HandShake packet.\n");
3285 return -EINVAL;
3286 }
3287
3288 if (1 > minor_count || minor_count > 255) {
3289 printk(KERN_ERR
3290 "drbd: invalid minor_count (%d)\n", minor_count);
3291#ifdef MODULE
3292 return -EINVAL;
3293#else
3294 minor_count = 8;
3295#endif
3296 }
3297
3298 err = drbd_nl_init();
3299 if (err)
3300 return err;
3301
3302 err = register_blkdev(DRBD_MAJOR, "drbd");
3303 if (err) {
3304 printk(KERN_ERR
3305 "drbd: unable to register block device major %d\n",
3306 DRBD_MAJOR);
3307 return err;
3308 }
3309
3310 register_reboot_notifier(&drbd_notifier);
3311
3312 /*
3313 * allocate all necessary structs
3314 */
3315 err = -ENOMEM;
3316
3317 init_waitqueue_head(&drbd_pp_wait);
3318
3319 drbd_proc = NULL; /* play safe for drbd_cleanup */
3320 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3321 GFP_KERNEL);
3322 if (!minor_table)
3323 goto Enomem;
3324
3325 err = drbd_create_mempools();
3326 if (err)
3327 goto Enomem;
3328
3329	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3330	if (!drbd_proc) {
3331 printk(KERN_ERR "drbd: unable to register proc file\n");
3332 goto Enomem;
3333 }
3334
3335 rwlock_init(&global_state_lock);
3336
3337 printk(KERN_INFO "drbd: initialized. "
3338 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3339 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3340 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3341 printk(KERN_INFO "drbd: registered as block device major %d\n",
3342 DRBD_MAJOR);
3343 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3344
3345 return 0; /* Success! */
3346
3347Enomem:
3348 drbd_cleanup();
3349 if (err == -ENOMEM)
3350 /* currently always the case */
3351 printk(KERN_ERR "drbd: ran out of memory\n");
3352 else
3353 printk(KERN_ERR "drbd: initialization failure\n");
3354 return err;
3355}
3356
3357void drbd_free_bc(struct drbd_backing_dev *ldev)
3358{
3359 if (ldev == NULL)
3360 return;
3361
3362 bd_release(ldev->backing_bdev);
3363 bd_release(ldev->md_bdev);
3364
3365 fput(ldev->lo_file);
3366 fput(ldev->md_file);
3367
3368 kfree(ldev);
3369}
3370
3371void drbd_free_sock(struct drbd_conf *mdev)
3372{
3373 if (mdev->data.socket) {
3374		mutex_lock(&mdev->data.mutex);
3375		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3376		sock_release(mdev->data.socket);
3377		mdev->data.socket = NULL;
3378		mutex_unlock(&mdev->data.mutex);
3379	}
3380	if (mdev->meta.socket) {
3381		mutex_lock(&mdev->meta.mutex);
3382		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3383		sock_release(mdev->meta.socket);
3384		mdev->meta.socket = NULL;
3385		mutex_unlock(&mdev->meta.mutex);
3386	}
3387}
3388
3389
3390void drbd_free_resources(struct drbd_conf *mdev)
3391{
3392 crypto_free_hash(mdev->csums_tfm);
3393 mdev->csums_tfm = NULL;
3394 crypto_free_hash(mdev->verify_tfm);
3395 mdev->verify_tfm = NULL;
3396 crypto_free_hash(mdev->cram_hmac_tfm);
3397 mdev->cram_hmac_tfm = NULL;
3398 crypto_free_hash(mdev->integrity_w_tfm);
3399 mdev->integrity_w_tfm = NULL;
3400 crypto_free_hash(mdev->integrity_r_tfm);
3401 mdev->integrity_r_tfm = NULL;
3402
3403 drbd_free_sock(mdev);
3404
3405 __no_warn(local,
3406 drbd_free_bc(mdev->ldev);
3407 mdev->ldev = NULL;);
3408}
3409
3410/* meta data management */
3411
3412struct meta_data_on_disk {
3413 u64 la_size; /* last agreed size. */
3414 u64 uuid[UI_SIZE]; /* UUIDs. */
3415 u64 device_uuid;
3416 u64 reserved_u64_1;
3417 u32 flags; /* MDF */
3418 u32 magic;
3419 u32 md_size_sect;
3420 u32 al_offset; /* offset to this block */
3421 u32 al_nr_extents; /* important for restoring the AL */
3422 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3423 u32 bm_offset; /* offset to the bitmap, from here */
3424 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3425 u32 reserved_u32[4];
3426
3427} __packed;
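/*
 * Note on the layout above (derived from drbd_md_sync()/drbd_md_read()
 * below): the structure is smaller than the 512 byte sector it is written
 * into; drbd_md_sync() zeroes the whole sector first, so the reserved
 * fields and the unused tail of the sector stay zero on disk. All
 * multi-byte fields are stored big-endian (cpu_to_be*() on write,
 * be*_to_cpu() on read).
 */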
3428
3429/**
3430 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3431 * @mdev: DRBD device.
3432 */
3433void drbd_md_sync(struct drbd_conf *mdev)
3434{
3435 struct meta_data_on_disk *buffer;
3436 sector_t sector;
3437 int i;
3438
3439	del_timer(&mdev->md_sync_timer);
3440	/* timer may be rearmed by drbd_md_mark_dirty() now. */
3441	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3442		return;
3443
3444	/* We use D_FAILED here, not D_ATTACHING, because we try to write
3445	 * metadata even if we detach due to a disk failure! */
3446	if (!get_ldev_if_state(mdev, D_FAILED))
3447		return;
3448
3449	mutex_lock(&mdev->md_io_mutex);
3450 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3451 memset(buffer, 0, 512);
3452
3453 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3454 for (i = UI_CURRENT; i < UI_SIZE; i++)
3455 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3456 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3457 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3458
3459 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3460 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3461 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3462 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3463 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3464
3465 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3466
3467 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3468 sector = mdev->ldev->md.md_offset;
3469
3470	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3471		/* this was a try anyways ... */
3472		dev_err(DEV, "meta data update failed!\n");
3473		drbd_chk_io_error(mdev, 1, TRUE);
3474 }
3475
3476 /* Update mdev->ldev->md.la_size_sect,
3477 * since we updated it on metadata. */
3478 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3479
3480 mutex_unlock(&mdev->md_io_mutex);
3481 put_ldev(mdev);
3482}
3483
3484/**
3485 * drbd_md_read() - Reads in the meta data super block
3486 * @mdev: DRBD device.
3487 * @bdev: Device from which the meta data should be read in.
3488 *
3489 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3490 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3491 */
3492int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3493{
3494 struct meta_data_on_disk *buffer;
3495 int i, rv = NO_ERROR;
3496
3497 if (!get_ldev_if_state(mdev, D_ATTACHING))
3498 return ERR_IO_MD_DISK;
3499
3500	mutex_lock(&mdev->md_io_mutex);
3501 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3502
3503 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3504		/* NOTE: can't do normal error processing here as this is
3505		   called BEFORE disk is attached */
3506 dev_err(DEV, "Error while reading metadata.\n");
3507 rv = ERR_IO_MD_DISK;
3508 goto err;
3509 }
3510
3511 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3512 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3513 rv = ERR_MD_INVALID;
3514 goto err;
3515 }
3516 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3517 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3518 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3519 rv = ERR_MD_INVALID;
3520 goto err;
3521 }
3522 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3523 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3524 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3525 rv = ERR_MD_INVALID;
3526 goto err;
3527 }
3528 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3529 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3530 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3531 rv = ERR_MD_INVALID;
3532 goto err;
3533 }
3534
3535 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3536 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3537 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3538 rv = ERR_MD_INVALID;
3539 goto err;
3540 }
3541
3542 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3543 for (i = UI_CURRENT; i < UI_SIZE; i++)
3544 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3545 bdev->md.flags = be32_to_cpu(buffer->flags);
3546 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3547 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3548
3549 if (mdev->sync_conf.al_extents < 7)
3550 mdev->sync_conf.al_extents = 127;
3551
3552 err:
3553 mutex_unlock(&mdev->md_io_mutex);
3554 put_ldev(mdev);
3555
3556 return rv;
3557}
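/*
 * Illustrative usage sketch, not part of the driver: the attach path is
 * expected to read the meta data before bringing the disk up, e.g.
 *
 *	retcode = drbd_md_read(mdev, nbc);	// 'nbc' is an assumed name for
 *	if (retcode != NO_ERROR)		//  the new backing device config
 *		goto fail;			//  and 'fail' a made-up label
 */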
3558
3559static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3560{
3561 static char *uuid_str[UI_EXTENDED_SIZE] = {
3562 [UI_CURRENT] = "CURRENT",
3563 [UI_BITMAP] = "BITMAP",
3564 [UI_HISTORY_START] = "HISTORY_START",
3565 [UI_HISTORY_END] = "HISTORY_END",
3566 [UI_SIZE] = "SIZE",
3567 [UI_FLAGS] = "FLAGS",
3568 };
3569
3570 if (index >= UI_EXTENDED_SIZE) {
3571 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3572 return;
3573 }
3574
3575 dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3576 uuid_str[index],
3577 (unsigned long long)mdev->ldev->md.uuid[index]);
3578}
3579
3580
3581/**
3582 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3583 * @mdev: DRBD device.
3584 *
3585 * Call this function if you change anything that should be written to
3586 * the meta-data super block. This function sets MD_DIRTY, and starts a
3587 * timer that ensures that within five seconds you have to call drbd_md_sync().
3588 */
3589#ifdef DEBUG
3590void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3591{
3592	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3593		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3594		mdev->last_md_mark_dirty.line = line;
3595		mdev->last_md_mark_dirty.func = func;
3596	}
3597}
3598#else
3599void drbd_md_mark_dirty(struct drbd_conf *mdev)
3600{
3601	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3602		mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3603}
3604#endif
3605
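/*
 * Illustrative usage sketch, not part of the driver: updates of on-disk
 * meta data state follow the pattern
 *
 *	mdev->ldev->md.flags |= MDF_SOME_FLAG;	// MDF_SOME_FLAG is made up
 *	drbd_md_mark_dirty(mdev);		// arms md_sync_timer
 *	...
 *	drbd_md_sync(mdev);	// or let md_sync_timer_fn()/w_md_sync() do it
 */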
3606static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3607{
3608 int i;
3609
3610	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3611		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3612		debug_drbd_uuid(mdev, i+1);
3613	}
3614}
3615
3616void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3617{
3618 if (idx == UI_CURRENT) {
3619 if (mdev->state.role == R_PRIMARY)
3620 val |= 1;
3621 else
3622 val &= ~((u64)1);
3623
3624 drbd_set_ed_uuid(mdev, val);
3625 }
3626
3627 mdev->ldev->md.uuid[idx] = val;
3628	debug_drbd_uuid(mdev, idx);
3629	drbd_md_mark_dirty(mdev);
3630}
3631
3632
3633void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3634{
3635 if (mdev->ldev->md.uuid[idx]) {
3636 drbd_uuid_move_history(mdev);
3637 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3638		debug_drbd_uuid(mdev, UI_HISTORY_START);
3639	}
3640 _drbd_uuid_set(mdev, idx, val);
3641}
3642
3643/**
3644 * drbd_uuid_new_current() - Creates a new current UUID
3645 * @mdev: DRBD device.
3646 *
3647 * Creates a new current UUID, and rotates the old current UUID into
3648 * the bitmap slot. Causes an incremental resync upon next connect.
3649 */
3650void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3651{
3652 u64 val;
3653
3654 dev_info(DEV, "Creating new current UUID\n");
3655 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3656 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3657	debug_drbd_uuid(mdev, UI_BITMAP);
3658
3659 get_random_bytes(&val, sizeof(u64));
3660 _drbd_uuid_set(mdev, UI_CURRENT, val);
3661	/* get it to stable storage _now_ */
3662	drbd_md_sync(mdev);
3663}
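/*
 * Illustrative sketch of the rotation above, with made-up values:
 * before the call
 *	UI_CURRENT = 0xAAAA....AAAA, UI_BITMAP = 0
 * and afterwards
 *	UI_CURRENT = <fresh random value>, UI_BITMAP = 0xAAAA....AAAA
 * so that on the next connect a peer still carrying 0xAAAA....AAAA as its
 * current UUID can be brought up to date with an incremental resync.
 */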
3664
3665void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3666{
3667 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3668 return;
3669
3670 if (val == 0) {
3671 drbd_uuid_move_history(mdev);
3672 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3673 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3674		debug_drbd_uuid(mdev, UI_HISTORY_START);
3675		debug_drbd_uuid(mdev, UI_BITMAP);
3676	} else {
3677 if (mdev->ldev->md.uuid[UI_BITMAP])
3678 dev_warn(DEV, "bm UUID already set");
3679
3680 mdev->ldev->md.uuid[UI_BITMAP] = val;
3681 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3682
3683		debug_drbd_uuid(mdev, UI_BITMAP);
3684	}
3685 drbd_md_mark_dirty(mdev);
3686}
3687
3688/**
3689 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3690 * @mdev: DRBD device.
3691 *
3692 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3693 */
3694int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3695{
3696 int rv = -EIO;
3697
3698 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3699 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3700 drbd_md_sync(mdev);
3701 drbd_bm_set_all(mdev);
3702
3703 rv = drbd_bm_write(mdev);
3704
3705 if (!rv) {
3706 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3707 drbd_md_sync(mdev);
3708 }
3709
3710 put_ldev(mdev);
3711 }
3712
3713 return rv;
3714}
3715
3716/**
3717 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3718 * @mdev: DRBD device.
3719 *
3720 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3721 */
3722int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3723{
3724 int rv = -EIO;
3725
3726	drbd_resume_al(mdev);
3727	if (get_ldev_if_state(mdev, D_ATTACHING)) {
3728 drbd_bm_clear_all(mdev);
3729 rv = drbd_bm_write(mdev);
3730 put_ldev(mdev);
3731 }
3732
3733 return rv;
3734}
3735
3736static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3737{
3738 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3739 int rv;
3740
3741 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3742
3743 drbd_bm_lock(mdev, work->why);
3744 rv = work->io_fn(mdev);
3745 drbd_bm_unlock(mdev);
3746
3747 clear_bit(BITMAP_IO, &mdev->flags);
3748 wake_up(&mdev->misc_wait);
3749
3750 if (work->done)
3751 work->done(mdev, rv);
3752
3753 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3754 work->why = NULL;
3755
3756 return 1;
3757}
3758
3759static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3760{
3761	D_ASSERT(mdev->state.disk == D_FAILED);
3762	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3763	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3764	 * the protected members anymore, though, so in the after_state_ch work
3765	 * it will be safe to free them. */
3766	drbd_force_state(mdev, NS(disk, D_DISKLESS));
3767	/* We need to wait for return of references checked out while we still
3768	 * have been D_FAILED, though (drbd_md_sync, bitmap io). */
3769	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
3770
3771	clear_bit(GO_DISKLESS, &mdev->flags);
3772	return 1;
3773}
3774
3775void drbd_go_diskless(struct drbd_conf *mdev)
3776{
3777	D_ASSERT(mdev->state.disk == D_FAILED);
3778	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3779		drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3780	/* don't drbd_queue_work_front,
3781	 * we need to serialize with the after_state_ch work
3782	 * of the -> D_FAILED transition. */
3783}
3784
3785/**
3786 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3787 * @mdev: DRBD device.
3788 * @io_fn: IO callback to be called when bitmap IO is possible
3789 * @done: callback to be called after the bitmap IO was performed
3790 * @why: Descriptive text of the reason for doing the IO
3791 *
3792 * While IO on the bitmap happens we freeze application IO, thus ensuring
3793 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3794 * called from worker context. It MUST NOT be used while a previous such
3795 * work is still pending!
3796 */
3797void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3798 int (*io_fn)(struct drbd_conf *),
3799 void (*done)(struct drbd_conf *, int),
3800 char *why)
3801{
3802 D_ASSERT(current == mdev->worker.task);
3803
3804 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3805 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3806 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3807 if (mdev->bm_io_work.why)
3808 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3809 why, mdev->bm_io_work.why);
3810
3811 mdev->bm_io_work.io_fn = io_fn;
3812 mdev->bm_io_work.done = done;
3813 mdev->bm_io_work.why = why;
3814
3815 set_bit(BITMAP_IO, &mdev->flags);
3816 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3817 if (list_empty(&mdev->bm_io_work.w.list)) {
3818 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3819 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3820 } else
3821 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3822 }
3823}
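/*
 * Illustrative usage sketch, not part of the driver: queue a full bitmap
 * write-out from worker context, for example to start a full sync:
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     &after_set_n_write,	// made-up done callback
 *			     "set_n_write sketch");
 *
 * drbd_bmio_set_n_write() above matches the io_fn prototype; the done
 * callback and the reason string are invented for this example.
 */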
3824
3825/**
3826 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3827 * @mdev: DRBD device.
3828 * @io_fn: IO callback to be called when bitmap IO is possible
3829 * @why: Descriptive text of the reason for doing the IO
3830 *
3831 * Freezes application IO while the actual IO operation runs. This
3832 * function MAY NOT be called from worker context.
3833 */
3834int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3835{
3836 int rv;
3837
3838 D_ASSERT(current != mdev->worker.task);
3839
3840 drbd_suspend_io(mdev);
3841
3842 drbd_bm_lock(mdev, why);
3843 rv = io_fn(mdev);
3844 drbd_bm_unlock(mdev);
3845
3846 drbd_resume_io(mdev);
3847
3848 return rv;
3849}
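/*
 * Illustrative usage sketch, not part of the driver, from some context
 * other than the worker:
 *
 *	int err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *				 "clear_n_write sketch");
 *	if (err)
 *		dev_err(DEV, "bitmap write-out failed\n");	// made-up handling
 */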
3850
3851void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3852{
3853 if ((mdev->ldev->md.flags & flag) != flag) {
3854 drbd_md_mark_dirty(mdev);
3855 mdev->ldev->md.flags |= flag;
3856 }
3857}
3858
3859void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3860{
3861 if ((mdev->ldev->md.flags & flag) != 0) {
3862 drbd_md_mark_dirty(mdev);
3863 mdev->ldev->md.flags &= ~flag;
3864 }
3865}
3866int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3867{
3868 return (bdev->md.flags & flag) != 0;
3869}
3870
3871static void md_sync_timer_fn(unsigned long data)
3872{
3873 struct drbd_conf *mdev = (struct drbd_conf *) data;
3874
3875 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3876}
3877
3878static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3879{
3880 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3881#ifdef DEBUG
3882	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3883		mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3884#endif
3885	drbd_md_sync(mdev);
3886	return 1;
3887}
3888
3889#ifdef CONFIG_DRBD_FAULT_INJECTION
3890/* Fault insertion support including random number generator shamelessly
3891 * stolen from kernel/rcutorture.c */
3892struct fault_random_state {
3893 unsigned long state;
3894 unsigned long count;
3895};
3896
3897#define FAULT_RANDOM_MULT 39916801 /* prime */
3898#define FAULT_RANDOM_ADD 479001701 /* prime */
3899#define FAULT_RANDOM_REFRESH 10000
3900
3901/*
3902 * Crude but fast random-number generator. Uses a linear congruential
3903 * generator, with occasional help from get_random_bytes().
3904 */
3905static unsigned long
3906_drbd_fault_random(struct fault_random_state *rsp)
3907{
3908 long refresh;
3909
3910	if (!rsp->count--) {
3911		get_random_bytes(&refresh, sizeof(refresh));
3912 rsp->state += refresh;
3913 rsp->count = FAULT_RANDOM_REFRESH;
3914 }
3915 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3916 return swahw32(rsp->state);
3917}
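/*
 * Illustrative note: the recurrence above is the textbook LCG
 *
 *	state' = state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD	(mod 2^BITS_PER_LONG)
 *
 * reseeded from get_random_bytes() every FAULT_RANDOM_REFRESH calls;
 * swahw32() merely swaps the 16-bit halfwords of the (truncated) 32-bit
 * state before use, it does not add entropy.
 */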
3918
3919static char *
3920_drbd_fault_str(unsigned int type) {
3921 static char *_faults[] = {
3922 [DRBD_FAULT_MD_WR] = "Meta-data write",
3923 [DRBD_FAULT_MD_RD] = "Meta-data read",
3924 [DRBD_FAULT_RS_WR] = "Resync write",
3925 [DRBD_FAULT_RS_RD] = "Resync read",
3926 [DRBD_FAULT_DT_WR] = "Data write",
3927 [DRBD_FAULT_DT_RD] = "Data read",
3928 [DRBD_FAULT_DT_RA] = "Data read ahead",
3929 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3930		[DRBD_FAULT_AL_EE] = "EE allocation",
3931		[DRBD_FAULT_RECEIVE] = "receive data corruption",
3932	};
3933
3934 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3935}
3936
3937unsigned int
3938_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3939{
3940 static struct fault_random_state rrs = {0, 0};
3941
3942 unsigned int ret = (
3943 (fault_devs == 0 ||
3944 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3945 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3946
3947 if (ret) {
3948 fault_count++;
3949
3950		if (__ratelimit(&drbd_ratelimit_state))
3951			dev_warn(DEV, "***Simulating %s failure\n",
3952 _drbd_fault_str(type));
3953 }
3954
3955 return ret;
3956}
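/*
 * Illustrative sketch, not part of the driver: with module parameters
 * such as fault_rate=10 and fault_devs=1, roughly 10% of the eligible
 * requests on minor 0 get failed; a caller would test along the lines of
 *
 *	if (fault_rate && _drbd_insert_fault(mdev, DRBD_FAULT_MD_WR))
 *		bio_endio(bio, -EIO);	// made-up error path for the sketch
 */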
3957#endif
3958
3959const char *drbd_buildtag(void)
3960{
3961	/* A DRBD module built from external sources carries a reference to
3962	   the git hash of the source code here. */
3963
3964 static char buildtag[38] = "\0uilt-in";
3965
3966 if (buildtag[0] == 0) {
3967#ifdef CONFIG_MODULES
3968 if (THIS_MODULE != NULL)
3969 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3970 else
3971#endif
3972 buildtag[0] = 'b';
3973 }
3974
3975 return buildtag;
3976}
3977
3978module_init(drbd_init)
3979module_exit(drbd_cleanup)
3980
3981EXPORT_SYMBOL(drbd_conn_str);
3982EXPORT_SYMBOL(drbd_role_str);
3983EXPORT_SYMBOL(drbd_disk_str);
3984EXPORT_SYMBOL(drbd_set_st_err_str);