blob: c0ea5baa9a1bbd9993e424be56983eb55325bac4 [file] [log] [blame]
Philipp Reisnerb411b362009-09-25 16:07:19 -07001/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
Philipp Reisnerb411b362009-09-25 16:07:19 -070029#include <linux/module.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070030#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
Arnd Bergmann2a48fc02010-06-02 14:28:52 +020035#include <linux/mutex.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070036#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
Philipp Reisnerb411b362009-09-25 16:07:19 -070055#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
/*
 * Work item describing one state change: the state before and after the
 * change, the flags it was requested with, and an optional completion.
 * NOTE(review): presumably consumed by w_after_state_ch(); the code that
 * fills in and queues this structure is not part of this section.
 */
struct after_state_chg_work {
	struct drbd_work w;		/* embedded generic work item */
	union drbd_state os;		/* old state */
	union drbd_state ns;		/* new state */
	enum chg_state_flags flags;	/* flags of the state change request */
	struct completion *done;	/* optional completion; signalling side not shown here */
};
66
/* Module-wide mutex; its users are not part of this section. */
static DEFINE_MUTEX(drbd_main_mutex);

/* Functions taking a struct drbd_thread; their definitions are not
 * part of this section. */
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);

/* open/release callbacks referenced by drbd_ops */
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);

/* Prototypes of static helpers; the w_* ones share the
 * (mdev, work, unused) signature. */
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
Philipp Reisnerb411b362009-09-25 16:07:19 -070082
/* Module metadata: authors, description, version and license. */
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
/* The allowed range is pasted into the description string at compile time. */
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
/* Alias the module to the DRBD block device major number. */
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
91
92#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
/*
 * The third argument is the permission mask of the parameter file.
 * NOTE(review): disable_sendpage and allow_oos are registered as "bool"
 * but defined as int further down in this file; newer kernels expect a
 * bool variable for bool parameters -- confirm against the target kernel
 * and any extern declarations before changing the types.
 */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
102
/* Fault injection knobs, only built with CONFIG_DRBD_FAULT_INJECTION. */
#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
/* NOTE(review): mode 0644 here, 0664 on the three parameters above --
 * confirm whether the difference is intended. */
module_param(fault_devs, int, 0644);
#endif
117
/* module parameter, defined */
/* Storage and default values for the parameters registered above. */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

/* Registered as a string parameter bounded by the size of the buffer. */
module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
130
131/* in 2.6.x, our device mapping and config info contains our virtual gendisks
132 * as member "struct gendisk *vdisk;"
133 */
struct drbd_conf **minor_table;

/* Slab caches and mempools; their setup and teardown are not part of
 * this section. */
struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list; the next pointer is stored in the
   private member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

/* Rate limit state, initialised with an interval of 5 * HZ and a burst of 5. */
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
155
/* Block device operations of a DRBD device: only open and release are
 * provided here. */
static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};
161
/* Number of elements of the array A.  Only meaningful for true arrays,
 * not for pointers.  The argument is parenthesised before indexing so
 * that any array-valued expression can be passed. */
#define ARRY_SIZE(A) (sizeof(A)/sizeof((A)[0]))
163
#ifdef __CHECKER__
/* Out-of-line variant used only for sparse runs: as an inline function
 * this produces lots of false positives, as a real function sparse copes.
 *
 * Takes a reference on mdev->local_cnt and keeps it if the disk state is
 * at least @mins.  Otherwise the reference is dropped again, waking
 * misc_wait when the counter reaches zero.
 * Returns 1 if the reference is held, 0 if not.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	atomic_inc(&mdev->local_cnt);

	if (mdev->state.disk >= mins)
		return 1;

	/* disk state too low: give the reference back */
	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);
	return 0;
}

#endif
182
183/**
184 * DOC: The transfer log
185 *
186 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
187 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
188 * of the list. There is always at least one &struct drbd_tl_epoch object.
189 *
190 * Each &struct drbd_tl_epoch has a circular double linked list of requests
191 * attached.
192 */
193static int tl_init(struct drbd_conf *mdev)
194{
195 struct drbd_tl_epoch *b;
196
197 /* during device minor initialization, we may well use GFP_KERNEL */
198 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
199 if (!b)
200 return 0;
201 INIT_LIST_HEAD(&b->requests);
202 INIT_LIST_HEAD(&b->w.list);
203 b->next = NULL;
204 b->br_number = 4711;
Philipp Reisner7e602c02010-05-27 14:49:27 +0200205 b->n_writes = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700206 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
207
208 mdev->oldest_tle = b;
209 mdev->newest_tle = b;
210 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
211
212 mdev->tl_hash = NULL;
213 mdev->tl_hash_s = 0;
214
215 return 1;
216}
217
218static void tl_cleanup(struct drbd_conf *mdev)
219{
220 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
221 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
222 kfree(mdev->oldest_tle);
223 mdev->oldest_tle = NULL;
224 kfree(mdev->unused_spare_tle);
225 mdev->unused_spare_tle = NULL;
226 kfree(mdev->tl_hash);
227 mdev->tl_hash = NULL;
228 mdev->tl_hash_s = 0;
229}
230
Andreas Gruenbacherd6287692011-01-13 23:05:39 +0100231static void drbd_free_tl_hash(struct drbd_conf *mdev)
232{
233 struct hlist_head *h;
234
235 spin_lock_irq(&mdev->req_lock);
236
237 if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
238 spin_unlock_irq(&mdev->req_lock);
239 return;
240 }
241 /* paranoia code */
242 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
243 if (h->first)
244 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
245 (int)(h - mdev->ee_hash), h->first);
246 kfree(mdev->ee_hash);
247 mdev->ee_hash = NULL;
248 mdev->ee_hash_s = 0;
249
250 /* paranoia code */
251 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
252 if (h->first)
253 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
254 (int)(h - mdev->tl_hash), h->first);
255 kfree(mdev->tl_hash);
256 mdev->tl_hash = NULL;
257 mdev->tl_hash_s = 0;
258 spin_unlock_irq(&mdev->req_lock);
259}
260
Philipp Reisnerb411b362009-09-25 16:07:19 -0700261/**
262 * _tl_add_barrier() - Adds a barrier to the transfer log
263 * @mdev: DRBD device.
264 * @new: Barrier to be added before the current head of the TL.
265 *
266 * The caller must hold the req_lock.
267 */
268void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
269{
270 struct drbd_tl_epoch *newest_before;
271
272 INIT_LIST_HEAD(&new->requests);
273 INIT_LIST_HEAD(&new->w.list);
274 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
275 new->next = NULL;
Philipp Reisner7e602c02010-05-27 14:49:27 +0200276 new->n_writes = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700277
278 newest_before = mdev->newest_tle;
279 /* never send a barrier number == 0, because that is special-cased
280 * when using TCQ for our write ordering code */
281 new->br_number = (newest_before->br_number+1) ?: 1;
282 if (mdev->newest_tle != new) {
283 mdev->newest_tle->next = new;
284 mdev->newest_tle = new;
285 }
286}
287
288/**
289 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
290 * @mdev: DRBD device.
291 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
292 * @set_size: Expected number of requests before that barrier.
293 *
294 * In case the passed barrier_nr or set_size does not match the oldest
295 * &struct drbd_tl_epoch objects this function will cause a termination
296 * of the connection.
297 */
298void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
299 unsigned int set_size)
300{
301 struct drbd_tl_epoch *b, *nob; /* next old barrier */
302 struct list_head *le, *tle;
303 struct drbd_request *r;
304
305 spin_lock_irq(&mdev->req_lock);
306
307 b = mdev->oldest_tle;
308
309 /* first some paranoia code */
310 if (b == NULL) {
311 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
312 barrier_nr);
313 goto bail;
314 }
315 if (b->br_number != barrier_nr) {
316 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
317 barrier_nr, b->br_number);
318 goto bail;
319 }
Philipp Reisner7e602c02010-05-27 14:49:27 +0200320 if (b->n_writes != set_size) {
321 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
322 barrier_nr, set_size, b->n_writes);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700323 goto bail;
324 }
325
326 /* Clean up list of requests processed during current epoch */
327 list_for_each_safe(le, tle, &b->requests) {
328 r = list_entry(le, struct drbd_request, tl_requests);
329 _req_mod(r, barrier_acked);
330 }
331 /* There could be requests on the list waiting for completion
332 of the write to the local disk. To avoid corruptions of
333 slab's data structures we have to remove the lists head.
334
335 Also there could have been a barrier ack out of sequence, overtaking
336 the write acks - which would be a bug and violating write ordering.
337 To not deadlock in case we lose connection while such requests are
338 still pending, we need some way to find them for the
339 _req_mode(connection_lost_while_pending).
340
341 These have been list_move'd to the out_of_sequence_requests list in
342 _req_mod(, barrier_acked) above.
343 */
344 list_del_init(&b->requests);
345
346 nob = b->next;
347 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
348 _tl_add_barrier(mdev, b);
349 if (nob)
350 mdev->oldest_tle = nob;
351 /* if nob == NULL b was the only barrier, and becomes the new
352 barrier. Therefore mdev->oldest_tle points already to b */
353 } else {
354 D_ASSERT(nob != NULL);
355 mdev->oldest_tle = nob;
356 kfree(b);
357 }
358
359 spin_unlock_irq(&mdev->req_lock);
360 dec_ap_pending(mdev);
361
362 return;
363
364bail:
365 spin_unlock_irq(&mdev->req_lock);
366 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
367}
368
Philipp Reisner617049a2010-12-22 12:48:31 +0100369
Philipp Reisner11b58e72010-05-12 17:08:26 +0200370/**
371 * _tl_restart() - Walks the transfer log, and applies an action to all requests
372 * @mdev: DRBD device.
373 * @what: The action/event to perform with all request objects
374 *
375 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
376 * restart_frozen_disk_io.
377 */
378static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
379{
380 struct drbd_tl_epoch *b, *tmp, **pn;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200381 struct list_head *le, *tle, carry_reads;
Philipp Reisner11b58e72010-05-12 17:08:26 +0200382 struct drbd_request *req;
383 int rv, n_writes, n_reads;
384
385 b = mdev->oldest_tle;
386 pn = &mdev->oldest_tle;
387 while (b) {
388 n_writes = 0;
389 n_reads = 0;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200390 INIT_LIST_HEAD(&carry_reads);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200391 list_for_each_safe(le, tle, &b->requests) {
392 req = list_entry(le, struct drbd_request, tl_requests);
393 rv = _req_mod(req, what);
394
395 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
396 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
397 }
398 tmp = b->next;
399
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200400 if (n_writes) {
Philipp Reisner11b58e72010-05-12 17:08:26 +0200401 if (what == resend) {
402 b->n_writes = n_writes;
403 if (b->w.cb == NULL) {
404 b->w.cb = w_send_barrier;
405 inc_ap_pending(mdev);
406 set_bit(CREATE_BARRIER, &mdev->flags);
407 }
408
409 drbd_queue_work(&mdev->data.work, &b->w);
410 }
411 pn = &b->next;
412 } else {
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200413 if (n_reads)
414 list_add(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200415 /* there could still be requests on that ring list,
416 * in case local io is still pending */
417 list_del(&b->requests);
418
419 /* dec_ap_pending corresponding to queue_barrier.
420 * the newest barrier may not have been queued yet,
421 * in which case w.cb is still NULL. */
422 if (b->w.cb != NULL)
423 dec_ap_pending(mdev);
424
425 if (b == mdev->newest_tle) {
426 /* recycle, but reinit! */
427 D_ASSERT(tmp == NULL);
428 INIT_LIST_HEAD(&b->requests);
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200429 list_splice(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200430 INIT_LIST_HEAD(&b->w.list);
431 b->w.cb = NULL;
432 b->br_number = net_random();
433 b->n_writes = 0;
434
435 *pn = b;
436 break;
437 }
438 *pn = tmp;
439 kfree(b);
440 }
441 b = tmp;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200442 list_splice(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200443 }
444}
445
Philipp Reisnerb411b362009-09-25 16:07:19 -0700446
447/**
448 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
449 * @mdev: DRBD device.
450 *
451 * This is called after the connection to the peer was lost. The storage covered
452 * by the requests on the transfer gets marked as our of sync. Called from the
453 * receiver thread and the worker thread.
454 */
455void tl_clear(struct drbd_conf *mdev)
456{
Philipp Reisnerb411b362009-09-25 16:07:19 -0700457 struct list_head *le, *tle;
458 struct drbd_request *r;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700459
460 spin_lock_irq(&mdev->req_lock);
461
Philipp Reisner11b58e72010-05-12 17:08:26 +0200462 _tl_restart(mdev, connection_lost_while_pending);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700463
464 /* we expect this list to be empty. */
465 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
466
467 /* but just in case, clean it up anyways! */
468 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
469 r = list_entry(le, struct drbd_request, tl_requests);
470 /* It would be nice to complete outside of spinlock.
471 * But this is easier for now. */
472 _req_mod(r, connection_lost_while_pending);
473 }
474
475 /* ensure bit indicating barrier is required is clear */
476 clear_bit(CREATE_BARRIER, &mdev->flags);
477
Philipp Reisner288f4222010-05-27 15:07:43 +0200478 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
479
Philipp Reisnerb411b362009-09-25 16:07:19 -0700480 spin_unlock_irq(&mdev->req_lock);
481}
482
Philipp Reisner11b58e72010-05-12 17:08:26 +0200483void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
484{
485 spin_lock_irq(&mdev->req_lock);
486 _tl_restart(mdev, what);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700487 spin_unlock_irq(&mdev->req_lock);
488}
489
490/**
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100491 * cl_wide_st_chg() - true if the state change is a cluster wide one
Philipp Reisnerb411b362009-09-25 16:07:19 -0700492 * @mdev: DRBD device.
493 * @os: old (current) state.
494 * @ns: new (wanted) state.
495 */
496static int cl_wide_st_chg(struct drbd_conf *mdev,
497 union drbd_state os, union drbd_state ns)
498{
499 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
500 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
501 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
502 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
503 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
504 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
505 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
506}
507
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100508enum drbd_state_rv
509drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
510 union drbd_state mask, union drbd_state val)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700511{
512 unsigned long flags;
513 union drbd_state os, ns;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100514 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700515
516 spin_lock_irqsave(&mdev->req_lock, flags);
517 os = mdev->state;
518 ns.i = (os.i & ~mask.i) | val.i;
519 rv = _drbd_set_state(mdev, ns, f, NULL);
520 ns = mdev->state;
521 spin_unlock_irqrestore(&mdev->req_lock, flags);
522
523 return rv;
524}
525
526/**
527 * drbd_force_state() - Impose a change which happens outside our control on our state
528 * @mdev: DRBD device.
529 * @mask: mask of state bits to change.
530 * @val: value of new state bits.
531 */
532void drbd_force_state(struct drbd_conf *mdev,
533 union drbd_state mask, union drbd_state val)
534{
535 drbd_change_state(mdev, CS_HARD, mask, val);
536}
537
/* Prototypes of the state validation helpers defined further down;
 * they are used by _req_st_cond() and drbd_req_state() below. */
static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
/* argument order: device, new state, old state */
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
/* called as (mdev, mask, val) from drbd_req_state(); defined outside
 * this section */
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);
546
Andreas Gruenbacherc8b32562010-12-08 01:06:16 +0100547static enum drbd_state_rv
548_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
549 union drbd_state val)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700550{
551 union drbd_state os, ns;
552 unsigned long flags;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100553 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700554
555 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
556 return SS_CW_SUCCESS;
557
558 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
559 return SS_CW_FAILED_BY_PEER;
560
561 rv = 0;
562 spin_lock_irqsave(&mdev->req_lock, flags);
563 os = mdev->state;
564 ns.i = (os.i & ~mask.i) | val.i;
565 ns = sanitize_state(mdev, os, ns, NULL);
566
567 if (!cl_wide_st_chg(mdev, os, ns))
568 rv = SS_CW_NO_NEED;
569 if (!rv) {
570 rv = is_valid_state(mdev, ns);
571 if (rv == SS_SUCCESS) {
572 rv = is_valid_state_transition(mdev, ns, os);
573 if (rv == SS_SUCCESS)
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100574 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700575 }
576 }
577 spin_unlock_irqrestore(&mdev->req_lock, flags);
578
579 return rv;
580}
581
582/**
583 * drbd_req_state() - Perform an eventually cluster wide state change
584 * @mdev: DRBD device.
585 * @mask: mask of state bits to change.
586 * @val: value of new state bits.
587 * @f: flags
588 *
589 * Should not be called directly, use drbd_request_state() or
590 * _drbd_request_state().
591 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100592static enum drbd_state_rv
593drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
594 union drbd_state val, enum chg_state_flags f)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700595{
596 struct completion done;
597 unsigned long flags;
598 union drbd_state os, ns;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100599 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700600
601 init_completion(&done);
602
603 if (f & CS_SERIALIZE)
604 mutex_lock(&mdev->state_mutex);
605
606 spin_lock_irqsave(&mdev->req_lock, flags);
607 os = mdev->state;
608 ns.i = (os.i & ~mask.i) | val.i;
609 ns = sanitize_state(mdev, os, ns, NULL);
610
611 if (cl_wide_st_chg(mdev, os, ns)) {
612 rv = is_valid_state(mdev, ns);
613 if (rv == SS_SUCCESS)
614 rv = is_valid_state_transition(mdev, ns, os);
615 spin_unlock_irqrestore(&mdev->req_lock, flags);
616
617 if (rv < SS_SUCCESS) {
618 if (f & CS_VERBOSE)
619 print_st_err(mdev, os, ns, rv);
620 goto abort;
621 }
622
623 drbd_state_lock(mdev);
624 if (!drbd_send_state_req(mdev, mask, val)) {
625 drbd_state_unlock(mdev);
626 rv = SS_CW_FAILED_BY_PEER;
627 if (f & CS_VERBOSE)
628 print_st_err(mdev, os, ns, rv);
629 goto abort;
630 }
631
632 wait_event(mdev->state_wait,
633 (rv = _req_st_cond(mdev, mask, val)));
634
635 if (rv < SS_SUCCESS) {
636 drbd_state_unlock(mdev);
637 if (f & CS_VERBOSE)
638 print_st_err(mdev, os, ns, rv);
639 goto abort;
640 }
641 spin_lock_irqsave(&mdev->req_lock, flags);
642 os = mdev->state;
643 ns.i = (os.i & ~mask.i) | val.i;
644 rv = _drbd_set_state(mdev, ns, f, &done);
645 drbd_state_unlock(mdev);
646 } else {
647 rv = _drbd_set_state(mdev, ns, f, &done);
648 }
649
650 spin_unlock_irqrestore(&mdev->req_lock, flags);
651
652 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
653 D_ASSERT(current != mdev->worker.task);
654 wait_for_completion(&done);
655 }
656
657abort:
658 if (f & CS_SERIALIZE)
659 mutex_unlock(&mdev->state_mutex);
660
661 return rv;
662}
663
664/**
665 * _drbd_request_state() - Request a state change (with flags)
666 * @mdev: DRBD device.
667 * @mask: mask of state bits to change.
668 * @val: value of new state bits.
669 * @f: flags
670 *
671 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
672 * flag, or when logging of failed state change requests is not desired.
673 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100674enum drbd_state_rv
675_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
676 union drbd_state val, enum chg_state_flags f)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700677{
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100678 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700679
680 wait_event(mdev->state_wait,
681 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
682
683 return rv;
684}
685
686static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
687{
688 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
689 name,
690 drbd_conn_str(ns.conn),
691 drbd_role_str(ns.role),
692 drbd_role_str(ns.peer),
693 drbd_disk_str(ns.disk),
694 drbd_disk_str(ns.pdsk),
Philipp Reisnerfb22c402010-09-08 23:20:21 +0200695 is_susp(ns) ? 's' : 'r',
Philipp Reisnerb411b362009-09-25 16:07:19 -0700696 ns.aftr_isp ? 'a' : '-',
697 ns.peer_isp ? 'p' : '-',
698 ns.user_isp ? 'u' : '-'
699 );
700}
701
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100702void print_st_err(struct drbd_conf *mdev, union drbd_state os,
703 union drbd_state ns, enum drbd_state_rv err)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700704{
705 if (err == SS_IN_TRANSIENT_STATE)
706 return;
707 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
708 print_st(mdev, " state", os);
709 print_st(mdev, "wanted", ns);
710}
711
712
Philipp Reisnerb411b362009-09-25 16:07:19 -0700713/**
714 * is_valid_state() - Returns an SS_ error code if ns is not valid
715 * @mdev: DRBD device.
716 * @ns: State to consider.
717 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100718static enum drbd_state_rv
719is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700720{
721 /* See drbd_state_sw_errors in drbd_strings.c */
722
723 enum drbd_fencing_p fp;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100724 enum drbd_state_rv rv = SS_SUCCESS;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700725
726 fp = FP_DONT_CARE;
727 if (get_ldev(mdev)) {
728 fp = mdev->ldev->dc.fencing;
729 put_ldev(mdev);
730 }
731
732 if (get_net_conf(mdev)) {
733 if (!mdev->net_conf->two_primaries &&
734 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
735 rv = SS_TWO_PRIMARIES;
736 put_net_conf(mdev);
737 }
738
739 if (rv <= 0)
740 /* already found a reason to abort */;
741 else if (ns.role == R_SECONDARY && mdev->open_cnt)
742 rv = SS_DEVICE_IN_USE;
743
744 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
745 rv = SS_NO_UP_TO_DATE_DISK;
746
747 else if (fp >= FP_RESOURCE &&
748 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
749 rv = SS_PRIMARY_NOP;
750
751 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
752 rv = SS_NO_UP_TO_DATE_DISK;
753
754 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
755 rv = SS_NO_LOCAL_DISK;
756
757 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
758 rv = SS_NO_REMOTE_DISK;
759
Lars Ellenberg8d4ce822010-04-01 16:59:32 +0200760 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
761 rv = SS_NO_UP_TO_DATE_DISK;
762
Philipp Reisnerb411b362009-09-25 16:07:19 -0700763 else if ((ns.conn == C_CONNECTED ||
764 ns.conn == C_WF_BITMAP_S ||
765 ns.conn == C_SYNC_SOURCE ||
766 ns.conn == C_PAUSED_SYNC_S) &&
767 ns.disk == D_OUTDATED)
768 rv = SS_CONNECTED_OUTDATES;
769
770 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
771 (mdev->sync_conf.verify_alg[0] == 0))
772 rv = SS_NO_VERIFY_ALG;
773
774 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
775 mdev->agreed_pro_version < 88)
776 rv = SS_NOT_SUPPORTED;
777
Philipp Reisnerfa7d9392011-05-17 14:48:55 +0200778 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
779 rv = SS_CONNECTED_OUTDATES;
780
Philipp Reisnerb411b362009-09-25 16:07:19 -0700781 return rv;
782}
783
784/**
785 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
786 * @mdev: DRBD device.
787 * @ns: new state.
788 * @os: old state.
789 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100790static enum drbd_state_rv
791is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
792 union drbd_state os)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700793{
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100794 enum drbd_state_rv rv = SS_SUCCESS;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700795
796 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
797 os.conn > C_CONNECTED)
798 rv = SS_RESYNC_RUNNING;
799
800 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
801 rv = SS_ALREADY_STANDALONE;
802
803 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
804 rv = SS_IS_DISKLESS;
805
806 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
807 rv = SS_NO_NET_CONFIG;
808
809 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
810 rv = SS_LOWER_THAN_OUTDATED;
811
812 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
813 rv = SS_IN_TRANSIENT_STATE;
814
815 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
816 rv = SS_IN_TRANSIENT_STATE;
817
818 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
819 rv = SS_NEED_CONNECTION;
820
821 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
822 ns.conn != os.conn && os.conn > C_CONNECTED)
823 rv = SS_RESYNC_RUNNING;
824
825 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
826 os.conn < C_CONNECTED)
827 rv = SS_NEED_CONNECTION;
828
Philipp Reisner1fc80cf2010-11-22 14:18:47 +0100829 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
830 && os.conn < C_WF_REPORT_PARAMS)
831 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
832
Philipp Reisnerb411b362009-09-25 16:07:19 -0700833 return rv;
834}
835
836/**
837 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
838 * @mdev: DRBD device.
839 * @os: old state.
840 * @ns: new state.
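 * @warn_sync_abort: if not NULL, may be set to "Online-verify" or "Resync"
 *	when a running verify/resync is aborted because a disk fails or detaches.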
842 *
 * When we lose the connection, we have to set the state of the peer's disk
 * (pdsk) to D_UNKNOWN. This rule and many more along those lines are in this
 * function.
845 */
846static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
Lars Ellenberg02bc7172010-09-06 12:13:20 +0200847 union drbd_state ns, const char **warn_sync_abort)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700848{
849 enum drbd_fencing_p fp;
Philipp Reisnerab17b68f2010-11-17 16:54:36 +0100850 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700851
852 fp = FP_DONT_CARE;
853 if (get_ldev(mdev)) {
854 fp = mdev->ldev->dc.fencing;
855 put_ldev(mdev);
856 }
857
858 /* Disallow Network errors to configure a device's network part */
859 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
860 os.conn <= C_DISCONNECTING)
861 ns.conn = os.conn;
862
Lars Ellenbergf2906e12010-07-21 17:04:32 +0200863 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
864 * If you try to go into some Sync* state, that shall fail (elsewhere). */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700865 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
Lars Ellenbergf2906e12010-07-21 17:04:32 +0200866 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700867 ns.conn = os.conn;
868
Lars Ellenberg82f59cc2010-10-16 12:13:47 +0200869 /* we cannot fail (again) if we already detached */
870 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
871 ns.disk = D_DISKLESS;
872
873 /* if we are only D_ATTACHING yet,
874 * we can (and should) go directly to D_DISKLESS. */
875 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
876 ns.disk = D_DISKLESS;
877
Philipp Reisnerb411b362009-09-25 16:07:19 -0700878 /* After C_DISCONNECTING only C_STANDALONE may follow */
879 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
880 ns.conn = os.conn;
881
882 if (ns.conn < C_CONNECTED) {
883 ns.peer_isp = 0;
884 ns.peer = R_UNKNOWN;
885 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
886 ns.pdsk = D_UNKNOWN;
887 }
888
889 /* Clear the aftr_isp when becoming unconfigured */
890 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
891 ns.aftr_isp = 0;
892
Philipp Reisnerb411b362009-09-25 16:07:19 -0700893 /* Abort resync if a disk fails/detaches */
894 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
895 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
896 if (warn_sync_abort)
Lars Ellenberg02bc7172010-09-06 12:13:20 +0200897 *warn_sync_abort =
898 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
899 "Online-verify" : "Resync";
Philipp Reisnerb411b362009-09-25 16:07:19 -0700900 ns.conn = C_CONNECTED;
901 }
902
Philipp Reisnerb411b362009-09-25 16:07:19 -0700903 /* Connection breaks down before we finished "Negotiating" */
904 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
905 get_ldev_if_state(mdev, D_NEGOTIATING)) {
906 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
907 ns.disk = mdev->new_state_tmp.disk;
908 ns.pdsk = mdev->new_state_tmp.pdsk;
909 } else {
910 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
911 ns.disk = D_DISKLESS;
912 ns.pdsk = D_UNKNOWN;
913 }
914 put_ldev(mdev);
915 }
916
Philipp Reisnerab17b68f2010-11-17 16:54:36 +0100917 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
918 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
919 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
920 ns.disk = D_UP_TO_DATE;
921 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
922 ns.pdsk = D_UP_TO_DATE;
923 }
924
925 /* Implications of the connection stat on the disk states */
926 disk_min = D_DISKLESS;
927 disk_max = D_UP_TO_DATE;
928 pdsk_min = D_INCONSISTENT;
929 pdsk_max = D_UNKNOWN;
930 switch ((enum drbd_conns)ns.conn) {
931 case C_WF_BITMAP_T:
932 case C_PAUSED_SYNC_T:
933 case C_STARTING_SYNC_T:
934 case C_WF_SYNC_UUID:
935 case C_BEHIND:
936 disk_min = D_INCONSISTENT;
937 disk_max = D_OUTDATED;
938 pdsk_min = D_UP_TO_DATE;
939 pdsk_max = D_UP_TO_DATE;
940 break;
941 case C_VERIFY_S:
942 case C_VERIFY_T:
943 disk_min = D_UP_TO_DATE;
944 disk_max = D_UP_TO_DATE;
945 pdsk_min = D_UP_TO_DATE;
946 pdsk_max = D_UP_TO_DATE;
947 break;
948 case C_CONNECTED:
949 disk_min = D_DISKLESS;
950 disk_max = D_UP_TO_DATE;
951 pdsk_min = D_DISKLESS;
952 pdsk_max = D_UP_TO_DATE;
953 break;
954 case C_WF_BITMAP_S:
955 case C_PAUSED_SYNC_S:
956 case C_STARTING_SYNC_S:
957 case C_AHEAD:
958 disk_min = D_UP_TO_DATE;
959 disk_max = D_UP_TO_DATE;
960 pdsk_min = D_INCONSISTENT;
961 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
962 break;
963 case C_SYNC_TARGET:
964 disk_min = D_INCONSISTENT;
965 disk_max = D_INCONSISTENT;
966 pdsk_min = D_UP_TO_DATE;
967 pdsk_max = D_UP_TO_DATE;
968 break;
969 case C_SYNC_SOURCE:
970 disk_min = D_UP_TO_DATE;
971 disk_max = D_UP_TO_DATE;
972 pdsk_min = D_INCONSISTENT;
973 pdsk_max = D_INCONSISTENT;
974 break;
975 case C_STANDALONE:
976 case C_DISCONNECTING:
977 case C_UNCONNECTED:
978 case C_TIMEOUT:
979 case C_BROKEN_PIPE:
980 case C_NETWORK_FAILURE:
981 case C_PROTOCOL_ERROR:
982 case C_TEAR_DOWN:
983 case C_WF_CONNECTION:
984 case C_WF_REPORT_PARAMS:
985 case C_MASK:
986 break;
987 }
988 if (ns.disk > disk_max)
989 ns.disk = disk_max;
990
991 if (ns.disk < disk_min) {
992 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
993 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
994 ns.disk = disk_min;
995 }
996 if (ns.pdsk > pdsk_max)
997 ns.pdsk = pdsk_max;
998
999 if (ns.pdsk < pdsk_min) {
1000 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
1001 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
1002 ns.pdsk = pdsk_min;
1003 }
1004
Philipp Reisnerb411b362009-09-25 16:07:19 -07001005 if (fp == FP_STONITH &&
Philipp Reisner0a492162009-10-21 13:08:29 +02001006 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1007 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001008 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
Philipp Reisner265be2d2010-05-31 10:14:17 +02001009
1010 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1011 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1012 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001013 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001014
1015 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1016 if (ns.conn == C_SYNC_SOURCE)
1017 ns.conn = C_PAUSED_SYNC_S;
1018 if (ns.conn == C_SYNC_TARGET)
1019 ns.conn = C_PAUSED_SYNC_T;
1020 } else {
1021 if (ns.conn == C_PAUSED_SYNC_S)
1022 ns.conn = C_SYNC_SOURCE;
1023 if (ns.conn == C_PAUSED_SYNC_T)
1024 ns.conn = C_SYNC_TARGET;
1025 }
1026
1027 return ns;
1028}
1029
1030/* helper for __drbd_set_state */
1031static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1032{
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001033 if (mdev->agreed_pro_version < 90)
1034 mdev->ov_start_sector = 0;
1035 mdev->rs_total = drbd_bm_bits(mdev);
1036 mdev->ov_position = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001037 if (cs == C_VERIFY_T) {
1038 /* starting online verify from an arbitrary position
1039 * does not fit well into the existing protocol.
1040 * on C_VERIFY_T, we initialize ov_left and friends
1041 * implicitly in receive_DataRequest once the
1042 * first P_OV_REQUEST is received */
1043 mdev->ov_start_sector = ~(sector_t)0;
1044 } else {
1045 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001046 if (bit >= mdev->rs_total) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001047 mdev->ov_start_sector =
1048 BM_BIT_TO_SECT(mdev->rs_total - 1);
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001049 mdev->rs_total = 1;
1050 } else
1051 mdev->rs_total -= bit;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001052 mdev->ov_position = mdev->ov_start_sector;
1053 }
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001054 mdev->ov_left = mdev->rs_total;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001055}
1056
Philipp Reisner07782862010-08-31 12:00:50 +02001057static void drbd_resume_al(struct drbd_conf *mdev)
1058{
1059 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1060 dev_info(DEV, "Resumed AL updates\n");
1061}
1062
/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 *
 * The requested state is first passed through sanitize_state(). Unless
 * CS_HARD is set in @flags, the result is then checked with is_valid_state()
 * and is_valid_state_transition(). On success the new state is stored in
 * mdev->state, the immediate (non-sleeping) consequences are handled here,
 * and a work item is queued so that after_state_ch() runs later via
 * w_after_state_ch().
 *
 * Returns SS_NOTHING_TO_DO if the sanitized new state equals the current
 * state, a value below SS_SUCCESS if a validity check refused the change,
 * and otherwise the (non-error) result of the checks, which is SS_SUCCESS
 * when CS_HARD skipped them.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	const char *warn_sync_abort = NULL;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	/* set by sanitize_state() if it aborted a resync / online verify */
	if (warn_sync_abort)
		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);

	/* Log one line listing every state field that changes.
	 * NOTE(review): the sprintf() calls are not bounds checked; pb[300]
	 * has to be large enough for all fields changing at once with the
	 * longest names returned by the drbd_*_str() helpers -- confirm
	 * whenever such a name or a field is added. */
	{
	char *pbp, pb[300];
	pbp = pb;
	*pbp = 0;
	if (ns.role != os.role)
		pbp += sprintf(pbp, "role( %s -> %s ) ",
			       drbd_role_str(os.role),
			       drbd_role_str(ns.role));
	if (ns.peer != os.peer)
		pbp += sprintf(pbp, "peer( %s -> %s ) ",
			       drbd_role_str(os.peer),
			       drbd_role_str(ns.peer));
	if (ns.conn != os.conn)
		pbp += sprintf(pbp, "conn( %s -> %s ) ",
			       drbd_conn_str(os.conn),
			       drbd_conn_str(ns.conn));
	if (ns.disk != os.disk)
		pbp += sprintf(pbp, "disk( %s -> %s ) ",
			       drbd_disk_str(os.disk),
			       drbd_disk_str(ns.disk));
	if (ns.pdsk != os.pdsk)
		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
			       drbd_disk_str(os.pdsk),
			       drbd_disk_str(ns.pdsk));
	if (is_susp(ns) != is_susp(os))
		pbp += sprintf(pbp, "susp( %d -> %d ) ",
			       is_susp(os),
			       is_susp(ns));
	if (ns.aftr_isp != os.aftr_isp)
		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
			       os.aftr_isp,
			       ns.aftr_isp);
	if (ns.peer_isp != os.peer_isp)
		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
			       os.peer_isp,
			       ns.peer_isp);
	if (ns.user_isp != os.user_isp)
		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
			       os.user_isp,
			       ns.user_isp);
	dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	/* From here on the new state is in effect. */
	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	/* paused sync -> running sync: account the time spent paused */
	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	/* running sync -> paused sync: remember when the pause started */
	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	/* An online verify run starts: reset position and statistics */
	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	/* Recompute the state-derived meta data flags from the new state
	 * and mark the meta data dirty if any of them changed. */
	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	/* Queue the part of the state change that may sleep.
	 * GFP_ATOMIC, because the caller holds req_lock (see above). */
	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		/* NOTE(review): on allocation failure no work is queued, so
		 * after_state_ch() never runs for this state change: @done is
		 * not completed from here, and the extra local_cnt reference
		 * taken above for -> D_FAILED / -> D_DISKLESS is not dropped.
		 * We still return rv (success). Presumably a caller waiting
		 * on @done would block -- verify against the callers. */
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}
1298
1299static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1300{
1301 struct after_state_chg_work *ascw =
1302 container_of(w, struct after_state_chg_work, w);
1303 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1304 if (ascw->flags & CS_WAIT_COMPLETE) {
1305 D_ASSERT(ascw->done != NULL);
1306 complete(ascw->done);
1307 }
1308 kfree(ascw);
1309
1310 return 1;
1311}
1312
1313static void abw_start_sync(struct drbd_conf *mdev, int rv)
1314{
1315 if (rv) {
1316 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1317 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1318 return;
1319 }
1320
1321 switch (mdev->state.conn) {
1322 case C_STARTING_SYNC_T:
1323 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1324 break;
1325 case C_STARTING_SYNC_S:
1326 drbd_start_resync(mdev, C_SYNC_SOURCE);
1327 break;
1328 }
1329}
1330
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001331int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1332 int (*io_fn)(struct drbd_conf *),
1333 char *why, enum bm_flag flags)
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001334{
1335 int rv;
1336
1337 D_ASSERT(current == mdev->worker.task);
1338
1339 /* open coded non-blocking drbd_suspend_io(mdev); */
1340 set_bit(SUSPEND_IO, &mdev->flags);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001341
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001342 drbd_bm_lock(mdev, why, flags);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001343 rv = io_fn(mdev);
1344 drbd_bm_unlock(mdev);
1345
1346 drbd_resume_io(mdev);
1347
1348 return rv;
1349}
1350
/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 *
 * Called from w_after_state_ch(), i.e. from a work item queued by
 * __drbd_set_state(). @os and @ns are the states recorded when that work
 * was queued; where the state right now matters, mdev->state is checked
 * in addition.
 *
 * For transitions to D_FAILED and to D_DISKLESS this function drops the
 * extra local_cnt reference that __drbd_set_state() took.
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	/* nsm: all bits set; the susp_* bits we want to drop get cleared
	 * below, then it is ANDed with the current state to request
	 * "current state, minus those bits". */
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	/* NOTE(review): fp is assigned here but not read anywhere else
	 * in this function. */
	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	/* Call the helper only when this condition newly became true */
	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
			what = resend;

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = restart_frozen_disk_io;

		if (what != nothing)
			nsm.susp_nod = 0;
	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	/* Restart the transfer log with the chosen event and, under the
	 * same lock, clear the susp_* bits selected above. */
	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}

	/* Became sync source.  With protocol >= 96, we still need to send out
	 * the sync uuid now. Need to do that before any drbd_send_state, or
	 * the other side may go "paused sync" before receiving the sync uuids,
	 * which is unexpected. */
	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
		drbd_gen_and_send_sync_uuid(mdev);
		put_ldev(mdev);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	/* No point in queuing send_bitmap if we don't have a connection
	 * anymore, so check also the _current_ state, not only the new state
	 * at the time this work was queued. */
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
	    mdev->state.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
				"send_bitmap (WFBitMapS)",
				BM_LOCKED_TEST_ALLOWED);

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					/* IO is suspended right now: only flag
					 * that a new current UUID is wanted */
					set_bit(NEW_CUR_UUID, &mdev->flags);
				} else {
					drbd_uuid_new_current(mdev);
					drbd_send_uuids(mdev);
				}
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
			drbd_uuid_new_current(mdev);
			drbd_send_uuids(mdev);
		}

		/* D_DISKLESS Peer becomes secondary */
		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
			/* We may still be Primary ourselves.
			 * No harm done if the bitmap still changes,
			 * redirtied pages will follow later. */
			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
		put_ldev(mdev);
	}

	/* Write out all changed bits on demote.
	 * Though, no need to do that just yet
	 * if there is a resync going on still */
	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
		mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
		/* No changes to the bitmap expected this time, so assert that,
		 * even though no harm was done if it did change. */
		drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
				"demote", BM_LOCKED_TEST_ALLOWED);
		put_ldev(mdev);
	}

	/* Last part of the attaching process ... */
	if (ns.conn >= C_CONNECTED &&
	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}

	/* We want to pause/continue resync, tell peer. */
	if (ns.conn >= C_CONNECTED &&
	     ((os.aftr_isp != ns.aftr_isp) ||
	      (os.user_isp != ns.user_isp)))
		drbd_send_state(mdev);

	/* In case one of the isp bits got set, suspend other devices. */
	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
		suspend_other_sg(mdev);

	/* Make sure the peer gets informed about eventual state
	   changes (ISP bits) while we were in WFReportParams. */
	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
		drbd_send_state(mdev);

	/* Tell the peer when we enter C_AHEAD */
	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
		drbd_send_state(mdev);

	/* We are in the progress to start a full sync... */
	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
		/* no other bitmap changes expected during this phase */
		drbd_queue_bitmap_io(mdev,
			&drbd_bmio_set_n_write, &abw_start_sync,
			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);

	/* We are invalidating our self... */
	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
		/* other bitmap operation expected during this phase */
		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
			"set_n_write from invalidate", BM_LOCKED_MASK);

	/* first half of local IO error, failure to attach,
	 * or administrative detach */
	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
		enum drbd_io_error_p eh;
		int was_io_error;
		/* corresponding get_ldev was in __drbd_set_state, to serialize
		 * our cleanup here with the transition to D_DISKLESS,
		 * so it is safe to dereference ldev here. */
		eh = mdev->ldev->dc.on_io_error;
		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);

		/* current state still has to be D_FAILED,
		 * there is only one way out: to D_DISKLESS,
		 * and that may only happen after our put_ldev below. */
		if (mdev->state.disk != D_FAILED)
			dev_err(DEV,
				"ASSERT FAILED: disk is %s during detach\n",
				drbd_disk_str(mdev->state.disk));

		/* a non-zero return of drbd_send_state() is treated as success */
		if (drbd_send_state(mdev))
			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
		else
			dev_err(DEV, "Sending state for detaching disk failed\n");

		drbd_rs_cancel_all(mdev);

		/* In case we want to get something to stable storage still,
		 * this may be the last chance.
		 * Following put_ldev may transition to D_DISKLESS. */
		drbd_md_sync(mdev);
		put_ldev(mdev);

		if (was_io_error && eh == EP_CALL_HELPER)
			drbd_khelper(mdev, "local-io-error");
	}

	/* second half of local IO error, failure to attach,
	 * or administrative detach,
	 * after local_cnt references have reached zero again */
	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
		/* We must still be diskless,
		 * re-attach has to be serialized with this! */
		if (mdev->state.disk != D_DISKLESS)
			dev_err(DEV,
				"ASSERT FAILED: disk is %s while going diskless\n",
				drbd_disk_str(mdev->state.disk));

		mdev->rs_total = 0;
		mdev->rs_failed = 0;
		atomic_set(&mdev->rs_pending_cnt, 0);

		if (drbd_send_state(mdev))
			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
		/* corresponding get_ldev in __drbd_set_state
		 * this may finally trigger drbd_ldev_destroy. */
		put_ldev(mdev);
	}

	/* Notify peer that I had a local IO error, and did not detach.. */
	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
		drbd_send_state(mdev);

	/* Disks got bigger while they were detached */
	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
		if (ns.conn == C_CONNECTED)
			resync_after_online_grow(mdev);
	}

	/* A resync finished or aborted, wake paused devices... */
	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
	    (os.peer_isp && !ns.peer_isp) ||
	    (os.user_isp && !ns.user_isp))
		resume_next_sg(mdev);

	/* sync target done with resync.  Explicitly notify peer, even though
	 * it should (at least for non-empty resyncs) already know itself. */
	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
		drbd_send_state(mdev);

	/* This triggers bitmap writeout of potentially still unwritten pages
	 * if the resync finished cleanly, or aborted because of peer disk
	 * failure, or because of connection loss.
	 * For resync aborted because of local disk failure, we cannot do
	 * any bitmap writeout anymore.
	 * No harm done if some bits change during this phase.
	 */
	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
			"write from resync_finished", BM_LOCKED_SET_ALLOWED);
		put_ldev(mdev);
	}

	/* free tl_hash if we got thawed and are C_STANDALONE */
	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
		drbd_free_tl_hash(mdev);

	/* Upon network connection, we need to start the receiver */
	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
		drbd_thread_start(&mdev->receiver);

	/* Terminate worker thread if we are unconfigured - it will be
	   restarted as needed... */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY) {
		if (os.aftr_isp != ns.aftr_isp)
			resume_next_sg(mdev);
		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
		if (test_bit(DEVICE_DYING, &mdev->flags))
			drbd_thread_stop_nowait(&mdev->worker);
	}

	drbd_md_sync(mdev);
}
1658
1659
1660static int drbd_thread_setup(void *arg)
1661{
1662 struct drbd_thread *thi = (struct drbd_thread *) arg;
1663 struct drbd_conf *mdev = thi->mdev;
1664 unsigned long flags;
1665 int retval;
1666
1667restart:
1668 retval = thi->function(thi);
1669
1670 spin_lock_irqsave(&thi->t_lock, flags);
1671
1672 /* if the receiver has been "Exiting", the last thing it did
1673 * was set the conn state to "StandAlone",
1674 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1675 * and receiver thread will be "started".
1676 * drbd_thread_start needs to set "Restarting" in that case.
1677 * t_state check and assignment needs to be within the same spinlock,
1678 * so either thread_start sees Exiting, and can remap to Restarting,
1679 * or thread_start see None, and can proceed as normal.
1680 */
1681
1682 if (thi->t_state == Restarting) {
1683 dev_info(DEV, "Restarting %s\n", current->comm);
1684 thi->t_state = Running;
1685 spin_unlock_irqrestore(&thi->t_lock, flags);
1686 goto restart;
1687 }
1688
1689 thi->task = NULL;
1690 thi->t_state = None;
1691 smp_mb();
1692 complete(&thi->stop);
1693 spin_unlock_irqrestore(&thi->t_lock, flags);
1694
1695 dev_info(DEV, "Terminating %s\n", current->comm);
1696
1697 /* Release mod reference taken when thread was started */
1698 module_put(THIS_MODULE);
1699 return retval;
1700}
1701
1702static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1703 int (*func) (struct drbd_thread *))
1704{
1705 spin_lock_init(&thi->t_lock);
1706 thi->task = NULL;
1707 thi->t_state = None;
1708 thi->function = func;
1709 thi->mdev = mdev;
1710}
1711
1712int drbd_thread_start(struct drbd_thread *thi)
1713{
1714 struct drbd_conf *mdev = thi->mdev;
1715 struct task_struct *nt;
1716 unsigned long flags;
1717
1718 const char *me =
1719 thi == &mdev->receiver ? "receiver" :
1720 thi == &mdev->asender ? "asender" :
1721 thi == &mdev->worker ? "worker" : "NONSENSE";
1722
1723 /* is used from state engine doing drbd_thread_stop_nowait,
1724 * while holding the req lock irqsave */
1725 spin_lock_irqsave(&thi->t_lock, flags);
1726
1727 switch (thi->t_state) {
1728 case None:
1729 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1730 me, current->comm, current->pid);
1731
1732 /* Get ref on module for thread - this is released when thread exits */
1733 if (!try_module_get(THIS_MODULE)) {
1734 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1735 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001736 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001737 }
1738
1739 init_completion(&thi->stop);
1740 D_ASSERT(thi->task == NULL);
1741 thi->reset_cpu_mask = 1;
1742 thi->t_state = Running;
1743 spin_unlock_irqrestore(&thi->t_lock, flags);
1744 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1745
1746 nt = kthread_create(drbd_thread_setup, (void *) thi,
1747 "drbd%d_%s", mdev_to_minor(mdev), me);
1748
1749 if (IS_ERR(nt)) {
1750 dev_err(DEV, "Couldn't start thread\n");
1751
1752 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001753 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001754 }
1755 spin_lock_irqsave(&thi->t_lock, flags);
1756 thi->task = nt;
1757 thi->t_state = Running;
1758 spin_unlock_irqrestore(&thi->t_lock, flags);
1759 wake_up_process(nt);
1760 break;
1761 case Exiting:
1762 thi->t_state = Restarting;
1763 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1764 me, current->comm, current->pid);
1765 /* fall through */
1766 case Running:
1767 case Restarting:
1768 default:
1769 spin_unlock_irqrestore(&thi->t_lock, flags);
1770 break;
1771 }
1772
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001773 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001774}
1775
1776
1777void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1778{
1779 unsigned long flags;
1780
1781 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1782
1783 /* may be called from state engine, holding the req lock irqsave */
1784 spin_lock_irqsave(&thi->t_lock, flags);
1785
1786 if (thi->t_state == None) {
1787 spin_unlock_irqrestore(&thi->t_lock, flags);
1788 if (restart)
1789 drbd_thread_start(thi);
1790 return;
1791 }
1792
1793 if (thi->t_state != ns) {
1794 if (thi->task == NULL) {
1795 spin_unlock_irqrestore(&thi->t_lock, flags);
1796 return;
1797 }
1798
1799 thi->t_state = ns;
1800 smp_mb();
1801 init_completion(&thi->stop);
1802 if (thi->task != current)
1803 force_sig(DRBD_SIGKILL, thi->task);
1804
1805 }
1806
1807 spin_unlock_irqrestore(&thi->t_lock, flags);
1808
1809 if (wait)
1810 wait_for_completion(&thi->stop);
1811}
1812
1813#ifdef CONFIG_SMP
1814/**
1815 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1816 * @mdev: DRBD device.
1817 *
1818 * Forces all threads of a device onto the same CPU. This is beneficial for
1819 * DRBD's performance. May be overwritten by user's configuration.
1820 */
1821void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1822{
1823 int ord, cpu;
1824
1825 /* user override. */
1826 if (cpumask_weight(mdev->cpu_mask))
1827 return;
1828
1829 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1830 for_each_online_cpu(cpu) {
1831 if (ord-- == 0) {
1832 cpumask_set_cpu(cpu, mdev->cpu_mask);
1833 return;
1834 }
1835 }
1836 /* should not be reached */
1837 cpumask_setall(mdev->cpu_mask);
1838}
1839
1840/**
1841 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1842 * @mdev: DRBD device.
1843 *
1844 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1845 * prematurely.
1846 */
1847void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1848{
1849 struct task_struct *p = current;
1850 struct drbd_thread *thi =
1851 p == mdev->asender.task ? &mdev->asender :
1852 p == mdev->receiver.task ? &mdev->receiver :
1853 p == mdev->worker.task ? &mdev->worker :
1854 NULL;
1855 ERR_IF(thi == NULL)
1856 return;
1857 if (!thi->reset_cpu_mask)
1858 return;
1859 thi->reset_cpu_mask = 0;
1860 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1861}
1862#endif
1863
1864/* the appropriate socket mutex must be held already */
1865int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001866 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001867 size_t size, unsigned msg_flags)
1868{
1869 int sent, ok;
1870
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001871 ERR_IF(!h) return false;
1872 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001873
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01001874 h->magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001875 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001876 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001877
Philipp Reisnerb411b362009-09-25 16:07:19 -07001878 sent = drbd_send(mdev, sock, h, size, msg_flags);
1879
1880 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001881 if (!ok && !signal_pending(current))
1882 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001883 cmdname(cmd), (int)size, sent);
1884 return ok;
1885}
1886
1887/* don't pass the socket. we may only look at it
1888 * when we hold the appropriate socket mutex.
1889 */
1890int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001891 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001892{
1893 int ok = 0;
1894 struct socket *sock;
1895
1896 if (use_data_socket) {
1897 mutex_lock(&mdev->data.mutex);
1898 sock = mdev->data.socket;
1899 } else {
1900 mutex_lock(&mdev->meta.mutex);
1901 sock = mdev->meta.socket;
1902 }
1903
1904 /* drbd_disconnect() could have called drbd_free_sock()
1905 * while we were waiting in down()... */
1906 if (likely(sock != NULL))
1907 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1908
1909 if (use_data_socket)
1910 mutex_unlock(&mdev->data.mutex);
1911 else
1912 mutex_unlock(&mdev->meta.mutex);
1913 return ok;
1914}
1915
1916int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1917 size_t size)
1918{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001919 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001920 int ok;
1921
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01001922 h.magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001923 h.command = cpu_to_be16(cmd);
1924 h.length = cpu_to_be16(size);
1925
1926 if (!drbd_get_data_sock(mdev))
1927 return 0;
1928
Philipp Reisnerb411b362009-09-25 16:07:19 -07001929 ok = (sizeof(h) ==
1930 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1931 ok = ok && (size ==
1932 drbd_send(mdev, mdev->data.socket, data, size, 0));
1933
1934 drbd_put_data_sock(mdev);
1935
1936 return ok;
1937}
1938
1939int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1940{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001941 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001942 struct socket *sock;
1943 int size, rv;
1944 const int apv = mdev->agreed_pro_version;
1945
1946 size = apv <= 87 ? sizeof(struct p_rs_param)
1947 : apv == 88 ? sizeof(struct p_rs_param)
1948 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001949 : apv <= 94 ? sizeof(struct p_rs_param_89)
1950 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001951
1952 /* used from admin command context and receiver/worker context.
1953 * to avoid kmalloc, grab the socket right here,
1954 * then use the pre-allocated sbuf there */
1955 mutex_lock(&mdev->data.mutex);
1956 sock = mdev->data.socket;
1957
1958 if (likely(sock != NULL)) {
1959 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1960
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001961 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001962
1963 /* initialize verify_alg and csums_alg */
1964 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1965
1966 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001967 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1968 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1969 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1970 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001971
1972 if (apv >= 88)
1973 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1974 if (apv >= 89)
1975 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1976
1977 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1978 } else
1979 rv = 0; /* not ok */
1980
1981 mutex_unlock(&mdev->data.mutex);
1982
1983 return rv;
1984}
1985
1986int drbd_send_protocol(struct drbd_conf *mdev)
1987{
1988 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001989 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001990
1991 size = sizeof(struct p_protocol);
1992
1993 if (mdev->agreed_pro_version >= 87)
1994 size += strlen(mdev->net_conf->integrity_alg) + 1;
1995
1996 /* we must not recurse into our own queue,
1997 * as that is blocked during handshake */
1998 p = kmalloc(size, GFP_NOIO);
1999 if (p == NULL)
2000 return 0;
2001
2002 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2003 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2004 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2005 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002006 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2007
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002008 cf = 0;
2009 if (mdev->net_conf->want_lose)
2010 cf |= CF_WANT_LOSE;
2011 if (mdev->net_conf->dry_run) {
2012 if (mdev->agreed_pro_version >= 92)
2013 cf |= CF_DRY_RUN;
2014 else {
2015 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02002016 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002017 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002018 }
2019 }
2020 p->conn_flags = cpu_to_be32(cf);
2021
Philipp Reisnerb411b362009-09-25 16:07:19 -07002022 if (mdev->agreed_pro_version >= 87)
2023 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2024
2025 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002026 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002027 kfree(p);
2028 return rv;
2029}
2030
2031int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2032{
2033 struct p_uuids p;
2034 int i;
2035
2036 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2037 return 1;
2038
2039 for (i = UI_CURRENT; i < UI_SIZE; i++)
2040 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2041
2042 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2043 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2044 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2045 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2046 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2047 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2048
2049 put_ldev(mdev);
2050
2051 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002052 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002053}
2054
2055int drbd_send_uuids(struct drbd_conf *mdev)
2056{
2057 return _drbd_send_uuids(mdev, 0);
2058}
2059
2060int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2061{
2062 return _drbd_send_uuids(mdev, 8);
2063}
2064
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002065void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2066{
2067 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2068 u64 *uuid = mdev->ldev->md.uuid;
2069 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2070 text,
2071 (unsigned long long)uuid[UI_CURRENT],
2072 (unsigned long long)uuid[UI_BITMAP],
2073 (unsigned long long)uuid[UI_HISTORY_START],
2074 (unsigned long long)uuid[UI_HISTORY_END]);
2075 put_ldev(mdev);
2076 } else {
2077 dev_info(DEV, "%s effective data uuid: %016llX\n",
2078 text,
2079 (unsigned long long)mdev->ed_uuid);
2080 }
2081}
2082
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002083int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002084{
2085 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002086 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002087
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002088 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2089
Philipp Reisner4a23f262011-01-11 17:42:17 +01002090 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002091 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002092 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002093 drbd_md_sync(mdev);
2094 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002095
2096 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002097 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002098}
2099
Philipp Reisnere89b5912010-03-24 17:11:33 +01002100int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002101{
2102 struct p_sizes p;
2103 sector_t d_size, u_size;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002104 int q_order_type, max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002105 int ok;
2106
2107 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2108 D_ASSERT(mdev->ldev->backing_bdev);
2109 d_size = drbd_get_max_capacity(mdev->ldev);
2110 u_size = mdev->ldev->dc.disk_size;
2111 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002112 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2113 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002114 put_ldev(mdev);
2115 } else {
2116 d_size = 0;
2117 u_size = 0;
2118 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002119 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002120 }
2121
2122 p.d_size = cpu_to_be64(d_size);
2123 p.u_size = cpu_to_be64(u_size);
2124 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002125 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002126 p.queue_order_type = cpu_to_be16(q_order_type);
2127 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002128
2129 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002130 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002131 return ok;
2132}
2133
2134/**
2135 * drbd_send_state() - Sends the drbd state to the peer
2136 * @mdev: DRBD device.
2137 */
2138int drbd_send_state(struct drbd_conf *mdev)
2139{
2140 struct socket *sock;
2141 struct p_state p;
2142 int ok = 0;
2143
2144 /* Grab state lock so we wont send state if we're in the middle
2145 * of a cluster wide state change on another thread */
2146 drbd_state_lock(mdev);
2147
2148 mutex_lock(&mdev->data.mutex);
2149
2150 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2151 sock = mdev->data.socket;
2152
2153 if (likely(sock != NULL)) {
2154 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002155 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002156 }
2157
2158 mutex_unlock(&mdev->data.mutex);
2159
2160 drbd_state_unlock(mdev);
2161 return ok;
2162}
2163
2164int drbd_send_state_req(struct drbd_conf *mdev,
2165 union drbd_state mask, union drbd_state val)
2166{
2167 struct p_req_state p;
2168
2169 p.mask = cpu_to_be32(mask.i);
2170 p.val = cpu_to_be32(val.i);
2171
2172 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002173 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002174}
2175
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002176int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002177{
2178 struct p_req_state_reply p;
2179
2180 p.retcode = cpu_to_be32(retcode);
2181
2182 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002183 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002184}
2185
2186int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2187 struct p_compressed_bm *p,
2188 struct bm_xfer_ctx *c)
2189{
2190 struct bitstream bs;
2191 unsigned long plain_bits;
2192 unsigned long tmp;
2193 unsigned long rl;
2194 unsigned len;
2195 unsigned toggle;
2196 int bits;
2197
2198 /* may we use this feature? */
2199 if ((mdev->sync_conf.use_rle == 0) ||
2200 (mdev->agreed_pro_version < 90))
2201 return 0;
2202
2203 if (c->bit_offset >= c->bm_bits)
2204 return 0; /* nothing to do. */
2205
2206 /* use at most thus many bytes */
2207 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2208 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2209 /* plain bits covered in this code string */
2210 plain_bits = 0;
2211
2212 /* p->encoding & 0x80 stores whether the first run length is set.
2213 * bit offset is implicit.
2214 * start with toggle == 2 to be able to tell the first iteration */
2215 toggle = 2;
2216
2217 /* see how much plain bits we can stuff into one packet
2218 * using RLE and VLI. */
2219 do {
2220 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2221 : _drbd_bm_find_next(mdev, c->bit_offset);
2222 if (tmp == -1UL)
2223 tmp = c->bm_bits;
2224 rl = tmp - c->bit_offset;
2225
2226 if (toggle == 2) { /* first iteration */
2227 if (rl == 0) {
2228 /* the first checked bit was set,
2229 * store start value, */
2230 DCBP_set_start(p, 1);
2231 /* but skip encoding of zero run length */
2232 toggle = !toggle;
2233 continue;
2234 }
2235 DCBP_set_start(p, 0);
2236 }
2237
2238 /* paranoia: catch zero runlength.
2239 * can only happen if bitmap is modified while we scan it. */
2240 if (rl == 0) {
2241 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2242 "t:%u bo:%lu\n", toggle, c->bit_offset);
2243 return -1;
2244 }
2245
2246 bits = vli_encode_bits(&bs, rl);
2247 if (bits == -ENOBUFS) /* buffer full */
2248 break;
2249 if (bits <= 0) {
2250 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2251 return 0;
2252 }
2253
2254 toggle = !toggle;
2255 plain_bits += rl;
2256 c->bit_offset = tmp;
2257 } while (c->bit_offset < c->bm_bits);
2258
2259 len = bs.cur.b - p->code + !!bs.cur.bit;
2260
2261 if (plain_bits < (len << 3)) {
2262 /* incompressible with this method.
2263 * we need to rewind both word and bit position. */
2264 c->bit_offset -= plain_bits;
2265 bm_xfer_ctx_bit_to_word_offset(c);
2266 c->bit_offset = c->word_offset * BITS_PER_LONG;
2267 return 0;
2268 }
2269
2270 /* RLE + VLI was able to compress it just fine.
2271 * update c->word_offset. */
2272 bm_xfer_ctx_bit_to_word_offset(c);
2273
2274 /* store pad_bits */
2275 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2276
2277 return len;
2278}
2279
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002280/**
2281 * send_bitmap_rle_or_plain
2282 *
2283 * Return 0 when done, 1 when another iteration is needed, and a negative error
2284 * code upon failure.
2285 */
2286static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002287send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002288 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002289{
2290 struct p_compressed_bm *p = (void*)h;
2291 unsigned long num_words;
2292 int len;
2293 int ok;
2294
2295 len = fill_bitmap_rle_bits(mdev, p, c);
2296
2297 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002298 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002299
2300 if (len) {
2301 DCBP_set_code(p, RLE_VLI_Bits);
2302 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2303 sizeof(*p) + len, 0);
2304
2305 c->packets[0]++;
2306 c->bytes[0] += sizeof(*p) + len;
2307
2308 if (c->bit_offset >= c->bm_bits)
2309 len = 0; /* DONE */
2310 } else {
2311 /* was not compressible.
2312 * send a buffer full of plain text bits instead. */
2313 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2314 len = num_words * sizeof(long);
2315 if (len)
2316 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2317 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002318 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002319 c->word_offset += num_words;
2320 c->bit_offset = c->word_offset * BITS_PER_LONG;
2321
2322 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002323 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002324
2325 if (c->bit_offset > c->bm_bits)
2326 c->bit_offset = c->bm_bits;
2327 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002328 if (ok) {
2329 if (len == 0) {
2330 INFO_bm_xfer_stats(mdev, "send", c);
2331 return 0;
2332 } else
2333 return 1;
2334 }
2335 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002336}
2337
2338/* See the comment at receive_bitmap() */
2339int _drbd_send_bitmap(struct drbd_conf *mdev)
2340{
2341 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002342 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002343 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002344
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002345 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002346
2347 /* maybe we should use some per thread scratch page,
2348 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002349 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002350 if (!p) {
2351 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002352 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002353 }
2354
2355 if (get_ldev(mdev)) {
2356 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2357 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2358 drbd_bm_set_all(mdev);
2359 if (drbd_bm_write(mdev)) {
2360 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2361 * but otherwise process as per normal - need to tell other
2362 * side that a full resync is required! */
2363 dev_err(DEV, "Failed to write bitmap to disk!\n");
2364 } else {
2365 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2366 drbd_md_sync(mdev);
2367 }
2368 }
2369 put_ldev(mdev);
2370 }
2371
2372 c = (struct bm_xfer_ctx) {
2373 .bm_bits = drbd_bm_bits(mdev),
2374 .bm_words = drbd_bm_words(mdev),
2375 };
2376
2377 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002378 err = send_bitmap_rle_or_plain(mdev, p, &c);
2379 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002380
2381 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002382 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002383}
2384
/*
 * Send the bitmap while holding the data socket.
 *
 * Returns -1 if the data socket could not be obtained, 0 if the bitmap was
 * sent, and 1 if _drbd_send_bitmap() reported failure.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int failed = -1;

	if (drbd_get_data_sock(mdev)) {
		failed = !_drbd_send_bitmap(mdev);
		drbd_put_data_sock(mdev);
	}
	return failed;
}
2395
2396int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2397{
2398 int ok;
2399 struct p_barrier_ack p;
2400
2401 p.barrier = barrier_nr;
2402 p.set_size = cpu_to_be32(set_size);
2403
2404 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002405 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002406 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002407 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002408 return ok;
2409}
2410
2411/**
2412 * _drbd_send_ack() - Sends an ack packet
2413 * @mdev: DRBD device.
2414 * @cmd: Packet command code.
2415 * @sector: sector, needs to be in big endian byte order
2416 * @blksize: size in byte, needs to be in big endian byte order
2417 * @block_id: Id, big endian byte order
2418 */
2419static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2420 u64 sector,
2421 u32 blksize,
2422 u64 block_id)
2423{
2424 int ok;
2425 struct p_block_ack p;
2426
2427 p.sector = sector;
2428 p.block_id = block_id;
2429 p.blksize = blksize;
2430 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2431
2432 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002433 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002434 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002435 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002436 return ok;
2437}
2438
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002439/* dp->sector and dp->block_id already/still in network byte order,
2440 * data_size is payload size according to dp->head,
2441 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002442int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002443 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002444{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002445 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2446 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002447 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2448 dp->block_id);
2449}
2450
2451int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2452 struct p_block_req *rp)
2453{
2454 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2455}
2456
2457/**
2458 * drbd_send_ack() - Sends an ack packet
2459 * @mdev: DRBD device.
2460 * @cmd: Packet command code.
2461 * @e: Epoch entry.
2462 */
2463int drbd_send_ack(struct drbd_conf *mdev,
2464 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2465{
2466 return _drbd_send_ack(mdev, cmd,
2467 cpu_to_be64(e->sector),
2468 cpu_to_be32(e->size),
2469 e->block_id);
2470}
2471
2472/* This function misuses the block_id field to signal if the blocks
2473 * are is sync or not. */
2474int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2475 sector_t sector, int blksize, u64 block_id)
2476{
2477 return _drbd_send_ack(mdev, cmd,
2478 cpu_to_be64(sector),
2479 cpu_to_be32(blksize),
2480 cpu_to_be64(block_id));
2481}
2482
2483int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2484 sector_t sector, int size, u64 block_id)
2485{
2486 int ok;
2487 struct p_block_req p;
2488
2489 p.sector = cpu_to_be64(sector);
2490 p.block_id = block_id;
2491 p.blksize = cpu_to_be32(size);
2492
2493 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002494 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002495 return ok;
2496}
2497
2498int drbd_send_drequest_csum(struct drbd_conf *mdev,
2499 sector_t sector, int size,
2500 void *digest, int digest_size,
2501 enum drbd_packets cmd)
2502{
2503 int ok;
2504 struct p_block_req p;
2505
2506 p.sector = cpu_to_be64(sector);
Andreas Gruenbacher9a8e7752011-01-11 14:04:09 +01002507 p.block_id = ID_SYNCER /* unused */;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002508 p.blksize = cpu_to_be32(size);
2509
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002510 p.head.magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002511 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002512 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002513
2514 mutex_lock(&mdev->data.mutex);
2515
2516 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2517 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2518
2519 mutex_unlock(&mdev->data.mutex);
2520
2521 return ok;
2522}
2523
2524int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2525{
2526 int ok;
2527 struct p_block_req p;
2528
2529 p.sector = cpu_to_be64(sector);
Andreas Gruenbacher9a8e7752011-01-11 14:04:09 +01002530 p.block_id = ID_SYNCER /* unused */;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002531 p.blksize = cpu_to_be32(size);
2532
2533 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002534 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002535 return ok;
2536}
2537
2538/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002539 * returns false if we should retry,
2540 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002541 */
2542static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2543{
2544 int drop_it;
2545 /* long elapsed = (long)(jiffies - mdev->last_received); */
2546
2547 drop_it = mdev->meta.socket == sock
2548 || !mdev->asender.task
2549 || get_t_state(&mdev->asender) != Running
2550 || mdev->state.conn < C_CONNECTED;
2551
2552 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002553 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002554
2555 drop_it = !--mdev->ko_count;
2556 if (!drop_it) {
2557 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2558 current->comm, current->pid, mdev->ko_count);
2559 request_ping(mdev);
2560 }
2561
2562 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2563}
2564
2565/* The idea of sendpage seems to be to put some kind of reference
2566 * to the page into the skb, and to hand it over to the NIC. In
2567 * this process get_page() gets called.
2568 *
2569 * As soon as the page was really sent over the network put_page()
2570 * gets called by some part of the network layer. [ NIC driver? ]
2571 *
2572 * [ get_page() / put_page() increment/decrement the count. If count
2573 * reaches 0 the page will be freed. ]
2574 *
2575 * This works nicely with pages from FSs.
2576 * But this means that in protocol A we might signal IO completion too early!
2577 *
2578 * In order not to corrupt data during a resync we must make sure
2579 * that we do not reuse our own buffer pages (EEs) to early, therefore
2580 * we have the net_ee list.
2581 *
2582 * XFS seems to have problems, still, it submits pages with page_count == 0!
2583 * As a workaround, we disable sendpage on pages
2584 * with page_count == 0 or PageSlab.
2585 */
2586static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002587 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002588{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002589 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002590 kunmap(page);
2591 if (sent == size)
2592 mdev->send_cnt += size>>9;
2593 return sent == size;
2594}
2595
2596static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002597 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002598{
2599 mm_segment_t oldfs = get_fs();
2600 int sent, ok;
2601 int len = size;
2602
2603 /* e.g. XFS meta- & log-data is in slab pages, which have a
2604 * page_count of 0 and/or have PageSlab() set.
2605 * we cannot use send_page for those, as that does get_page();
2606 * put_page(); and would cause either a VM_BUG directly, or
2607 * __page_cache_release a page that would actually still be referenced
2608 * by someone, leading to some obscure delayed Oops somewhere else. */
2609 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002610 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002611
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002612 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002613 drbd_update_congested(mdev);
2614 set_fs(KERNEL_DS);
2615 do {
2616 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2617 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002618 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002619 if (sent == -EAGAIN) {
2620 if (we_should_drop_the_connection(mdev,
2621 mdev->data.socket))
2622 break;
2623 else
2624 continue;
2625 }
2626 if (sent <= 0) {
2627 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2628 __func__, (int)size, len, sent);
2629 break;
2630 }
2631 len -= sent;
2632 offset += sent;
2633 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2634 set_fs(oldfs);
2635 clear_bit(NET_CONGESTED, &mdev->flags);
2636
2637 ok = (len == 0);
2638 if (likely(ok))
2639 mdev->send_cnt += size>>9;
2640 return ok;
2641}
2642
2643static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2644{
2645 struct bio_vec *bvec;
2646 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002647 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002648 __bio_for_each_segment(bvec, bio, i, 0) {
2649 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002650 bvec->bv_offset, bvec->bv_len,
2651 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002652 return 0;
2653 }
2654 return 1;
2655}
2656
2657static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2658{
2659 struct bio_vec *bvec;
2660 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002661 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002662 __bio_for_each_segment(bvec, bio, i, 0) {
2663 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002664 bvec->bv_offset, bvec->bv_len,
2665 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002666 return 0;
2667 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002668 return 1;
2669}
2670
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002671static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2672{
2673 struct page *page = e->pages;
2674 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002675 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002676 page_chain_for_each(page) {
2677 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002678 if (!_drbd_send_page(mdev, page, 0, l,
2679 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002680 return 0;
2681 len -= l;
2682 }
2683 return 1;
2684}
2685
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002686static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2687{
2688 if (mdev->agreed_pro_version >= 95)
2689 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002690 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2691 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2692 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2693 else
Jens Axboe721a9602011-03-09 11:56:30 +01002694 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002695}
2696
Philipp Reisnerb411b362009-09-25 16:07:19 -07002697/* Used to send write requests
2698 * R_PRIMARY -> Peer (P_DATA)
2699 */
2700int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2701{
2702 int ok = 1;
2703 struct p_data p;
2704 unsigned int dp_flags = 0;
2705 void *dgb;
2706 int dgs;
2707
2708 if (!drbd_get_data_sock(mdev))
2709 return 0;
2710
2711 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2712 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2713
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002714 if (req->i.size <= DRBD_MAX_SIZE_H80_PACKET) {
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002715 p.head.h80.magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002716 p.head.h80.command = cpu_to_be16(P_DATA);
2717 p.head.h80.length =
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002718 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->i.size);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002719 } else {
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002720 p.head.h95.magic = cpu_to_be16(DRBD_MAGIC_BIG);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002721 p.head.h95.command = cpu_to_be16(P_DATA);
2722 p.head.h95.length =
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002723 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->i.size);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002724 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002725
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002726 p.sector = cpu_to_be64(req->i.sector);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002727 p.block_id = (unsigned long)req;
2728 p.seq_num = cpu_to_be32(req->seq_num =
2729 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002730
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002731 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2732
Philipp Reisnerb411b362009-09-25 16:07:19 -07002733 if (mdev->state.conn >= C_SYNC_SOURCE &&
2734 mdev->state.conn <= C_PAUSED_SYNC_T)
2735 dp_flags |= DP_MAY_SET_IN_SYNC;
2736
2737 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002738 set_bit(UNPLUG_REMOTE, &mdev->flags);
2739 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002740 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002741 if (ok && dgs) {
2742 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002743 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002744 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002745 }
2746 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002747 /* For protocol A, we have to memcpy the payload into
2748 * socket buffers, as we may complete right away
2749 * as soon as we handed it over to tcp, at which point the data
2750 * pages may become invalid.
2751 *
2752 * For data-integrity enabled, we copy it as well, so we can be
2753 * sure that even if the bio pages may still be modified, it
2754 * won't change the data on the wire, thus if the digest checks
2755 * out ok after sending on this side, but does not fit on the
2756 * receiving side, we sure have detected corruption elsewhere.
2757 */
2758 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002759 ok = _drbd_send_bio(mdev, req->master_bio);
2760 else
2761 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002762
2763 /* double check digest, sometimes buffers have been modified in flight. */
2764 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002765 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002766 * currently supported in kernel crypto. */
2767 unsigned char digest[64];
2768 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2769 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2770 dev_warn(DEV,
2771 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002772 (unsigned long long)req->i.sector, req->i.size);
Lars Ellenberg470be442010-11-10 10:36:52 +01002773 }
2774 } /* else if (dgs > 64) {
2775 ... Be noisy about digest too large ...
2776 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002777 }
2778
2779 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002780
Philipp Reisnerb411b362009-09-25 16:07:19 -07002781 return ok;
2782}
2783
2784/* answer packet, used to send data back for read requests:
2785 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2786 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2787 */
2788int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2789 struct drbd_epoch_entry *e)
2790{
2791 int ok;
2792 struct p_data p;
2793 void *dgb;
2794 int dgs;
2795
2796 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2797 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2798
Philipp Reisnerd5373382010-08-23 15:18:33 +02002799 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002800 p.head.h80.magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002801 p.head.h80.command = cpu_to_be16(cmd);
2802 p.head.h80.length =
2803 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2804 } else {
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002805 p.head.h95.magic = cpu_to_be16(DRBD_MAGIC_BIG);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002806 p.head.h95.command = cpu_to_be16(cmd);
2807 p.head.h95.length =
2808 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2809 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002810
2811 p.sector = cpu_to_be64(e->sector);
2812 p.block_id = e->block_id;
2813 /* p.seq_num = 0; No sequence numbers here.. */
2814
2815 /* Only called by our kernel thread.
2816 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2817 * in response to admin command or module unload.
2818 */
2819 if (!drbd_get_data_sock(mdev))
2820 return 0;
2821
Philipp Reisner0b70a132010-08-20 13:36:10 +02002822 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002823 if (ok && dgs) {
2824 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002825 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002826 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002827 }
2828 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002829 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002830
2831 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002832
Philipp Reisnerb411b362009-09-25 16:07:19 -07002833 return ok;
2834}
2835
Philipp Reisner73a01a12010-10-27 14:33:00 +02002836int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2837{
2838 struct p_block_desc p;
2839
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002840 p.sector = cpu_to_be64(req->i.sector);
2841 p.blksize = cpu_to_be32(req->i.size);
Philipp Reisner73a01a12010-10-27 14:33:00 +02002842
2843 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2844}
2845
Philipp Reisnerb411b362009-09-25 16:07:19 -07002846/*
2847 drbd_send distinguishes two cases:
2848
2849 Packets sent via the data socket "sock"
2850 and packets sent via the meta data socket "msock"
2851
2852 sock msock
2853 -----------------+-------------------------+------------------------------
2854 timeout conf.timeout / 2 conf.timeout / 2
2855 timeout action send a ping via msock Abort communication
2856 and close all sockets
2857*/
2858
2859/*
2860 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2861 */
2862int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2863 void *buf, size_t size, unsigned msg_flags)
2864{
2865 struct kvec iov;
2866 struct msghdr msg;
2867 int rv, sent = 0;
2868
2869 if (!sock)
2870 return -1000;
2871
2872 /* THINK if (signal_pending) return ... ? */
2873
2874 iov.iov_base = buf;
2875 iov.iov_len = size;
2876
2877 msg.msg_name = NULL;
2878 msg.msg_namelen = 0;
2879 msg.msg_control = NULL;
2880 msg.msg_controllen = 0;
2881 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2882
2883 if (sock == mdev->data.socket) {
2884 mdev->ko_count = mdev->net_conf->ko_count;
2885 drbd_update_congested(mdev);
2886 }
2887 do {
2888 /* STRANGE
2889 * tcp_sendmsg does _not_ use its size parameter at all ?
2890 *
2891 * -EAGAIN on timeout, -EINTR on signal.
2892 */
2893/* THINK
2894 * do we need to block DRBD_SIG if sock == &meta.socket ??
2895 * otherwise wake_asender() might interrupt some send_*Ack !
2896 */
2897 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2898 if (rv == -EAGAIN) {
2899 if (we_should_drop_the_connection(mdev, sock))
2900 break;
2901 else
2902 continue;
2903 }
2904 D_ASSERT(rv != 0);
2905 if (rv == -EINTR) {
2906 flush_signals(current);
2907 rv = 0;
2908 }
2909 if (rv < 0)
2910 break;
2911 sent += rv;
2912 iov.iov_base += rv;
2913 iov.iov_len -= rv;
2914 } while (sent < size);
2915
2916 if (sock == mdev->data.socket)
2917 clear_bit(NET_CONGESTED, &mdev->flags);
2918
2919 if (rv <= 0) {
2920 if (rv != -EAGAIN) {
2921 dev_err(DEV, "%s_sendmsg returned %d\n",
2922 sock == mdev->meta.socket ? "msock" : "sock",
2923 rv);
2924 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2925 } else
2926 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2927 }
2928
2929 return sent;
2930}
2931
2932static int drbd_open(struct block_device *bdev, fmode_t mode)
2933{
2934 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2935 unsigned long flags;
2936 int rv = 0;
2937
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002938 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002939 spin_lock_irqsave(&mdev->req_lock, flags);
2940 /* to have a stable mdev->state.role
2941 * and no race with updating open_cnt */
2942
2943 if (mdev->state.role != R_PRIMARY) {
2944 if (mode & FMODE_WRITE)
2945 rv = -EROFS;
2946 else if (!allow_oos)
2947 rv = -EMEDIUMTYPE;
2948 }
2949
2950 if (!rv)
2951 mdev->open_cnt++;
2952 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002953 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002954
2955 return rv;
2956}
2957
2958static int drbd_release(struct gendisk *gd, fmode_t mode)
2959{
2960 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002961 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002962 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002963 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002964 return 0;
2965}
2966
Philipp Reisnerb411b362009-09-25 16:07:19 -07002967static void drbd_set_defaults(struct drbd_conf *mdev)
2968{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002969 /* This way we get a compile error when sync_conf grows,
2970 and we forgot to initialize it here */
2971 mdev->sync_conf = (struct syncer_conf) {
2972 /* .rate = */ DRBD_RATE_DEF,
2973 /* .after = */ DRBD_AFTER_DEF,
2974 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002975 /* .verify_alg = */ {}, 0,
2976 /* .cpu_mask = */ {}, 0,
2977 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002978 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002979 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2980 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2981 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2982 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002983 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2984 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002985 };
2986
2987 /* Have to use that way, because the layout differs between
2988 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002989 mdev->state = (union drbd_state) {
2990 { .role = R_SECONDARY,
2991 .peer = R_UNKNOWN,
2992 .conn = C_STANDALONE,
2993 .disk = D_DISKLESS,
2994 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002995 .susp = 0,
2996 .susp_nod = 0,
2997 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002998 } };
2999}
3000
3001void drbd_init_set_defaults(struct drbd_conf *mdev)
3002{
3003 /* the memset(,0,) did most of this.
3004 * note: only assignments, no allocation in here */
3005
3006 drbd_set_defaults(mdev);
3007
Philipp Reisnerb411b362009-09-25 16:07:19 -07003008 atomic_set(&mdev->ap_bio_cnt, 0);
3009 atomic_set(&mdev->ap_pending_cnt, 0);
3010 atomic_set(&mdev->rs_pending_cnt, 0);
3011 atomic_set(&mdev->unacked_cnt, 0);
3012 atomic_set(&mdev->local_cnt, 0);
3013 atomic_set(&mdev->net_cnt, 0);
3014 atomic_set(&mdev->packet_seq, 0);
3015 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003016 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02003017 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003018 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02003019 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003020
3021 mutex_init(&mdev->md_io_mutex);
3022 mutex_init(&mdev->data.mutex);
3023 mutex_init(&mdev->meta.mutex);
3024 sema_init(&mdev->data.work.s, 0);
3025 sema_init(&mdev->meta.work.s, 0);
3026 mutex_init(&mdev->state_mutex);
3027
3028 spin_lock_init(&mdev->data.work.q_lock);
3029 spin_lock_init(&mdev->meta.work.q_lock);
3030
3031 spin_lock_init(&mdev->al_lock);
3032 spin_lock_init(&mdev->req_lock);
3033 spin_lock_init(&mdev->peer_seq_lock);
3034 spin_lock_init(&mdev->epoch_lock);
3035
3036 INIT_LIST_HEAD(&mdev->active_ee);
3037 INIT_LIST_HEAD(&mdev->sync_ee);
3038 INIT_LIST_HEAD(&mdev->done_ee);
3039 INIT_LIST_HEAD(&mdev->read_ee);
3040 INIT_LIST_HEAD(&mdev->net_ee);
3041 INIT_LIST_HEAD(&mdev->resync_reads);
3042 INIT_LIST_HEAD(&mdev->data.work.q);
3043 INIT_LIST_HEAD(&mdev->meta.work.q);
3044 INIT_LIST_HEAD(&mdev->resync_work.list);
3045 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003046 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003047 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003048 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003049 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003050
Philipp Reisner794abb72010-12-27 11:51:23 +01003051 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003052 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003053 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003054 mdev->md_sync_work.cb = w_md_sync;
3055 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003056 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003057 init_timer(&mdev->resync_timer);
3058 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003059 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003060 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003061 mdev->resync_timer.function = resync_timer_fn;
3062 mdev->resync_timer.data = (unsigned long) mdev;
3063 mdev->md_sync_timer.function = md_sync_timer_fn;
3064 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003065 mdev->start_resync_timer.function = start_resync_timer_fn;
3066 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003067 mdev->request_timer.function = request_timer_fn;
3068 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003069
3070 init_waitqueue_head(&mdev->misc_wait);
3071 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003072 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003073 init_waitqueue_head(&mdev->ee_wait);
3074 init_waitqueue_head(&mdev->al_wait);
3075 init_waitqueue_head(&mdev->seq_wait);
3076
3077 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3078 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3079 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3080
3081 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003082 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003083 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003084 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3085 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003086}
3087
3088void drbd_mdev_cleanup(struct drbd_conf *mdev)
3089{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003090 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003091 if (mdev->receiver.t_state != None)
3092 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3093 mdev->receiver.t_state);
3094
3095 /* no need to lock it, I'm the only thread alive */
3096 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3097 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3098 mdev->al_writ_cnt =
3099 mdev->bm_writ_cnt =
3100 mdev->read_cnt =
3101 mdev->recv_cnt =
3102 mdev->send_cnt =
3103 mdev->writ_cnt =
3104 mdev->p_size =
3105 mdev->rs_start =
3106 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003107 mdev->rs_failed = 0;
3108 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003109 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003110 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3111 mdev->rs_mark_left[i] = 0;
3112 mdev->rs_mark_time[i] = 0;
3113 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003114 D_ASSERT(mdev->net_conf == NULL);
3115
3116 drbd_set_my_capacity(mdev, 0);
3117 if (mdev->bitmap) {
3118 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003119 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003120 drbd_bm_cleanup(mdev);
3121 }
3122
3123 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003124 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003125
3126 /*
3127 * currently we drbd_init_ee only on module load, so
3128 * we may do drbd_release_ee only on module unload!
3129 */
3130 D_ASSERT(list_empty(&mdev->active_ee));
3131 D_ASSERT(list_empty(&mdev->sync_ee));
3132 D_ASSERT(list_empty(&mdev->done_ee));
3133 D_ASSERT(list_empty(&mdev->read_ee));
3134 D_ASSERT(list_empty(&mdev->net_ee));
3135 D_ASSERT(list_empty(&mdev->resync_reads));
3136 D_ASSERT(list_empty(&mdev->data.work.q));
3137 D_ASSERT(list_empty(&mdev->meta.work.q));
3138 D_ASSERT(list_empty(&mdev->resync_work.list));
3139 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003140 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003141
3142 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003143}
3144
3145
3146static void drbd_destroy_mempools(void)
3147{
3148 struct page *page;
3149
3150 while (drbd_pp_pool) {
3151 page = drbd_pp_pool;
3152 drbd_pp_pool = (struct page *)page_private(page);
3153 __free_page(page);
3154 drbd_pp_vacant--;
3155 }
3156
3157 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3158
3159 if (drbd_ee_mempool)
3160 mempool_destroy(drbd_ee_mempool);
3161 if (drbd_request_mempool)
3162 mempool_destroy(drbd_request_mempool);
3163 if (drbd_ee_cache)
3164 kmem_cache_destroy(drbd_ee_cache);
3165 if (drbd_request_cache)
3166 kmem_cache_destroy(drbd_request_cache);
3167 if (drbd_bm_ext_cache)
3168 kmem_cache_destroy(drbd_bm_ext_cache);
3169 if (drbd_al_ext_cache)
3170 kmem_cache_destroy(drbd_al_ext_cache);
3171
3172 drbd_ee_mempool = NULL;
3173 drbd_request_mempool = NULL;
3174 drbd_ee_cache = NULL;
3175 drbd_request_cache = NULL;
3176 drbd_bm_ext_cache = NULL;
3177 drbd_al_ext_cache = NULL;
3178
3179 return;
3180}
3181
3182static int drbd_create_mempools(void)
3183{
3184 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003185 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003186 int i;
3187
3188 /* prepare our caches and mempools */
3189 drbd_request_mempool = NULL;
3190 drbd_ee_cache = NULL;
3191 drbd_request_cache = NULL;
3192 drbd_bm_ext_cache = NULL;
3193 drbd_al_ext_cache = NULL;
3194 drbd_pp_pool = NULL;
3195
3196 /* caches */
3197 drbd_request_cache = kmem_cache_create(
3198 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3199 if (drbd_request_cache == NULL)
3200 goto Enomem;
3201
3202 drbd_ee_cache = kmem_cache_create(
3203 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3204 if (drbd_ee_cache == NULL)
3205 goto Enomem;
3206
3207 drbd_bm_ext_cache = kmem_cache_create(
3208 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3209 if (drbd_bm_ext_cache == NULL)
3210 goto Enomem;
3211
3212 drbd_al_ext_cache = kmem_cache_create(
3213 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3214 if (drbd_al_ext_cache == NULL)
3215 goto Enomem;
3216
3217 /* mempools */
3218 drbd_request_mempool = mempool_create(number,
3219 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3220 if (drbd_request_mempool == NULL)
3221 goto Enomem;
3222
3223 drbd_ee_mempool = mempool_create(number,
3224 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003225 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003226 goto Enomem;
3227
3228 /* drbd's page pool */
3229 spin_lock_init(&drbd_pp_lock);
3230
3231 for (i = 0; i < number; i++) {
3232 page = alloc_page(GFP_HIGHUSER);
3233 if (!page)
3234 goto Enomem;
3235 set_page_private(page, (unsigned long)drbd_pp_pool);
3236 drbd_pp_pool = page;
3237 }
3238 drbd_pp_vacant = number;
3239
3240 return 0;
3241
3242Enomem:
3243 drbd_destroy_mempools(); /* in case we allocated some */
3244 return -ENOMEM;
3245}
3246
3247static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3248 void *unused)
3249{
3250 /* just so we have it. you never know what interesting things we
3251 * might want to do here some day...
3252 */
3253
3254 return NOTIFY_DONE;
3255}
3256
3257static struct notifier_block drbd_notifier = {
3258 .notifier_call = drbd_notify_sys,
3259};
3260
3261static void drbd_release_ee_lists(struct drbd_conf *mdev)
3262{
3263 int rr;
3264
3265 rr = drbd_release_ee(mdev, &mdev->active_ee);
3266 if (rr)
3267 dev_err(DEV, "%d EEs in active list found!\n", rr);
3268
3269 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3270 if (rr)
3271 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3272
3273 rr = drbd_release_ee(mdev, &mdev->read_ee);
3274 if (rr)
3275 dev_err(DEV, "%d EEs in read list found!\n", rr);
3276
3277 rr = drbd_release_ee(mdev, &mdev->done_ee);
3278 if (rr)
3279 dev_err(DEV, "%d EEs in done list found!\n", rr);
3280
3281 rr = drbd_release_ee(mdev, &mdev->net_ee);
3282 if (rr)
3283 dev_err(DEV, "%d EEs in net list found!\n", rr);
3284}
3285
3286/* caution. no locking.
3287 * currently only used from module cleanup code. */
3288static void drbd_delete_device(unsigned int minor)
3289{
3290 struct drbd_conf *mdev = minor_to_mdev(minor);
3291
3292 if (!mdev)
3293 return;
3294
3295 /* paranoia asserts */
3296 if (mdev->open_cnt != 0)
3297 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3298 __FILE__ , __LINE__);
3299
3300 ERR_IF (!list_empty(&mdev->data.work.q)) {
3301 struct list_head *lp;
3302 list_for_each(lp, &mdev->data.work.q) {
3303 dev_err(DEV, "lp = %p\n", lp);
3304 }
3305 };
3306 /* end paranoia asserts */
3307
3308 del_gendisk(mdev->vdisk);
3309
3310 /* cleanup stuff that may have been allocated during
3311 * device (re-)configuration or state changes */
3312
3313 if (mdev->this_bdev)
3314 bdput(mdev->this_bdev);
3315
3316 drbd_free_resources(mdev);
3317
3318 drbd_release_ee_lists(mdev);
3319
Bart Van Assche24c48302011-05-21 18:32:29 +02003320 /* should be freed on disconnect? */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003321 kfree(mdev->ee_hash);
3322 /*
3323 mdev->ee_hash_s = 0;
3324 mdev->ee_hash = NULL;
3325 */
3326
3327 lc_destroy(mdev->act_log);
3328 lc_destroy(mdev->resync);
3329
3330 kfree(mdev->p_uuid);
3331 /* mdev->p_uuid = NULL; */
3332
3333 kfree(mdev->int_dig_out);
3334 kfree(mdev->int_dig_in);
3335 kfree(mdev->int_dig_vv);
3336
3337 /* cleanup the rest that has been
3338 * allocated from drbd_new_device
3339 * and actually free the mdev itself */
3340 drbd_free_mdev(mdev);
3341}
3342
3343static void drbd_cleanup(void)
3344{
3345 unsigned int i;
3346
3347 unregister_reboot_notifier(&drbd_notifier);
3348
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003349 /* first remove proc,
3350 * drbdsetup uses it's presence to detect
3351 * whether DRBD is loaded.
3352 * If we would get stuck in proc removal,
3353 * but have netlink already deregistered,
3354 * some drbdsetup commands may wait forever
3355 * for an answer.
3356 */
3357 if (drbd_proc)
3358 remove_proc_entry("drbd", NULL);
3359
Philipp Reisnerb411b362009-09-25 16:07:19 -07003360 drbd_nl_cleanup();
3361
3362 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003363 i = minor_count;
3364 while (i--)
3365 drbd_delete_device(i);
3366 drbd_destroy_mempools();
3367 }
3368
3369 kfree(minor_table);
3370
3371 unregister_blkdev(DRBD_MAJOR, "drbd");
3372
3373 printk(KERN_INFO "drbd: module cleanup done.\n");
3374}
3375
3376/**
3377 * drbd_congested() - Callback for pdflush
3378 * @congested_data: User data
3379 * @bdi_bits: Bits pdflush is currently interested in
3380 *
3381 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3382 */
3383static int drbd_congested(void *congested_data, int bdi_bits)
3384{
3385 struct drbd_conf *mdev = congested_data;
3386 struct request_queue *q;
3387 char reason = '-';
3388 int r = 0;
3389
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003390 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003391 /* DRBD has frozen IO */
3392 r = bdi_bits;
3393 reason = 'd';
3394 goto out;
3395 }
3396
3397 if (get_ldev(mdev)) {
3398 q = bdev_get_queue(mdev->ldev->backing_bdev);
3399 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3400 put_ldev(mdev);
3401 if (r)
3402 reason = 'b';
3403 }
3404
3405 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3406 r |= (1 << BDI_async_congested);
3407 reason = reason == 'b' ? 'a' : 'n';
3408 }
3409
3410out:
3411 mdev->congestion_reason = reason;
3412 return r;
3413}
3414
3415struct drbd_conf *drbd_new_device(unsigned int minor)
3416{
3417 struct drbd_conf *mdev;
3418 struct gendisk *disk;
3419 struct request_queue *q;
3420
3421 /* GFP_KERNEL, we are outside of all write-out paths */
3422 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3423 if (!mdev)
3424 return NULL;
3425 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3426 goto out_no_cpumask;
3427
3428 mdev->minor = minor;
3429
3430 drbd_init_set_defaults(mdev);
3431
3432 q = blk_alloc_queue(GFP_KERNEL);
3433 if (!q)
3434 goto out_no_q;
3435 mdev->rq_queue = q;
3436 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003437
3438 disk = alloc_disk(1);
3439 if (!disk)
3440 goto out_no_disk;
3441 mdev->vdisk = disk;
3442
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003443 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003444
3445 disk->queue = q;
3446 disk->major = DRBD_MAJOR;
3447 disk->first_minor = minor;
3448 disk->fops = &drbd_ops;
3449 sprintf(disk->disk_name, "drbd%d", minor);
3450 disk->private_data = mdev;
3451
3452 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3453 /* we have no partitions. we contain only ourselves. */
3454 mdev->this_bdev->bd_contains = mdev->this_bdev;
3455
3456 q->backing_dev_info.congested_fn = drbd_congested;
3457 q->backing_dev_info.congested_data = mdev;
3458
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003459 blk_queue_make_request(q, drbd_make_request);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003460 /* Setting the max_hw_sectors to an odd value of 8kibyte here
3461 This triggers a max_bio_size message upon first attach or connect */
3462 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003463 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3464 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003465 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003466
3467 mdev->md_io_page = alloc_page(GFP_KERNEL);
3468 if (!mdev->md_io_page)
3469 goto out_no_io_page;
3470
3471 if (drbd_bm_init(mdev))
3472 goto out_no_bitmap;
3473 /* no need to lock access, we are still initializing this minor device. */
3474 if (!tl_init(mdev))
3475 goto out_no_tl;
Andreas Gruenbacherdac13892011-01-21 17:18:39 +01003476 mdev->read_requests = RB_ROOT;
Andreas Gruenbacherde696712011-01-20 15:00:24 +01003477 mdev->write_requests = RB_ROOT;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003478
3479 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3480 if (!mdev->app_reads_hash)
3481 goto out_no_app_reads;
3482
3483 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3484 if (!mdev->current_epoch)
3485 goto out_no_epoch;
3486
3487 INIT_LIST_HEAD(&mdev->current_epoch->list);
3488 mdev->epochs = 1;
3489
3490 return mdev;
3491
3492/* out_whatever_else:
3493 kfree(mdev->current_epoch); */
3494out_no_epoch:
3495 kfree(mdev->app_reads_hash);
3496out_no_app_reads:
3497 tl_cleanup(mdev);
3498out_no_tl:
3499 drbd_bm_cleanup(mdev);
3500out_no_bitmap:
3501 __free_page(mdev->md_io_page);
3502out_no_io_page:
3503 put_disk(disk);
3504out_no_disk:
3505 blk_cleanup_queue(q);
3506out_no_q:
3507 free_cpumask_var(mdev->cpu_mask);
3508out_no_cpumask:
3509 kfree(mdev);
3510 return NULL;
3511}
3512
3513/* counterpart of drbd_new_device.
3514 * last part of drbd_delete_device. */
3515void drbd_free_mdev(struct drbd_conf *mdev)
3516{
3517 kfree(mdev->current_epoch);
3518 kfree(mdev->app_reads_hash);
3519 tl_cleanup(mdev);
3520 if (mdev->bitmap) /* should no longer be there. */
3521 drbd_bm_cleanup(mdev);
3522 __free_page(mdev->md_io_page);
3523 put_disk(mdev->vdisk);
3524 blk_cleanup_queue(mdev->rq_queue);
3525 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003526 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003527 kfree(mdev);
3528}
3529
3530
3531int __init drbd_init(void)
3532{
3533 int err;
3534
3535 if (sizeof(struct p_handshake) != 80) {
3536 printk(KERN_ERR
3537 "drbd: never change the size or layout "
3538 "of the HandShake packet.\n");
3539 return -EINVAL;
3540 }
3541
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003542 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003543 printk(KERN_ERR
3544 "drbd: invalid minor_count (%d)\n", minor_count);
3545#ifdef MODULE
3546 return -EINVAL;
3547#else
3548 minor_count = 8;
3549#endif
3550 }
3551
3552 err = drbd_nl_init();
3553 if (err)
3554 return err;
3555
3556 err = register_blkdev(DRBD_MAJOR, "drbd");
3557 if (err) {
3558 printk(KERN_ERR
3559 "drbd: unable to register block device major %d\n",
3560 DRBD_MAJOR);
3561 return err;
3562 }
3563
3564 register_reboot_notifier(&drbd_notifier);
3565
3566 /*
3567 * allocate all necessary structs
3568 */
3569 err = -ENOMEM;
3570
3571 init_waitqueue_head(&drbd_pp_wait);
3572
3573 drbd_proc = NULL; /* play safe for drbd_cleanup */
3574 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3575 GFP_KERNEL);
3576 if (!minor_table)
3577 goto Enomem;
3578
3579 err = drbd_create_mempools();
3580 if (err)
3581 goto Enomem;
3582
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003583 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003584 if (!drbd_proc) {
3585 printk(KERN_ERR "drbd: unable to register proc file\n");
3586 goto Enomem;
3587 }
3588
3589 rwlock_init(&global_state_lock);
3590
3591 printk(KERN_INFO "drbd: initialized. "
3592 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3593 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3594 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3595 printk(KERN_INFO "drbd: registered as block device major %d\n",
3596 DRBD_MAJOR);
3597 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3598
3599 return 0; /* Success! */
3600
3601Enomem:
3602 drbd_cleanup();
3603 if (err == -ENOMEM)
3604 /* currently always the case */
3605 printk(KERN_ERR "drbd: ran out of memory\n");
3606 else
3607 printk(KERN_ERR "drbd: initialization failure\n");
3608 return err;
3609}
3610
3611void drbd_free_bc(struct drbd_backing_dev *ldev)
3612{
3613 if (ldev == NULL)
3614 return;
3615
Tejun Heoe525fd82010-11-13 11:55:17 +01003616 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3617 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003618
3619 kfree(ldev);
3620}
3621
3622void drbd_free_sock(struct drbd_conf *mdev)
3623{
3624 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003625 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003626 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3627 sock_release(mdev->data.socket);
3628 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003629 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003630 }
3631 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003632 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003633 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3634 sock_release(mdev->meta.socket);
3635 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003636 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003637 }
3638}
3639
3640
3641void drbd_free_resources(struct drbd_conf *mdev)
3642{
3643 crypto_free_hash(mdev->csums_tfm);
3644 mdev->csums_tfm = NULL;
3645 crypto_free_hash(mdev->verify_tfm);
3646 mdev->verify_tfm = NULL;
3647 crypto_free_hash(mdev->cram_hmac_tfm);
3648 mdev->cram_hmac_tfm = NULL;
3649 crypto_free_hash(mdev->integrity_w_tfm);
3650 mdev->integrity_w_tfm = NULL;
3651 crypto_free_hash(mdev->integrity_r_tfm);
3652 mdev->integrity_r_tfm = NULL;
3653
3654 drbd_free_sock(mdev);
3655
3656 __no_warn(local,
3657 drbd_free_bc(mdev->ldev);
3658 mdev->ldev = NULL;);
3659}
3660
3661/* meta data management */
3662
3663struct meta_data_on_disk {
3664 u64 la_size; /* last agreed size. */
3665 u64 uuid[UI_SIZE]; /* UUIDs. */
3666 u64 device_uuid;
3667 u64 reserved_u64_1;
3668 u32 flags; /* MDF */
3669 u32 magic;
3670 u32 md_size_sect;
3671 u32 al_offset; /* offset to this block */
3672 u32 al_nr_extents; /* important for restoring the AL */
3673 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3674 u32 bm_offset; /* offset to the bitmap, from here */
3675 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003676 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3677 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003678
3679} __packed;
3680
3681/**
3682 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3683 * @mdev: DRBD device.
3684 */
3685void drbd_md_sync(struct drbd_conf *mdev)
3686{
3687 struct meta_data_on_disk *buffer;
3688 sector_t sector;
3689 int i;
3690
Lars Ellenbergee15b032010-09-03 10:00:09 +02003691 del_timer(&mdev->md_sync_timer);
3692 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003693 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3694 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003695
3696 /* We use here D_FAILED and not D_ATTACHING because we try to write
3697 * metadata even if we detach due to a disk failure! */
3698 if (!get_ldev_if_state(mdev, D_FAILED))
3699 return;
3700
Philipp Reisnerb411b362009-09-25 16:07:19 -07003701 mutex_lock(&mdev->md_io_mutex);
3702 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3703 memset(buffer, 0, 512);
3704
3705 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3706 for (i = UI_CURRENT; i < UI_SIZE; i++)
3707 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3708 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3709 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3710
3711 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3712 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3713 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3714 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3715 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3716
3717 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003718 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003719
3720 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3721 sector = mdev->ldev->md.md_offset;
3722
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003723 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003724 /* this was a try anyways ... */
3725 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003726 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003727 }
3728
3729 /* Update mdev->ldev->md.la_size_sect,
3730 * since we updated it on metadata. */
3731 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3732
3733 mutex_unlock(&mdev->md_io_mutex);
3734 put_ldev(mdev);
3735}
3736
3737/**
3738 * drbd_md_read() - Reads in the meta data super block
3739 * @mdev: DRBD device.
3740 * @bdev: Device from which the meta data should be read in.
3741 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003742 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003743 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3744 */
3745int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3746{
3747 struct meta_data_on_disk *buffer;
3748 int i, rv = NO_ERROR;
3749
3750 if (!get_ldev_if_state(mdev, D_ATTACHING))
3751 return ERR_IO_MD_DISK;
3752
Philipp Reisnerb411b362009-09-25 16:07:19 -07003753 mutex_lock(&mdev->md_io_mutex);
3754 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3755
3756 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003757 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003758 called BEFORE disk is attached */
3759 dev_err(DEV, "Error while reading metadata.\n");
3760 rv = ERR_IO_MD_DISK;
3761 goto err;
3762 }
3763
Andreas Gruenbachere7fad8a2011-01-11 13:54:02 +01003764 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003765 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3766 rv = ERR_MD_INVALID;
3767 goto err;
3768 }
3769 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3770 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3771 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3772 rv = ERR_MD_INVALID;
3773 goto err;
3774 }
3775 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3776 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3777 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3778 rv = ERR_MD_INVALID;
3779 goto err;
3780 }
3781 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3782 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3783 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3784 rv = ERR_MD_INVALID;
3785 goto err;
3786 }
3787
3788 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3789 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3790 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3791 rv = ERR_MD_INVALID;
3792 goto err;
3793 }
3794
3795 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3796 for (i = UI_CURRENT; i < UI_SIZE; i++)
3797 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3798 bdev->md.flags = be32_to_cpu(buffer->flags);
3799 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3800 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3801
Philipp Reisner99432fc2011-05-20 16:39:13 +02003802 spin_lock_irq(&mdev->req_lock);
3803 if (mdev->state.conn < C_CONNECTED) {
3804 int peer;
3805 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3806 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3807 mdev->peer_max_bio_size = peer;
3808 }
3809 spin_unlock_irq(&mdev->req_lock);
3810
Philipp Reisnerb411b362009-09-25 16:07:19 -07003811 if (mdev->sync_conf.al_extents < 7)
3812 mdev->sync_conf.al_extents = 127;
3813
3814 err:
3815 mutex_unlock(&mdev->md_io_mutex);
3816 put_ldev(mdev);
3817
3818 return rv;
3819}
3820
3821/**
3822 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3823 * @mdev: DRBD device.
3824 *
3825 * Call this function if you change anything that should be written to
3826 * the meta-data super block. This function sets MD_DIRTY, and starts a
3827 * timer that ensures that within five seconds you have to call drbd_md_sync().
3828 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003829#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003830void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3831{
3832 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3833 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3834 mdev->last_md_mark_dirty.line = line;
3835 mdev->last_md_mark_dirty.func = func;
3836 }
3837}
3838#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003839void drbd_md_mark_dirty(struct drbd_conf *mdev)
3840{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003841 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003842 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003843}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003844#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07003845
3846static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3847{
3848 int i;
3849
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003850 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003851 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003852}
3853
3854void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3855{
3856 if (idx == UI_CURRENT) {
3857 if (mdev->state.role == R_PRIMARY)
3858 val |= 1;
3859 else
3860 val &= ~((u64)1);
3861
3862 drbd_set_ed_uuid(mdev, val);
3863 }
3864
3865 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003866 drbd_md_mark_dirty(mdev);
3867}
3868
3869
3870void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3871{
3872 if (mdev->ldev->md.uuid[idx]) {
3873 drbd_uuid_move_history(mdev);
3874 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003875 }
3876 _drbd_uuid_set(mdev, idx, val);
3877}
3878
3879/**
3880 * drbd_uuid_new_current() - Creates a new current UUID
3881 * @mdev: DRBD device.
3882 *
3883 * Creates a new current UUID, and rotates the old current UUID into
3884 * the bitmap slot. Causes an incremental resync upon next connect.
3885 */
3886void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3887{
3888 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003889 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003890
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003891 if (bm_uuid)
3892 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3893
Philipp Reisnerb411b362009-09-25 16:07:19 -07003894 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003895
3896 get_random_bytes(&val, sizeof(u64));
3897 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003898 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003899 /* get it to stable storage _now_ */
3900 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003901}
3902
3903void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3904{
3905 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3906 return;
3907
3908 if (val == 0) {
3909 drbd_uuid_move_history(mdev);
3910 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3911 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003912 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003913 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3914 if (bm_uuid)
3915 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003916
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003917 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003918 }
3919 drbd_md_mark_dirty(mdev);
3920}
3921
3922/**
3923 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3924 * @mdev: DRBD device.
3925 *
3926 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3927 */
3928int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3929{
3930 int rv = -EIO;
3931
3932 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3933 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3934 drbd_md_sync(mdev);
3935 drbd_bm_set_all(mdev);
3936
3937 rv = drbd_bm_write(mdev);
3938
3939 if (!rv) {
3940 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3941 drbd_md_sync(mdev);
3942 }
3943
3944 put_ldev(mdev);
3945 }
3946
3947 return rv;
3948}
3949
3950/**
3951 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3952 * @mdev: DRBD device.
3953 *
3954 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3955 */
3956int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3957{
3958 int rv = -EIO;
3959
Philipp Reisner07782862010-08-31 12:00:50 +02003960 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003961 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3962 drbd_bm_clear_all(mdev);
3963 rv = drbd_bm_write(mdev);
3964 put_ldev(mdev);
3965 }
3966
3967 return rv;
3968}
3969
3970static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3971{
3972 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003973 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003974
3975 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3976
Lars Ellenberg02851e92010-12-16 14:47:39 +01003977 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003978 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003979 rv = work->io_fn(mdev);
3980 drbd_bm_unlock(mdev);
3981 put_ldev(mdev);
3982 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003983
3984 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003985 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07003986 wake_up(&mdev->misc_wait);
3987
3988 if (work->done)
3989 work->done(mdev, rv);
3990
3991 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3992 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003993 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003994
3995 return 1;
3996}
3997
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003998void drbd_ldev_destroy(struct drbd_conf *mdev)
3999{
4000 lc_destroy(mdev->resync);
4001 mdev->resync = NULL;
4002 lc_destroy(mdev->act_log);
4003 mdev->act_log = NULL;
4004 __no_warn(local,
4005 drbd_free_bc(mdev->ldev);
4006 mdev->ldev = NULL;);
4007
4008 if (mdev->md_io_tmpp) {
4009 __free_page(mdev->md_io_tmpp);
4010 mdev->md_io_tmpp = NULL;
4011 }
4012 clear_bit(GO_DISKLESS, &mdev->flags);
4013}
4014
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004015static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4016{
4017 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02004018 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4019 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004020 * the protected members anymore, though, so once put_ldev reaches zero
4021 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004022 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004023 return 1;
4024}
4025
4026void drbd_go_diskless(struct drbd_conf *mdev)
4027{
4028 D_ASSERT(mdev->state.disk == D_FAILED);
4029 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02004030 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004031}
4032
Philipp Reisnerb411b362009-09-25 16:07:19 -07004033/**
4034 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4035 * @mdev: DRBD device.
4036 * @io_fn: IO callback to be called when bitmap IO is possible
4037 * @done: callback to be called after the bitmap IO was performed
4038 * @why: Descriptive text of the reason for doing the IO
4039 *
4040 * While IO on the bitmap happens we freeze application IO thus we ensure
4041 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
4042 * called from worker context. It MUST NOT be used while a previous such
4043 * work is still pending!
4044 */
4045void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4046 int (*io_fn)(struct drbd_conf *),
4047 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004048 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004049{
4050 D_ASSERT(current == mdev->worker.task);
4051
4052 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4053 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4054 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4055 if (mdev->bm_io_work.why)
4056 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4057 why, mdev->bm_io_work.why);
4058
4059 mdev->bm_io_work.io_fn = io_fn;
4060 mdev->bm_io_work.done = done;
4061 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004062 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004063
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004064 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004065 set_bit(BITMAP_IO, &mdev->flags);
4066 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004067 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004068 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004069 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004070 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004071}
4072
4073/**
4074 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4075 * @mdev: DRBD device.
4076 * @io_fn: IO callback to be called when bitmap IO is possible
4077 * @why: Descriptive text of the reason for doing the IO
4078 *
4079 * freezes application IO while that the actual IO operations runs. This
4080 * functions MAY NOT be called from worker context.
4081 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004082int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4083 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004084{
4085 int rv;
4086
4087 D_ASSERT(current != mdev->worker.task);
4088
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004089 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4090 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004091
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004092 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004093 rv = io_fn(mdev);
4094 drbd_bm_unlock(mdev);
4095
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004096 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4097 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004098
4099 return rv;
4100}
4101
4102void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4103{
4104 if ((mdev->ldev->md.flags & flag) != flag) {
4105 drbd_md_mark_dirty(mdev);
4106 mdev->ldev->md.flags |= flag;
4107 }
4108}
4109
4110void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4111{
4112 if ((mdev->ldev->md.flags & flag) != 0) {
4113 drbd_md_mark_dirty(mdev);
4114 mdev->ldev->md.flags &= ~flag;
4115 }
4116}
4117int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4118{
4119 return (bdev->md.flags & flag) != 0;
4120}
4121
4122static void md_sync_timer_fn(unsigned long data)
4123{
4124 struct drbd_conf *mdev = (struct drbd_conf *) data;
4125
4126 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4127}
4128
4129static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4130{
4131 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004132#ifdef DEBUG
4133 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4134 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4135#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004136 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004137 return 1;
4138}
4139
4140#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;	/* current generator state */
	unsigned long count;	/* calls left until state is refreshed */
};

#define FAULT_RANDOM_MULT	39916801	/* prime */
#define FAULT_RANDOM_ADD	479001701	/* prime */
#define FAULT_RANDOM_REFRESH	10000
4151
4152/*
4153 * Crude but fast random-number generator. Uses a linear congruential
4154 * generator, with occasional help from get_random_bytes().
4155 */
4156static unsigned long
4157_drbd_fault_random(struct fault_random_state *rsp)
4158{
4159 long refresh;
4160
Roel Kluin49829ea2009-12-15 22:55:44 +01004161 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004162 get_random_bytes(&refresh, sizeof(refresh));
4163 rsp->state += refresh;
4164 rsp->count = FAULT_RANDOM_REFRESH;
4165 }
4166 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4167 return swahw32(rsp->state);
4168}
4169
4170static char *
4171_drbd_fault_str(unsigned int type) {
4172 static char *_faults[] = {
4173 [DRBD_FAULT_MD_WR] = "Meta-data write",
4174 [DRBD_FAULT_MD_RD] = "Meta-data read",
4175 [DRBD_FAULT_RS_WR] = "Resync write",
4176 [DRBD_FAULT_RS_RD] = "Resync read",
4177 [DRBD_FAULT_DT_WR] = "Data write",
4178 [DRBD_FAULT_DT_RD] = "Data read",
4179 [DRBD_FAULT_DT_RA] = "Data read ahead",
4180 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004181 [DRBD_FAULT_AL_EE] = "EE allocation",
4182 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004183 };
4184
4185 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4186}
4187
4188unsigned int
4189_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4190{
4191 static struct fault_random_state rrs = {0, 0};
4192
4193 unsigned int ret = (
4194 (fault_devs == 0 ||
4195 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4196 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4197
4198 if (ret) {
4199 fault_count++;
4200
Lars Ellenberg73835062010-05-27 11:51:56 +02004201 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004202 dev_warn(DEV, "***Simulating %s failure\n",
4203 _drbd_fault_str(type));
4204 }
4205
4206 return ret;
4207}
4208#endif
4209
/**
 * drbd_buildtag() - Return a string identifying this build
 *
 * The string is composed on the first call and kept in a static buffer.
 * With CONFIG_MODULES and a non-NULL THIS_MODULE it is
 * "srcversion: <THIS_MODULE->srcversion>", otherwise it is "built-in".
 *
 * Returns a pointer to the static buffer; callers must not free it.
 */
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	/* A leading NUL marks the tag as "not composed yet". */
	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* "%-24s" is only a minimum width: bound the write
			 * so a longer srcversion can not overrun the buffer. */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
4228
4229module_init(drbd_init)
4230module_exit(drbd_cleanup)
4231
Philipp Reisnerb411b362009-09-25 16:07:19 -07004232EXPORT_SYMBOL(drbd_conn_str);
4233EXPORT_SYMBOL(drbd_role_str);
4234EXPORT_SYMBOL(drbd_disk_str);
4235EXPORT_SYMBOL(drbd_set_st_err_str);