blob: 18f27afab81e7c18cf9dd716d51f74ba3945de18 [file] [log] [blame]
Philipp Reisnerb411b362009-09-25 16:07:19 -07001/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
Philipp Reisnerb411b362009-09-25 16:07:19 -070029#include <linux/module.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070030#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
Arnd Bergmann2a48fc02010-06-02 14:28:52 +020035#include <linux/mutex.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070036#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
Philipp Reisnerb411b362009-09-25 16:07:19 -070055#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
Arnd Bergmann2a48fc02010-06-02 14:28:52 +020067static DEFINE_MUTEX(drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -070068int drbdd_init(struct drbd_thread *);
69int drbd_worker(struct drbd_thread *);
70int drbd_asender(struct drbd_thread *);
71
72int drbd_init(void);
73static int drbd_open(struct block_device *bdev, fmode_t mode);
74static int drbd_release(struct gendisk *gd, fmode_t mode);
75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79static void md_sync_timer_fn(unsigned long data);
80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +020081static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
Philipp Reisnerb411b362009-09-25 16:07:19 -070082
Philipp Reisnerb411b362009-09-25 16:07:19 -070083MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84 "Lars Ellenberg <lars@linbit.com>");
85MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86MODULE_VERSION(REL_VERSION);
87MODULE_LICENSE("GPL");
Philipp Reisner2b8a90b2011-01-10 11:15:17 +010088MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
89 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
Philipp Reisnerb411b362009-09-25 16:07:19 -070090MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
91
92#include <linux/moduleparam.h>
93/* allow_open_on_secondary */
94MODULE_PARM_DESC(allow_oos, "DONT USE!");
95/* thanks to these macros, if compiled into the kernel (not-module),
96 * this becomes the boot parameter drbd.minor_count */
97module_param(minor_count, uint, 0444);
98module_param(disable_sendpage, bool, 0644);
99module_param(allow_oos, bool, 0);
100module_param(cn_idx, uint, 0444);
101module_param(proc_details, int, 0644);
102
#ifdef CONFIG_DRBD_FAULT_INJECTION
/* bitmap of enabled faults */
int enable_faults;
module_param(enable_faults, int, 0664);

/* fault rate % value - applies to all enabled faults */
int fault_rate;
module_param(fault_rate, int, 0664);

/* count of faults inserted */
static int fault_count;
module_param(fault_count, int, 0664);

/* bitmap of devices to insert faults on */
int fault_devs;
module_param(fault_devs, int, 0644);
#endif
117
118/* module parameter, defined */
Philipp Reisner2b8a90b2011-01-10 11:15:17 +0100119unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700120int disable_sendpage;
121int allow_oos;
122unsigned int cn_idx = CN_IDX_DRBD;
123int proc_details; /* Detail level in proc drbd*/
124
125/* Module parameter for setting the user mode helper program
126 * to run. Default is /sbin/drbdadm */
127char usermode_helper[80] = "/sbin/drbdadm";
128
129module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
130
131/* in 2.6.x, our device mapping and config info contains our virtual gendisks
132 * as member "struct gendisk *vdisk;"
133 */
134struct drbd_conf **minor_table;
135
136struct kmem_cache *drbd_request_cache;
137struct kmem_cache *drbd_ee_cache; /* epoch entries */
138struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
139struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
140mempool_t *drbd_request_mempool;
141mempool_t *drbd_ee_mempool;
142
143/* I do not use a standard mempool, because:
144 1) I want to hand out the pre-allocated objects first.
145 2) I want to be able to interrupt sleeping allocation with a signal.
146 Note: This is a single linked list, the next pointer is the private
147 member of struct page.
148 */
149struct page *drbd_pp_pool;
150spinlock_t drbd_pp_lock;
151int drbd_pp_vacant;
152wait_queue_head_t drbd_pp_wait;
153
154DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
155
Emese Revfy7d4e9d02009-12-14 00:59:30 +0100156static const struct block_device_operations drbd_ops = {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700157 .owner = THIS_MODULE,
158 .open = drbd_open,
159 .release = drbd_release,
160};
161
/*
 * Number of elements of the array A.  Only meaningful for real arrays, not
 * for pointers.  The argument is parenthesized before it is subscripted so
 * that an expression argument is indexed as a whole.
 */
#define ARRY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
163
164#ifdef __CHECKER__
165/* When checking with sparse, and this is an inline function, sparse will
166 give tons of false positives. When this is a real functions sparse works.
167 */
168int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
169{
170 int io_allowed;
171
172 atomic_inc(&mdev->local_cnt);
173 io_allowed = (mdev->state.disk >= mins);
174 if (!io_allowed) {
175 if (atomic_dec_and_test(&mdev->local_cnt))
176 wake_up(&mdev->misc_wait);
177 }
178 return io_allowed;
179}
180
181#endif
182
183/**
184 * DOC: The transfer log
185 *
186 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
187 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
188 * of the list. There is always at least one &struct drbd_tl_epoch object.
189 *
190 * Each &struct drbd_tl_epoch has a circular double linked list of requests
191 * attached.
192 */
193static int tl_init(struct drbd_conf *mdev)
194{
195 struct drbd_tl_epoch *b;
196
197 /* during device minor initialization, we may well use GFP_KERNEL */
198 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
199 if (!b)
200 return 0;
201 INIT_LIST_HEAD(&b->requests);
202 INIT_LIST_HEAD(&b->w.list);
203 b->next = NULL;
204 b->br_number = 4711;
Philipp Reisner7e602c02010-05-27 14:49:27 +0200205 b->n_writes = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700206 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
207
208 mdev->oldest_tle = b;
209 mdev->newest_tle = b;
210 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
211
212 mdev->tl_hash = NULL;
213 mdev->tl_hash_s = 0;
214
215 return 1;
216}
217
218static void tl_cleanup(struct drbd_conf *mdev)
219{
220 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
221 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
222 kfree(mdev->oldest_tle);
223 mdev->oldest_tle = NULL;
224 kfree(mdev->unused_spare_tle);
225 mdev->unused_spare_tle = NULL;
226 kfree(mdev->tl_hash);
227 mdev->tl_hash = NULL;
228 mdev->tl_hash_s = 0;
229}
230
Andreas Gruenbacherd6287692011-01-13 23:05:39 +0100231static void drbd_free_tl_hash(struct drbd_conf *mdev)
232{
233 struct hlist_head *h;
234
235 spin_lock_irq(&mdev->req_lock);
236
237 if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
238 spin_unlock_irq(&mdev->req_lock);
239 return;
240 }
241 /* paranoia code */
242 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
243 if (h->first)
244 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
245 (int)(h - mdev->ee_hash), h->first);
246 kfree(mdev->ee_hash);
247 mdev->ee_hash = NULL;
248 mdev->ee_hash_s = 0;
249
250 /* paranoia code */
251 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
252 if (h->first)
253 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
254 (int)(h - mdev->tl_hash), h->first);
255 kfree(mdev->tl_hash);
256 mdev->tl_hash = NULL;
257 mdev->tl_hash_s = 0;
258 spin_unlock_irq(&mdev->req_lock);
259}
260
Philipp Reisnerb411b362009-09-25 16:07:19 -0700261/**
262 * _tl_add_barrier() - Adds a barrier to the transfer log
263 * @mdev: DRBD device.
264 * @new: Barrier to be added before the current head of the TL.
265 *
266 * The caller must hold the req_lock.
267 */
268void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
269{
270 struct drbd_tl_epoch *newest_before;
271
272 INIT_LIST_HEAD(&new->requests);
273 INIT_LIST_HEAD(&new->w.list);
274 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
275 new->next = NULL;
Philipp Reisner7e602c02010-05-27 14:49:27 +0200276 new->n_writes = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700277
278 newest_before = mdev->newest_tle;
279 /* never send a barrier number == 0, because that is special-cased
280 * when using TCQ for our write ordering code */
281 new->br_number = (newest_before->br_number+1) ?: 1;
282 if (mdev->newest_tle != new) {
283 mdev->newest_tle->next = new;
284 mdev->newest_tle = new;
285 }
286}
287
288/**
289 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
290 * @mdev: DRBD device.
291 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
292 * @set_size: Expected number of requests before that barrier.
293 *
294 * In case the passed barrier_nr or set_size does not match the oldest
295 * &struct drbd_tl_epoch objects this function will cause a termination
296 * of the connection.
297 */
298void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
299 unsigned int set_size)
300{
301 struct drbd_tl_epoch *b, *nob; /* next old barrier */
302 struct list_head *le, *tle;
303 struct drbd_request *r;
304
305 spin_lock_irq(&mdev->req_lock);
306
307 b = mdev->oldest_tle;
308
309 /* first some paranoia code */
310 if (b == NULL) {
311 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
312 barrier_nr);
313 goto bail;
314 }
315 if (b->br_number != barrier_nr) {
316 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
317 barrier_nr, b->br_number);
318 goto bail;
319 }
Philipp Reisner7e602c02010-05-27 14:49:27 +0200320 if (b->n_writes != set_size) {
321 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
322 barrier_nr, set_size, b->n_writes);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700323 goto bail;
324 }
325
326 /* Clean up list of requests processed during current epoch */
327 list_for_each_safe(le, tle, &b->requests) {
328 r = list_entry(le, struct drbd_request, tl_requests);
329 _req_mod(r, barrier_acked);
330 }
331 /* There could be requests on the list waiting for completion
332 of the write to the local disk. To avoid corruptions of
333 slab's data structures we have to remove the lists head.
334
335 Also there could have been a barrier ack out of sequence, overtaking
336 the write acks - which would be a bug and violating write ordering.
337 To not deadlock in case we lose connection while such requests are
338 still pending, we need some way to find them for the
339 _req_mode(connection_lost_while_pending).
340
341 These have been list_move'd to the out_of_sequence_requests list in
342 _req_mod(, barrier_acked) above.
343 */
344 list_del_init(&b->requests);
345
346 nob = b->next;
347 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
348 _tl_add_barrier(mdev, b);
349 if (nob)
350 mdev->oldest_tle = nob;
351 /* if nob == NULL b was the only barrier, and becomes the new
352 barrier. Therefore mdev->oldest_tle points already to b */
353 } else {
354 D_ASSERT(nob != NULL);
355 mdev->oldest_tle = nob;
356 kfree(b);
357 }
358
359 spin_unlock_irq(&mdev->req_lock);
360 dec_ap_pending(mdev);
361
362 return;
363
364bail:
365 spin_unlock_irq(&mdev->req_lock);
366 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
367}
368
Philipp Reisner617049a2010-12-22 12:48:31 +0100369
Philipp Reisner11b58e72010-05-12 17:08:26 +0200370/**
371 * _tl_restart() - Walks the transfer log, and applies an action to all requests
372 * @mdev: DRBD device.
373 * @what: The action/event to perform with all request objects
374 *
375 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
376 * restart_frozen_disk_io.
377 */
378static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
379{
380 struct drbd_tl_epoch *b, *tmp, **pn;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200381 struct list_head *le, *tle, carry_reads;
Philipp Reisner11b58e72010-05-12 17:08:26 +0200382 struct drbd_request *req;
383 int rv, n_writes, n_reads;
384
385 b = mdev->oldest_tle;
386 pn = &mdev->oldest_tle;
387 while (b) {
388 n_writes = 0;
389 n_reads = 0;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200390 INIT_LIST_HEAD(&carry_reads);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200391 list_for_each_safe(le, tle, &b->requests) {
392 req = list_entry(le, struct drbd_request, tl_requests);
393 rv = _req_mod(req, what);
394
395 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
396 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
397 }
398 tmp = b->next;
399
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200400 if (n_writes) {
Philipp Reisner11b58e72010-05-12 17:08:26 +0200401 if (what == resend) {
402 b->n_writes = n_writes;
403 if (b->w.cb == NULL) {
404 b->w.cb = w_send_barrier;
405 inc_ap_pending(mdev);
406 set_bit(CREATE_BARRIER, &mdev->flags);
407 }
408
409 drbd_queue_work(&mdev->data.work, &b->w);
410 }
411 pn = &b->next;
412 } else {
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200413 if (n_reads)
414 list_add(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200415 /* there could still be requests on that ring list,
416 * in case local io is still pending */
417 list_del(&b->requests);
418
419 /* dec_ap_pending corresponding to queue_barrier.
420 * the newest barrier may not have been queued yet,
421 * in which case w.cb is still NULL. */
422 if (b->w.cb != NULL)
423 dec_ap_pending(mdev);
424
425 if (b == mdev->newest_tle) {
426 /* recycle, but reinit! */
427 D_ASSERT(tmp == NULL);
428 INIT_LIST_HEAD(&b->requests);
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200429 list_splice(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200430 INIT_LIST_HEAD(&b->w.list);
431 b->w.cb = NULL;
432 b->br_number = net_random();
433 b->n_writes = 0;
434
435 *pn = b;
436 break;
437 }
438 *pn = tmp;
439 kfree(b);
440 }
441 b = tmp;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200442 list_splice(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200443 }
444}
445
Philipp Reisnerb411b362009-09-25 16:07:19 -0700446
447/**
448 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
449 * @mdev: DRBD device.
450 *
451 * This is called after the connection to the peer was lost. The storage covered
452 * by the requests on the transfer gets marked as our of sync. Called from the
453 * receiver thread and the worker thread.
454 */
455void tl_clear(struct drbd_conf *mdev)
456{
Philipp Reisnerb411b362009-09-25 16:07:19 -0700457 struct list_head *le, *tle;
458 struct drbd_request *r;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700459
460 spin_lock_irq(&mdev->req_lock);
461
Philipp Reisner11b58e72010-05-12 17:08:26 +0200462 _tl_restart(mdev, connection_lost_while_pending);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700463
464 /* we expect this list to be empty. */
465 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
466
467 /* but just in case, clean it up anyways! */
468 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
469 r = list_entry(le, struct drbd_request, tl_requests);
470 /* It would be nice to complete outside of spinlock.
471 * But this is easier for now. */
472 _req_mod(r, connection_lost_while_pending);
473 }
474
475 /* ensure bit indicating barrier is required is clear */
476 clear_bit(CREATE_BARRIER, &mdev->flags);
477
Philipp Reisner288f4222010-05-27 15:07:43 +0200478 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
479
Philipp Reisnerb411b362009-09-25 16:07:19 -0700480 spin_unlock_irq(&mdev->req_lock);
481}
482
Philipp Reisner11b58e72010-05-12 17:08:26 +0200483void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
484{
485 spin_lock_irq(&mdev->req_lock);
486 _tl_restart(mdev, what);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700487 spin_unlock_irq(&mdev->req_lock);
488}
489
490/**
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100491 * cl_wide_st_chg() - true if the state change is a cluster wide one
Philipp Reisnerb411b362009-09-25 16:07:19 -0700492 * @mdev: DRBD device.
493 * @os: old (current) state.
494 * @ns: new (wanted) state.
495 */
496static int cl_wide_st_chg(struct drbd_conf *mdev,
497 union drbd_state os, union drbd_state ns)
498{
499 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
500 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
501 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
502 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
503 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
504 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
505 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
506}
507
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100508enum drbd_state_rv
509drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
510 union drbd_state mask, union drbd_state val)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700511{
512 unsigned long flags;
513 union drbd_state os, ns;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100514 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700515
516 spin_lock_irqsave(&mdev->req_lock, flags);
517 os = mdev->state;
518 ns.i = (os.i & ~mask.i) | val.i;
519 rv = _drbd_set_state(mdev, ns, f, NULL);
520 ns = mdev->state;
521 spin_unlock_irqrestore(&mdev->req_lock, flags);
522
523 return rv;
524}
525
526/**
527 * drbd_force_state() - Impose a change which happens outside our control on our state
528 * @mdev: DRBD device.
529 * @mask: mask of state bits to change.
530 * @val: value of new state bits.
531 */
532void drbd_force_state(struct drbd_conf *mdev,
533 union drbd_state mask, union drbd_state val)
534{
535 drbd_change_state(mdev, CS_HARD, mask, val);
536}
537
/* state validation helpers, defined further down in this file */
static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev,
				       union drbd_state os,
				       union drbd_state ns,
				       const char **warn_sync_abort);

int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);
546
Andreas Gruenbacherc8b32562010-12-08 01:06:16 +0100547static enum drbd_state_rv
548_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
549 union drbd_state val)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700550{
551 union drbd_state os, ns;
552 unsigned long flags;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100553 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700554
555 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
556 return SS_CW_SUCCESS;
557
558 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
559 return SS_CW_FAILED_BY_PEER;
560
561 rv = 0;
562 spin_lock_irqsave(&mdev->req_lock, flags);
563 os = mdev->state;
564 ns.i = (os.i & ~mask.i) | val.i;
565 ns = sanitize_state(mdev, os, ns, NULL);
566
567 if (!cl_wide_st_chg(mdev, os, ns))
568 rv = SS_CW_NO_NEED;
569 if (!rv) {
570 rv = is_valid_state(mdev, ns);
571 if (rv == SS_SUCCESS) {
572 rv = is_valid_state_transition(mdev, ns, os);
573 if (rv == SS_SUCCESS)
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100574 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700575 }
576 }
577 spin_unlock_irqrestore(&mdev->req_lock, flags);
578
579 return rv;
580}
581
582/**
583 * drbd_req_state() - Perform an eventually cluster wide state change
584 * @mdev: DRBD device.
585 * @mask: mask of state bits to change.
586 * @val: value of new state bits.
587 * @f: flags
588 *
589 * Should not be called directly, use drbd_request_state() or
590 * _drbd_request_state().
591 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100592static enum drbd_state_rv
593drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
594 union drbd_state val, enum chg_state_flags f)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700595{
596 struct completion done;
597 unsigned long flags;
598 union drbd_state os, ns;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100599 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700600
601 init_completion(&done);
602
603 if (f & CS_SERIALIZE)
604 mutex_lock(&mdev->state_mutex);
605
606 spin_lock_irqsave(&mdev->req_lock, flags);
607 os = mdev->state;
608 ns.i = (os.i & ~mask.i) | val.i;
609 ns = sanitize_state(mdev, os, ns, NULL);
610
611 if (cl_wide_st_chg(mdev, os, ns)) {
612 rv = is_valid_state(mdev, ns);
613 if (rv == SS_SUCCESS)
614 rv = is_valid_state_transition(mdev, ns, os);
615 spin_unlock_irqrestore(&mdev->req_lock, flags);
616
617 if (rv < SS_SUCCESS) {
618 if (f & CS_VERBOSE)
619 print_st_err(mdev, os, ns, rv);
620 goto abort;
621 }
622
623 drbd_state_lock(mdev);
624 if (!drbd_send_state_req(mdev, mask, val)) {
625 drbd_state_unlock(mdev);
626 rv = SS_CW_FAILED_BY_PEER;
627 if (f & CS_VERBOSE)
628 print_st_err(mdev, os, ns, rv);
629 goto abort;
630 }
631
632 wait_event(mdev->state_wait,
633 (rv = _req_st_cond(mdev, mask, val)));
634
635 if (rv < SS_SUCCESS) {
636 drbd_state_unlock(mdev);
637 if (f & CS_VERBOSE)
638 print_st_err(mdev, os, ns, rv);
639 goto abort;
640 }
641 spin_lock_irqsave(&mdev->req_lock, flags);
642 os = mdev->state;
643 ns.i = (os.i & ~mask.i) | val.i;
644 rv = _drbd_set_state(mdev, ns, f, &done);
645 drbd_state_unlock(mdev);
646 } else {
647 rv = _drbd_set_state(mdev, ns, f, &done);
648 }
649
650 spin_unlock_irqrestore(&mdev->req_lock, flags);
651
652 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
653 D_ASSERT(current != mdev->worker.task);
654 wait_for_completion(&done);
655 }
656
657abort:
658 if (f & CS_SERIALIZE)
659 mutex_unlock(&mdev->state_mutex);
660
661 return rv;
662}
663
664/**
665 * _drbd_request_state() - Request a state change (with flags)
666 * @mdev: DRBD device.
667 * @mask: mask of state bits to change.
668 * @val: value of new state bits.
669 * @f: flags
670 *
671 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
672 * flag, or when logging of failed state change requests is not desired.
673 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100674enum drbd_state_rv
675_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
676 union drbd_state val, enum chg_state_flags f)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700677{
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100678 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700679
680 wait_event(mdev->state_wait,
681 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
682
683 return rv;
684}
685
686static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
687{
688 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
689 name,
690 drbd_conn_str(ns.conn),
691 drbd_role_str(ns.role),
692 drbd_role_str(ns.peer),
693 drbd_disk_str(ns.disk),
694 drbd_disk_str(ns.pdsk),
Philipp Reisnerfb22c402010-09-08 23:20:21 +0200695 is_susp(ns) ? 's' : 'r',
Philipp Reisnerb411b362009-09-25 16:07:19 -0700696 ns.aftr_isp ? 'a' : '-',
697 ns.peer_isp ? 'p' : '-',
698 ns.user_isp ? 'u' : '-'
699 );
700}
701
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100702void print_st_err(struct drbd_conf *mdev, union drbd_state os,
703 union drbd_state ns, enum drbd_state_rv err)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700704{
705 if (err == SS_IN_TRANSIENT_STATE)
706 return;
707 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
708 print_st(mdev, " state", os);
709 print_st(mdev, "wanted", ns);
710}
711
712
Philipp Reisnerb411b362009-09-25 16:07:19 -0700713/**
714 * is_valid_state() - Returns an SS_ error code if ns is not valid
715 * @mdev: DRBD device.
716 * @ns: State to consider.
717 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100718static enum drbd_state_rv
719is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700720{
721 /* See drbd_state_sw_errors in drbd_strings.c */
722
723 enum drbd_fencing_p fp;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100724 enum drbd_state_rv rv = SS_SUCCESS;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700725
726 fp = FP_DONT_CARE;
727 if (get_ldev(mdev)) {
728 fp = mdev->ldev->dc.fencing;
729 put_ldev(mdev);
730 }
731
732 if (get_net_conf(mdev)) {
733 if (!mdev->net_conf->two_primaries &&
734 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
735 rv = SS_TWO_PRIMARIES;
736 put_net_conf(mdev);
737 }
738
739 if (rv <= 0)
740 /* already found a reason to abort */;
741 else if (ns.role == R_SECONDARY && mdev->open_cnt)
742 rv = SS_DEVICE_IN_USE;
743
744 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
745 rv = SS_NO_UP_TO_DATE_DISK;
746
747 else if (fp >= FP_RESOURCE &&
748 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
749 rv = SS_PRIMARY_NOP;
750
751 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
752 rv = SS_NO_UP_TO_DATE_DISK;
753
754 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
755 rv = SS_NO_LOCAL_DISK;
756
757 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
758 rv = SS_NO_REMOTE_DISK;
759
Lars Ellenberg8d4ce822010-04-01 16:59:32 +0200760 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
761 rv = SS_NO_UP_TO_DATE_DISK;
762
Philipp Reisnerb411b362009-09-25 16:07:19 -0700763 else if ((ns.conn == C_CONNECTED ||
764 ns.conn == C_WF_BITMAP_S ||
765 ns.conn == C_SYNC_SOURCE ||
766 ns.conn == C_PAUSED_SYNC_S) &&
767 ns.disk == D_OUTDATED)
768 rv = SS_CONNECTED_OUTDATES;
769
770 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
771 (mdev->sync_conf.verify_alg[0] == 0))
772 rv = SS_NO_VERIFY_ALG;
773
774 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
775 mdev->agreed_pro_version < 88)
776 rv = SS_NOT_SUPPORTED;
777
Philipp Reisnerfa7d9392011-05-17 14:48:55 +0200778 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
779 rv = SS_CONNECTED_OUTDATES;
780
Philipp Reisnerb411b362009-09-25 16:07:19 -0700781 return rv;
782}
783
784/**
785 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
786 * @mdev: DRBD device.
787 * @ns: new state.
788 * @os: old state.
789 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100790static enum drbd_state_rv
791is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
792 union drbd_state os)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700793{
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +0100794 enum drbd_state_rv rv = SS_SUCCESS;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700795
796 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
797 os.conn > C_CONNECTED)
798 rv = SS_RESYNC_RUNNING;
799
800 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
801 rv = SS_ALREADY_STANDALONE;
802
803 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
804 rv = SS_IS_DISKLESS;
805
806 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
807 rv = SS_NO_NET_CONFIG;
808
809 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
810 rv = SS_LOWER_THAN_OUTDATED;
811
812 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
813 rv = SS_IN_TRANSIENT_STATE;
814
815 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
816 rv = SS_IN_TRANSIENT_STATE;
817
818 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
819 rv = SS_NEED_CONNECTION;
820
821 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
822 ns.conn != os.conn && os.conn > C_CONNECTED)
823 rv = SS_RESYNC_RUNNING;
824
825 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
826 os.conn < C_CONNECTED)
827 rv = SS_NEED_CONNECTION;
828
Philipp Reisner1fc80cf2010-11-22 14:18:47 +0100829 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
830 && os.conn < C_WF_REPORT_PARAMS)
831 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
832
Philipp Reisnerb411b362009-09-25 16:07:19 -0700833 return rv;
834}
835
836/**
837 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
838 * @mdev: DRBD device.
839 * @os: old state.
840 * @ns: new state.
841 * @warn_sync_abort: if not NULL, set to "Online-verify" or "Resync" when a running verify/resync gets aborted.
842 *
843 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
844 * to D_UNKNOWN. This rule and many more along those lines are in this function.
845 */
846static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
Lars Ellenberg02bc7172010-09-06 12:13:20 +0200847 union drbd_state ns, const char **warn_sync_abort)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700848{
849 enum drbd_fencing_p fp;
Philipp Reisnerab17b68f2010-11-17 16:54:36 +0100850 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700851
852 fp = FP_DONT_CARE;
853 if (get_ldev(mdev)) {
854 fp = mdev->ldev->dc.fencing;
855 put_ldev(mdev);
856 }
857
858 /* Disallow Network errors to configure a device's network part */
859 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
860 os.conn <= C_DISCONNECTING)
861 ns.conn = os.conn;
862
Lars Ellenbergf2906e12010-07-21 17:04:32 +0200863 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
864 * If you try to go into some Sync* state, that shall fail (elsewhere). */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700865 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
Lars Ellenbergf2906e12010-07-21 17:04:32 +0200866 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700867 ns.conn = os.conn;
868
Lars Ellenberg82f59cc2010-10-16 12:13:47 +0200869 /* we cannot fail (again) if we already detached */
870 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
871 ns.disk = D_DISKLESS;
872
873 /* if we are only D_ATTACHING yet,
874 * we can (and should) go directly to D_DISKLESS. */
875 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
876 ns.disk = D_DISKLESS;
877
Philipp Reisnerb411b362009-09-25 16:07:19 -0700878 /* After C_DISCONNECTING only C_STANDALONE may follow */
879 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
880 ns.conn = os.conn;
881
882 if (ns.conn < C_CONNECTED) {
883 ns.peer_isp = 0;
884 ns.peer = R_UNKNOWN;
885 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
886 ns.pdsk = D_UNKNOWN;
887 }
888
889 /* Clear the aftr_isp when becoming unconfigured */
890 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
891 ns.aftr_isp = 0;
892
Philipp Reisnerb411b362009-09-25 16:07:19 -0700893 /* Abort resync if a disk fails/detaches */
894 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
895 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
896 if (warn_sync_abort)
Lars Ellenberg02bc7172010-09-06 12:13:20 +0200897 *warn_sync_abort =
898 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
899 "Online-verify" : "Resync";
Philipp Reisnerb411b362009-09-25 16:07:19 -0700900 ns.conn = C_CONNECTED;
901 }
902
Philipp Reisnerb411b362009-09-25 16:07:19 -0700903 /* Connection breaks down before we finished "Negotiating" */
904 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
905 get_ldev_if_state(mdev, D_NEGOTIATING)) {
906 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
907 ns.disk = mdev->new_state_tmp.disk;
908 ns.pdsk = mdev->new_state_tmp.pdsk;
909 } else {
910 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
911 ns.disk = D_DISKLESS;
912 ns.pdsk = D_UNKNOWN;
913 }
914 put_ldev(mdev);
915 }
916
Philipp Reisnerab17b68f2010-11-17 16:54:36 +0100917 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
918 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
919 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
920 ns.disk = D_UP_TO_DATE;
921 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
922 ns.pdsk = D_UP_TO_DATE;
923 }
924
925 /* Implications of the connection stat on the disk states */
926 disk_min = D_DISKLESS;
927 disk_max = D_UP_TO_DATE;
928 pdsk_min = D_INCONSISTENT;
929 pdsk_max = D_UNKNOWN;
930 switch ((enum drbd_conns)ns.conn) {
931 case C_WF_BITMAP_T:
932 case C_PAUSED_SYNC_T:
933 case C_STARTING_SYNC_T:
934 case C_WF_SYNC_UUID:
935 case C_BEHIND:
936 disk_min = D_INCONSISTENT;
937 disk_max = D_OUTDATED;
938 pdsk_min = D_UP_TO_DATE;
939 pdsk_max = D_UP_TO_DATE;
940 break;
941 case C_VERIFY_S:
942 case C_VERIFY_T:
943 disk_min = D_UP_TO_DATE;
944 disk_max = D_UP_TO_DATE;
945 pdsk_min = D_UP_TO_DATE;
946 pdsk_max = D_UP_TO_DATE;
947 break;
948 case C_CONNECTED:
949 disk_min = D_DISKLESS;
950 disk_max = D_UP_TO_DATE;
951 pdsk_min = D_DISKLESS;
952 pdsk_max = D_UP_TO_DATE;
953 break;
954 case C_WF_BITMAP_S:
955 case C_PAUSED_SYNC_S:
956 case C_STARTING_SYNC_S:
957 case C_AHEAD:
958 disk_min = D_UP_TO_DATE;
959 disk_max = D_UP_TO_DATE;
960 pdsk_min = D_INCONSISTENT;
961 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
962 break;
963 case C_SYNC_TARGET:
964 disk_min = D_INCONSISTENT;
965 disk_max = D_INCONSISTENT;
966 pdsk_min = D_UP_TO_DATE;
967 pdsk_max = D_UP_TO_DATE;
968 break;
969 case C_SYNC_SOURCE:
970 disk_min = D_UP_TO_DATE;
971 disk_max = D_UP_TO_DATE;
972 pdsk_min = D_INCONSISTENT;
973 pdsk_max = D_INCONSISTENT;
974 break;
975 case C_STANDALONE:
976 case C_DISCONNECTING:
977 case C_UNCONNECTED:
978 case C_TIMEOUT:
979 case C_BROKEN_PIPE:
980 case C_NETWORK_FAILURE:
981 case C_PROTOCOL_ERROR:
982 case C_TEAR_DOWN:
983 case C_WF_CONNECTION:
984 case C_WF_REPORT_PARAMS:
985 case C_MASK:
986 break;
987 }
988 if (ns.disk > disk_max)
989 ns.disk = disk_max;
990
991 if (ns.disk < disk_min) {
992 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
993 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
994 ns.disk = disk_min;
995 }
996 if (ns.pdsk > pdsk_max)
997 ns.pdsk = pdsk_max;
998
999 if (ns.pdsk < pdsk_min) {
1000 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
1001 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
1002 ns.pdsk = pdsk_min;
1003 }
1004
Philipp Reisnerb411b362009-09-25 16:07:19 -07001005 if (fp == FP_STONITH &&
Philipp Reisner0a492162009-10-21 13:08:29 +02001006 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1007 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001008 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
Philipp Reisner265be2d2010-05-31 10:14:17 +02001009
1010 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1011 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1012 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001013 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001014
1015 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1016 if (ns.conn == C_SYNC_SOURCE)
1017 ns.conn = C_PAUSED_SYNC_S;
1018 if (ns.conn == C_SYNC_TARGET)
1019 ns.conn = C_PAUSED_SYNC_T;
1020 } else {
1021 if (ns.conn == C_PAUSED_SYNC_S)
1022 ns.conn = C_SYNC_SOURCE;
1023 if (ns.conn == C_PAUSED_SYNC_T)
1024 ns.conn = C_SYNC_TARGET;
1025 }
1026
1027 return ns;
1028}
1029
1030/* helper for __drbd_set_state */
1031static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1032{
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001033 if (mdev->agreed_pro_version < 90)
1034 mdev->ov_start_sector = 0;
1035 mdev->rs_total = drbd_bm_bits(mdev);
1036 mdev->ov_position = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001037 if (cs == C_VERIFY_T) {
1038 /* starting online verify from an arbitrary position
1039 * does not fit well into the existing protocol.
1040 * on C_VERIFY_T, we initialize ov_left and friends
1041 * implicitly in receive_DataRequest once the
1042 * first P_OV_REQUEST is received */
1043 mdev->ov_start_sector = ~(sector_t)0;
1044 } else {
1045 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001046 if (bit >= mdev->rs_total) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001047 mdev->ov_start_sector =
1048 BM_BIT_TO_SECT(mdev->rs_total - 1);
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001049 mdev->rs_total = 1;
1050 } else
1051 mdev->rs_total -= bit;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001052 mdev->ov_position = mdev->ov_start_sector;
1053 }
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001054 mdev->ov_left = mdev->rs_total;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001055}
1056
Philipp Reisner07782862010-08-31 12:00:50 +02001057static void drbd_resume_al(struct drbd_conf *mdev)
1058{
1059 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1060 dev_info(DEV, "Resumed AL updates\n");
1061}
1062
Philipp Reisnerb411b362009-09-25 16:07:19 -07001063/**
1064 * __drbd_set_state() - Set a new DRBD state
1065 * @mdev: DRBD device.
1066 * @ns: new state.
1067 * @flags: Flags
1068 * @done: Optional completion, that will get completed after the after_state_ch() finished
1069 *
1070 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1071 */
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01001072enum drbd_state_rv
1073__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1074 enum chg_state_flags flags, struct completion *done)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001075{
1076 union drbd_state os;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01001077 enum drbd_state_rv rv = SS_SUCCESS;
Lars Ellenberg02bc7172010-09-06 12:13:20 +02001078 const char *warn_sync_abort = NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001079 struct after_state_chg_work *ascw;
1080
1081 os = mdev->state;
1082
1083 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1084
1085 if (ns.i == os.i)
1086 return SS_NOTHING_TO_DO;
1087
1088 if (!(flags & CS_HARD)) {
1089 /* pre-state-change checks ; only look at ns */
1090 /* See drbd_state_sw_errors in drbd_strings.c */
1091
1092 rv = is_valid_state(mdev, ns);
1093 if (rv < SS_SUCCESS) {
1094 /* If the old state was illegal as well, then let
1095 this happen...*/
1096
Philipp Reisner1616a252010-06-10 16:55:15 +02001097 if (is_valid_state(mdev, os) == rv)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001098 rv = is_valid_state_transition(mdev, ns, os);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001099 } else
1100 rv = is_valid_state_transition(mdev, ns, os);
1101 }
1102
1103 if (rv < SS_SUCCESS) {
1104 if (flags & CS_VERBOSE)
1105 print_st_err(mdev, os, ns, rv);
1106 return rv;
1107 }
1108
1109 if (warn_sync_abort)
Lars Ellenberg02bc7172010-09-06 12:13:20 +02001110 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001111
1112 {
Andreas Gruenbacher662d91a2010-12-07 03:01:41 +01001113 char *pbp, pb[300];
1114 pbp = pb;
1115 *pbp = 0;
1116 if (ns.role != os.role)
1117 pbp += sprintf(pbp, "role( %s -> %s ) ",
1118 drbd_role_str(os.role),
1119 drbd_role_str(ns.role));
1120 if (ns.peer != os.peer)
1121 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1122 drbd_role_str(os.peer),
1123 drbd_role_str(ns.peer));
1124 if (ns.conn != os.conn)
1125 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1126 drbd_conn_str(os.conn),
1127 drbd_conn_str(ns.conn));
1128 if (ns.disk != os.disk)
1129 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1130 drbd_disk_str(os.disk),
1131 drbd_disk_str(ns.disk));
1132 if (ns.pdsk != os.pdsk)
1133 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1134 drbd_disk_str(os.pdsk),
1135 drbd_disk_str(ns.pdsk));
1136 if (is_susp(ns) != is_susp(os))
1137 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1138 is_susp(os),
1139 is_susp(ns));
1140 if (ns.aftr_isp != os.aftr_isp)
1141 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1142 os.aftr_isp,
1143 ns.aftr_isp);
1144 if (ns.peer_isp != os.peer_isp)
1145 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1146 os.peer_isp,
1147 ns.peer_isp);
1148 if (ns.user_isp != os.user_isp)
1149 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1150 os.user_isp,
1151 ns.user_isp);
1152 dev_info(DEV, "%s\n", pb);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001153 }
1154
1155 /* solve the race between becoming unconfigured,
1156 * worker doing the cleanup, and
1157 * admin reconfiguring us:
1158 * on (re)configure, first set CONFIG_PENDING,
1159 * then wait for a potentially exiting worker,
1160 * start the worker, and schedule one no_op.
1161 * then proceed with configuration.
1162 */
1163 if (ns.disk == D_DISKLESS &&
1164 ns.conn == C_STANDALONE &&
1165 ns.role == R_SECONDARY &&
1166 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1167 set_bit(DEVICE_DYING, &mdev->flags);
1168
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001169 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1170 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1171 * drbd_ldev_destroy() won't happen before our corresponding
1172 * after_state_ch works run, where we put_ldev again. */
1173 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1174 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1175 atomic_inc(&mdev->local_cnt);
1176
1177 mdev->state = ns;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01001178
1179 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1180 drbd_print_uuids(mdev, "attached to UUIDs");
1181
Philipp Reisnerb411b362009-09-25 16:07:19 -07001182 wake_up(&mdev->misc_wait);
1183 wake_up(&mdev->state_wait);
1184
Philipp Reisnerb411b362009-09-25 16:07:19 -07001185 /* aborted verify run. log the last position */
1186 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1187 ns.conn < C_CONNECTED) {
1188 mdev->ov_start_sector =
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001189 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001190 dev_info(DEV, "Online Verify reached sector %llu\n",
1191 (unsigned long long)mdev->ov_start_sector);
1192 }
1193
1194 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1195 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1196 dev_info(DEV, "Syncer continues.\n");
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001197 mdev->rs_paused += (long)jiffies
1198 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
Philipp Reisner63106d32010-09-01 15:47:15 +02001199 if (ns.conn == C_SYNC_TARGET)
1200 mod_timer(&mdev->resync_timer, jiffies);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001201 }
1202
1203 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1204 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1205 dev_info(DEV, "Resync suspended\n");
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001206 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001207 }
1208
1209 if (os.conn == C_CONNECTED &&
1210 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001211 unsigned long now = jiffies;
1212 int i;
1213
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001214 set_ov_position(mdev, ns.conn);
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001215 mdev->rs_start = now;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001216 mdev->rs_last_events = 0;
1217 mdev->rs_last_sect_ev = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001218 mdev->ov_last_oos_size = 0;
1219 mdev->ov_last_oos_start = 0;
1220
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001221 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
Lars Ellenberg30b743a2010-11-05 09:39:06 +01001222 mdev->rs_mark_left[i] = mdev->ov_left;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02001223 mdev->rs_mark_time[i] = now;
1224 }
1225
Lars Ellenberg2649f082010-11-05 10:05:47 +01001226 drbd_rs_controller_reset(mdev);
1227
Philipp Reisnerb411b362009-09-25 16:07:19 -07001228 if (ns.conn == C_VERIFY_S) {
1229 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1230 (unsigned long long)mdev->ov_position);
1231 mod_timer(&mdev->resync_timer, jiffies);
1232 }
1233 }
1234
1235 if (get_ldev(mdev)) {
1236 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1237 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1238 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1239
1240 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1241 mdf |= MDF_CRASHED_PRIMARY;
1242 if (mdev->state.role == R_PRIMARY ||
1243 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1244 mdf |= MDF_PRIMARY_IND;
1245 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1246 mdf |= MDF_CONNECTED_IND;
1247 if (mdev->state.disk > D_INCONSISTENT)
1248 mdf |= MDF_CONSISTENT;
1249 if (mdev->state.disk > D_OUTDATED)
1250 mdf |= MDF_WAS_UP_TO_DATE;
1251 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1252 mdf |= MDF_PEER_OUT_DATED;
1253 if (mdf != mdev->ldev->md.flags) {
1254 mdev->ldev->md.flags = mdf;
1255 drbd_md_mark_dirty(mdev);
1256 }
1257 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1258 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1259 put_ldev(mdev);
1260 }
1261
1262 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1263 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1264 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1265 set_bit(CONSIDER_RESYNC, &mdev->flags);
1266
1267 /* Receiver should clean up itself */
1268 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1269 drbd_thread_stop_nowait(&mdev->receiver);
1270
1271 /* Now the receiver finished cleaning up itself, it should die */
1272 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1273 drbd_thread_stop_nowait(&mdev->receiver);
1274
1275 /* Upon network failure, we need to restart the receiver. */
1276 if (os.conn > C_TEAR_DOWN &&
1277 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1278 drbd_thread_restart_nowait(&mdev->receiver);
1279
Philipp Reisner07782862010-08-31 12:00:50 +02001280 /* Resume AL writing if we get a connection */
1281 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1282 drbd_resume_al(mdev);
1283
Philipp Reisnerb411b362009-09-25 16:07:19 -07001284 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1285 if (ascw) {
1286 ascw->os = os;
1287 ascw->ns = ns;
1288 ascw->flags = flags;
1289 ascw->w.cb = w_after_state_ch;
1290 ascw->done = done;
1291 drbd_queue_work(&mdev->data.work, &ascw->w);
1292 } else {
1293 dev_warn(DEV, "Could not kmalloc an ascw\n");
1294 }
1295
1296 return rv;
1297}
1298
1299static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1300{
1301 struct after_state_chg_work *ascw =
1302 container_of(w, struct after_state_chg_work, w);
1303 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1304 if (ascw->flags & CS_WAIT_COMPLETE) {
1305 D_ASSERT(ascw->done != NULL);
1306 complete(ascw->done);
1307 }
1308 kfree(ascw);
1309
1310 return 1;
1311}
1312
1313static void abw_start_sync(struct drbd_conf *mdev, int rv)
1314{
1315 if (rv) {
1316 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1317 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1318 return;
1319 }
1320
1321 switch (mdev->state.conn) {
1322 case C_STARTING_SYNC_T:
1323 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1324 break;
1325 case C_STARTING_SYNC_S:
1326 drbd_start_resync(mdev, C_SYNC_SOURCE);
1327 break;
1328 }
1329}
1330
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001331int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1332 int (*io_fn)(struct drbd_conf *),
1333 char *why, enum bm_flag flags)
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001334{
1335 int rv;
1336
1337 D_ASSERT(current == mdev->worker.task);
1338
1339 /* open coded non-blocking drbd_suspend_io(mdev); */
1340 set_bit(SUSPEND_IO, &mdev->flags);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001341
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001342 drbd_bm_lock(mdev, why, flags);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001343 rv = io_fn(mdev);
1344 drbd_bm_unlock(mdev);
1345
1346 drbd_resume_io(mdev);
1347
1348 return rv;
1349}
1350
Philipp Reisnerb411b362009-09-25 16:07:19 -07001351/**
1352 * after_state_ch() - Perform after state change actions that may sleep
1353 * @mdev: DRBD device.
1354 * @os: old state.
1355 * @ns: new state.
1356 * @flags: Flags
1357 */
1358static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1359 union drbd_state ns, enum chg_state_flags flags)
1360{
1361 enum drbd_fencing_p fp;
Philipp Reisner67098932010-06-24 16:24:25 +02001362 enum drbd_req_event what = nothing;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001363 union drbd_state nsm = (union drbd_state){ .i = -1 };
Philipp Reisnerb411b362009-09-25 16:07:19 -07001364
1365 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1366 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1367 if (mdev->p_uuid)
1368 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1369 }
1370
1371 fp = FP_DONT_CARE;
1372 if (get_ldev(mdev)) {
1373 fp = mdev->ldev->dc.fencing;
1374 put_ldev(mdev);
1375 }
1376
1377 /* Inform userspace about the change... */
1378 drbd_bcast_state(mdev, ns);
1379
1380 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1381 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1382 drbd_khelper(mdev, "pri-on-incon-degr");
1383
1384 /* Here we have the actions that are performed after a
1385 state change. This function might sleep */
1386
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001387 nsm.i = -1;
1388 if (ns.susp_nod) {
Philipp Reisner3f986882010-12-20 14:48:20 +01001389 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1390 what = resend;
Philipp Reisner265be2d2010-05-31 10:14:17 +02001391
Philipp Reisner67098932010-06-24 16:24:25 +02001392 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
Philipp Reisner3f986882010-12-20 14:48:20 +01001393 what = restart_frozen_disk_io;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001394
Philipp Reisner3f986882010-12-20 14:48:20 +01001395 if (what != nothing)
1396 nsm.susp_nod = 0;
Philipp Reisner265be2d2010-05-31 10:14:17 +02001397 }
1398
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001399 if (ns.susp_fen) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001400 /* case1: The outdate peer handler is successful: */
1401 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001402 tl_clear(mdev);
Philipp Reisner43a51822010-06-11 11:26:34 +02001403 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1404 drbd_uuid_new_current(mdev);
1405 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner43a51822010-06-11 11:26:34 +02001406 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001407 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001408 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001409 spin_unlock_irq(&mdev->req_lock);
1410 }
Philipp Reisner43a51822010-06-11 11:26:34 +02001411 /* case2: The connection was established again: */
1412 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1413 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner67098932010-06-24 16:24:25 +02001414 what = resend;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001415 nsm.susp_fen = 0;
Philipp Reisner43a51822010-06-11 11:26:34 +02001416 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001417 }
Philipp Reisner67098932010-06-24 16:24:25 +02001418
1419 if (what != nothing) {
1420 spin_lock_irq(&mdev->req_lock);
1421 _tl_restart(mdev, what);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001422 nsm.i &= mdev->state.i;
1423 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
Philipp Reisner67098932010-06-24 16:24:25 +02001424 spin_unlock_irq(&mdev->req_lock);
1425 }
1426
Lars Ellenberg5a22db82010-12-17 21:14:23 +01001427 /* Became sync source. With protocol >= 96, we still need to send out
1428 * the sync uuid now. Need to do that before any drbd_send_state, or
1429 * the other side may go "paused sync" before receiving the sync uuids,
1430 * which is unexpected. */
1431 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1432 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1433 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1434 drbd_gen_and_send_sync_uuid(mdev);
1435 put_ldev(mdev);
1436 }
1437
Philipp Reisnerb411b362009-09-25 16:07:19 -07001438 /* Do not change the order of the if above and the two below... */
1439 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1440 drbd_send_uuids(mdev);
1441 drbd_send_state(mdev);
1442 }
Lars Ellenberg54b956a2011-01-20 10:47:53 +01001443 /* No point in queuing send_bitmap if we don't have a connection
1444 * anymore, so check also the _current_ state, not only the new state
1445 * at the time this work was queued. */
1446 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1447 mdev->state.conn == C_WF_BITMAP_S)
1448 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001449 "send_bitmap (WFBitMapS)",
1450 BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001451
1452 /* Lost contact to peer's copy of the data */
1453 if ((os.pdsk >= D_INCONSISTENT &&
1454 os.pdsk != D_UNKNOWN &&
1455 os.pdsk != D_OUTDATED)
1456 && (ns.pdsk < D_INCONSISTENT ||
1457 ns.pdsk == D_UNKNOWN ||
1458 ns.pdsk == D_OUTDATED)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001459 if (get_ldev(mdev)) {
1460 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001461 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001462 if (is_susp(mdev->state)) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001463 set_bit(NEW_CUR_UUID, &mdev->flags);
1464 } else {
1465 drbd_uuid_new_current(mdev);
1466 drbd_send_uuids(mdev);
1467 }
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001468 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001469 put_ldev(mdev);
1470 }
1471 }
1472
1473 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001474 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001475 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001476 drbd_send_uuids(mdev);
1477 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001478
1479 /* D_DISKLESS Peer becomes secondary */
1480 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001481 /* We may still be Primary ourselves.
1482 * No harm done if the bitmap still changes,
1483 * redirtied pages will follow later. */
1484 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1485 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001486 put_ldev(mdev);
1487 }
1488
Lars Ellenberg06d33e92010-12-18 17:00:59 +01001489 /* Write out all changed bits on demote.
1490 * Though, no need to da that just yet
1491 * if there is a resync going on still */
1492 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1493 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001494 /* No changes to the bitmap expected this time, so assert that,
1495 * even though no harm was done if it did change. */
1496 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1497 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001498 put_ldev(mdev);
1499 }
1500
1501 /* Last part of the attaching process ... */
1502 if (ns.conn >= C_CONNECTED &&
1503 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001504 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001505 drbd_send_uuids(mdev);
1506 drbd_send_state(mdev);
1507 }
1508
1509 /* We want to pause/continue resync, tell peer. */
1510 if (ns.conn >= C_CONNECTED &&
1511 ((os.aftr_isp != ns.aftr_isp) ||
1512 (os.user_isp != ns.user_isp)))
1513 drbd_send_state(mdev);
1514
1515 /* In case one of the isp bits got set, suspend other devices. */
1516 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1517 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1518 suspend_other_sg(mdev);
1519
1520 /* Make sure the peer gets informed about eventual state
1521 changes (ISP bits) while we were in WFReportParams. */
1522 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1523 drbd_send_state(mdev);
1524
Philipp Reisner67531712010-10-27 12:21:30 +02001525 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1526 drbd_send_state(mdev);
1527
Philipp Reisnerb411b362009-09-25 16:07:19 -07001528 /* We are in the progress to start a full sync... */
1529 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1530 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001531 /* no other bitmap changes expected during this phase */
1532 drbd_queue_bitmap_io(mdev,
1533 &drbd_bmio_set_n_write, &abw_start_sync,
1534 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001535
1536 /* We are invalidating our self... */
1537 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1538 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001539 /* other bitmap operation expected during this phase */
1540 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1541 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001542
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001543 /* first half of local IO error, failure to attach,
1544 * or administrative detach */
1545 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1546 enum drbd_io_error_p eh;
1547 int was_io_error;
1548 /* corresponding get_ldev was in __drbd_set_state, to serialize
1549 * our cleanup here with the transition to D_DISKLESS,
1550 * so it is safe to dreference ldev here. */
1551 eh = mdev->ldev->dc.on_io_error;
1552 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1553
1554 /* current state still has to be D_FAILED,
1555 * there is only one way out: to D_DISKLESS,
1556 * and that may only happen after our put_ldev below. */
1557 if (mdev->state.disk != D_FAILED)
1558 dev_err(DEV,
1559 "ASSERT FAILED: disk is %s during detach\n",
1560 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001561
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001562 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001563 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001564 else
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001565 dev_err(DEV, "Sending state for detaching disk failed\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001566
1567 drbd_rs_cancel_all(mdev);
1568
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001569 /* In case we want to get something to stable storage still,
1570 * this may be the last chance.
1571 * Following put_ldev may transition to D_DISKLESS. */
1572 drbd_md_sync(mdev);
1573 put_ldev(mdev);
1574
1575 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001576 drbd_khelper(mdev, "local-io-error");
1577 }
1578
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001579 /* second half of local IO error, failure to attach,
1580 * or administrative detach,
1581 * after local_cnt references have reached zero again */
1582 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1583 /* We must still be diskless,
1584 * re-attach has to be serialized with this! */
1585 if (mdev->state.disk != D_DISKLESS)
1586 dev_err(DEV,
1587 "ASSERT FAILED: disk is %s while going diskless\n",
1588 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001589
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001590 mdev->rs_total = 0;
1591 mdev->rs_failed = 0;
1592 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001593
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001594 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001595 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001596 /* corresponding get_ldev in __drbd_set_state
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001597 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001598 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001599 }
1600
Philipp Reisner738a84b2011-03-03 00:21:30 +01001601 /* Notify peer that I had a local IO error, and did not detached.. */
1602 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1603 drbd_send_state(mdev);
1604
Philipp Reisnerb411b362009-09-25 16:07:19 -07001605 /* Disks got bigger while they were detached */
1606 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1607 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1608 if (ns.conn == C_CONNECTED)
1609 resync_after_online_grow(mdev);
1610 }
1611
1612 /* A resync finished or aborted, wake paused devices... */
1613 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1614 (os.peer_isp && !ns.peer_isp) ||
1615 (os.user_isp && !ns.user_isp))
1616 resume_next_sg(mdev);
1617
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001618 /* sync target done with resync. Explicitly notify peer, even though
1619 * it should (at least for non-empty resyncs) already know itself. */
1620 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1621 drbd_send_state(mdev);
1622
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001623 /* This triggers bitmap writeout of potentially still unwritten pages
1624 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001625 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001626 * For resync aborted because of local disk failure, we cannot do
1627 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001628 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001629 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001630 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1631 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1632 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001633 put_ldev(mdev);
1634 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001635
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001636 /* free tl_hash if we Got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001637 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001638 drbd_free_tl_hash(mdev);
1639
Philipp Reisnerb411b362009-09-25 16:07:19 -07001640 /* Upon network connection, we need to start the receiver */
1641 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1642 drbd_thread_start(&mdev->receiver);
1643
1644 /* Terminate worker thread if we are unconfigured - it will be
1645 restarted as needed... */
1646 if (ns.disk == D_DISKLESS &&
1647 ns.conn == C_STANDALONE &&
1648 ns.role == R_SECONDARY) {
1649 if (os.aftr_isp != ns.aftr_isp)
1650 resume_next_sg(mdev);
1651 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1652 if (test_bit(DEVICE_DYING, &mdev->flags))
1653 drbd_thread_stop_nowait(&mdev->worker);
1654 }
1655
1656 drbd_md_sync(mdev);
1657}
1658
1659
1660static int drbd_thread_setup(void *arg)
1661{
1662 struct drbd_thread *thi = (struct drbd_thread *) arg;
1663 struct drbd_conf *mdev = thi->mdev;
1664 unsigned long flags;
1665 int retval;
1666
1667restart:
1668 retval = thi->function(thi);
1669
1670 spin_lock_irqsave(&thi->t_lock, flags);
1671
1672 /* if the receiver has been "Exiting", the last thing it did
1673 * was set the conn state to "StandAlone",
1674 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1675 * and receiver thread will be "started".
1676 * drbd_thread_start needs to set "Restarting" in that case.
1677 * t_state check and assignment needs to be within the same spinlock,
1678 * so either thread_start sees Exiting, and can remap to Restarting,
1679 * or thread_start see None, and can proceed as normal.
1680 */
1681
1682 if (thi->t_state == Restarting) {
1683 dev_info(DEV, "Restarting %s\n", current->comm);
1684 thi->t_state = Running;
1685 spin_unlock_irqrestore(&thi->t_lock, flags);
1686 goto restart;
1687 }
1688
1689 thi->task = NULL;
1690 thi->t_state = None;
1691 smp_mb();
1692 complete(&thi->stop);
1693 spin_unlock_irqrestore(&thi->t_lock, flags);
1694
1695 dev_info(DEV, "Terminating %s\n", current->comm);
1696
1697 /* Release mod reference taken when thread was started */
1698 module_put(THIS_MODULE);
1699 return retval;
1700}
1701
1702static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1703 int (*func) (struct drbd_thread *))
1704{
1705 spin_lock_init(&thi->t_lock);
1706 thi->task = NULL;
1707 thi->t_state = None;
1708 thi->function = func;
1709 thi->mdev = mdev;
1710}
1711
1712int drbd_thread_start(struct drbd_thread *thi)
1713{
1714 struct drbd_conf *mdev = thi->mdev;
1715 struct task_struct *nt;
1716 unsigned long flags;
1717
1718 const char *me =
1719 thi == &mdev->receiver ? "receiver" :
1720 thi == &mdev->asender ? "asender" :
1721 thi == &mdev->worker ? "worker" : "NONSENSE";
1722
1723 /* is used from state engine doing drbd_thread_stop_nowait,
1724 * while holding the req lock irqsave */
1725 spin_lock_irqsave(&thi->t_lock, flags);
1726
1727 switch (thi->t_state) {
1728 case None:
1729 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1730 me, current->comm, current->pid);
1731
1732 /* Get ref on module for thread - this is released when thread exits */
1733 if (!try_module_get(THIS_MODULE)) {
1734 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1735 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001736 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001737 }
1738
1739 init_completion(&thi->stop);
1740 D_ASSERT(thi->task == NULL);
1741 thi->reset_cpu_mask = 1;
1742 thi->t_state = Running;
1743 spin_unlock_irqrestore(&thi->t_lock, flags);
1744 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1745
1746 nt = kthread_create(drbd_thread_setup, (void *) thi,
1747 "drbd%d_%s", mdev_to_minor(mdev), me);
1748
1749 if (IS_ERR(nt)) {
1750 dev_err(DEV, "Couldn't start thread\n");
1751
1752 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001753 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001754 }
1755 spin_lock_irqsave(&thi->t_lock, flags);
1756 thi->task = nt;
1757 thi->t_state = Running;
1758 spin_unlock_irqrestore(&thi->t_lock, flags);
1759 wake_up_process(nt);
1760 break;
1761 case Exiting:
1762 thi->t_state = Restarting;
1763 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1764 me, current->comm, current->pid);
1765 /* fall through */
1766 case Running:
1767 case Restarting:
1768 default:
1769 spin_unlock_irqrestore(&thi->t_lock, flags);
1770 break;
1771 }
1772
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001773 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001774}
1775
1776
1777void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1778{
1779 unsigned long flags;
1780
1781 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1782
1783 /* may be called from state engine, holding the req lock irqsave */
1784 spin_lock_irqsave(&thi->t_lock, flags);
1785
1786 if (thi->t_state == None) {
1787 spin_unlock_irqrestore(&thi->t_lock, flags);
1788 if (restart)
1789 drbd_thread_start(thi);
1790 return;
1791 }
1792
1793 if (thi->t_state != ns) {
1794 if (thi->task == NULL) {
1795 spin_unlock_irqrestore(&thi->t_lock, flags);
1796 return;
1797 }
1798
1799 thi->t_state = ns;
1800 smp_mb();
1801 init_completion(&thi->stop);
1802 if (thi->task != current)
1803 force_sig(DRBD_SIGKILL, thi->task);
1804
1805 }
1806
1807 spin_unlock_irqrestore(&thi->t_lock, flags);
1808
1809 if (wait)
1810 wait_for_completion(&thi->stop);
1811}
1812
1813#ifdef CONFIG_SMP
1814/**
1815 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1816 * @mdev: DRBD device.
1817 *
1818 * Forces all threads of a device onto the same CPU. This is beneficial for
1819 * DRBD's performance. May be overwritten by user's configuration.
1820 */
1821void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1822{
1823 int ord, cpu;
1824
1825 /* user override. */
1826 if (cpumask_weight(mdev->cpu_mask))
1827 return;
1828
1829 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1830 for_each_online_cpu(cpu) {
1831 if (ord-- == 0) {
1832 cpumask_set_cpu(cpu, mdev->cpu_mask);
1833 return;
1834 }
1835 }
1836 /* should not be reached */
1837 cpumask_setall(mdev->cpu_mask);
1838}
1839
1840/**
1841 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1842 * @mdev: DRBD device.
1843 *
1844 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1845 * prematurely.
1846 */
1847void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1848{
1849 struct task_struct *p = current;
1850 struct drbd_thread *thi =
1851 p == mdev->asender.task ? &mdev->asender :
1852 p == mdev->receiver.task ? &mdev->receiver :
1853 p == mdev->worker.task ? &mdev->worker :
1854 NULL;
1855 ERR_IF(thi == NULL)
1856 return;
1857 if (!thi->reset_cpu_mask)
1858 return;
1859 thi->reset_cpu_mask = 0;
1860 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1861}
1862#endif
1863
1864/* the appropriate socket mutex must be held already */
1865int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001866 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001867 size_t size, unsigned msg_flags)
1868{
1869 int sent, ok;
1870
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001871 ERR_IF(!h) return false;
1872 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001873
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01001874 h->magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001875 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001876 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001877
Philipp Reisnerb411b362009-09-25 16:07:19 -07001878 sent = drbd_send(mdev, sock, h, size, msg_flags);
1879
1880 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001881 if (!ok && !signal_pending(current))
1882 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001883 cmdname(cmd), (int)size, sent);
1884 return ok;
1885}
1886
1887/* don't pass the socket. we may only look at it
1888 * when we hold the appropriate socket mutex.
1889 */
1890int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001891 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001892{
1893 int ok = 0;
1894 struct socket *sock;
1895
1896 if (use_data_socket) {
1897 mutex_lock(&mdev->data.mutex);
1898 sock = mdev->data.socket;
1899 } else {
1900 mutex_lock(&mdev->meta.mutex);
1901 sock = mdev->meta.socket;
1902 }
1903
1904 /* drbd_disconnect() could have called drbd_free_sock()
1905 * while we were waiting in down()... */
1906 if (likely(sock != NULL))
1907 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1908
1909 if (use_data_socket)
1910 mutex_unlock(&mdev->data.mutex);
1911 else
1912 mutex_unlock(&mdev->meta.mutex);
1913 return ok;
1914}
1915
1916int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1917 size_t size)
1918{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001919 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001920 int ok;
1921
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01001922 h.magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001923 h.command = cpu_to_be16(cmd);
1924 h.length = cpu_to_be16(size);
1925
1926 if (!drbd_get_data_sock(mdev))
1927 return 0;
1928
Philipp Reisnerb411b362009-09-25 16:07:19 -07001929 ok = (sizeof(h) ==
1930 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1931 ok = ok && (size ==
1932 drbd_send(mdev, mdev->data.socket, data, size, 0));
1933
1934 drbd_put_data_sock(mdev);
1935
1936 return ok;
1937}
1938
1939int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1940{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001941 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001942 struct socket *sock;
1943 int size, rv;
1944 const int apv = mdev->agreed_pro_version;
1945
1946 size = apv <= 87 ? sizeof(struct p_rs_param)
1947 : apv == 88 ? sizeof(struct p_rs_param)
1948 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001949 : apv <= 94 ? sizeof(struct p_rs_param_89)
1950 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001951
1952 /* used from admin command context and receiver/worker context.
1953 * to avoid kmalloc, grab the socket right here,
1954 * then use the pre-allocated sbuf there */
1955 mutex_lock(&mdev->data.mutex);
1956 sock = mdev->data.socket;
1957
1958 if (likely(sock != NULL)) {
1959 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1960
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001961 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001962
1963 /* initialize verify_alg and csums_alg */
1964 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1965
1966 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001967 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1968 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1969 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1970 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001971
1972 if (apv >= 88)
1973 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1974 if (apv >= 89)
1975 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1976
1977 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1978 } else
1979 rv = 0; /* not ok */
1980
1981 mutex_unlock(&mdev->data.mutex);
1982
1983 return rv;
1984}
1985
1986int drbd_send_protocol(struct drbd_conf *mdev)
1987{
1988 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001989 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001990
1991 size = sizeof(struct p_protocol);
1992
1993 if (mdev->agreed_pro_version >= 87)
1994 size += strlen(mdev->net_conf->integrity_alg) + 1;
1995
1996 /* we must not recurse into our own queue,
1997 * as that is blocked during handshake */
1998 p = kmalloc(size, GFP_NOIO);
1999 if (p == NULL)
2000 return 0;
2001
2002 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2003 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2004 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2005 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002006 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2007
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002008 cf = 0;
2009 if (mdev->net_conf->want_lose)
2010 cf |= CF_WANT_LOSE;
2011 if (mdev->net_conf->dry_run) {
2012 if (mdev->agreed_pro_version >= 92)
2013 cf |= CF_DRY_RUN;
2014 else {
2015 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02002016 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002017 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002018 }
2019 }
2020 p->conn_flags = cpu_to_be32(cf);
2021
Philipp Reisnerb411b362009-09-25 16:07:19 -07002022 if (mdev->agreed_pro_version >= 87)
2023 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2024
2025 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002026 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002027 kfree(p);
2028 return rv;
2029}
2030
2031int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2032{
2033 struct p_uuids p;
2034 int i;
2035
2036 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2037 return 1;
2038
2039 for (i = UI_CURRENT; i < UI_SIZE; i++)
2040 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2041
2042 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2043 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2044 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2045 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2046 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2047 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2048
2049 put_ldev(mdev);
2050
2051 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002052 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002053}
2054
2055int drbd_send_uuids(struct drbd_conf *mdev)
2056{
2057 return _drbd_send_uuids(mdev, 0);
2058}
2059
2060int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2061{
2062 return _drbd_send_uuids(mdev, 8);
2063}
2064
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002065void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2066{
2067 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2068 u64 *uuid = mdev->ldev->md.uuid;
2069 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2070 text,
2071 (unsigned long long)uuid[UI_CURRENT],
2072 (unsigned long long)uuid[UI_BITMAP],
2073 (unsigned long long)uuid[UI_HISTORY_START],
2074 (unsigned long long)uuid[UI_HISTORY_END]);
2075 put_ldev(mdev);
2076 } else {
2077 dev_info(DEV, "%s effective data uuid: %016llX\n",
2078 text,
2079 (unsigned long long)mdev->ed_uuid);
2080 }
2081}
2082
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002083int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002084{
2085 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002086 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002087
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002088 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2089
Philipp Reisner4a23f262011-01-11 17:42:17 +01002090 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002091 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002092 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002093 drbd_md_sync(mdev);
2094 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002095
2096 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002097 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002098}
2099
Philipp Reisnere89b5912010-03-24 17:11:33 +01002100int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002101{
2102 struct p_sizes p;
2103 sector_t d_size, u_size;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002104 int q_order_type, max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002105 int ok;
2106
2107 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2108 D_ASSERT(mdev->ldev->backing_bdev);
2109 d_size = drbd_get_max_capacity(mdev->ldev);
2110 u_size = mdev->ldev->dc.disk_size;
2111 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002112 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2113 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002114 put_ldev(mdev);
2115 } else {
2116 d_size = 0;
2117 u_size = 0;
2118 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002119 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002120 }
2121
2122 p.d_size = cpu_to_be64(d_size);
2123 p.u_size = cpu_to_be64(u_size);
2124 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002125 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002126 p.queue_order_type = cpu_to_be16(q_order_type);
2127 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002128
2129 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002130 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002131 return ok;
2132}
2133
2134/**
2135 * drbd_send_state() - Sends the drbd state to the peer
2136 * @mdev: DRBD device.
2137 */
2138int drbd_send_state(struct drbd_conf *mdev)
2139{
2140 struct socket *sock;
2141 struct p_state p;
2142 int ok = 0;
2143
2144 /* Grab state lock so we wont send state if we're in the middle
2145 * of a cluster wide state change on another thread */
2146 drbd_state_lock(mdev);
2147
2148 mutex_lock(&mdev->data.mutex);
2149
2150 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2151 sock = mdev->data.socket;
2152
2153 if (likely(sock != NULL)) {
2154 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002155 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002156 }
2157
2158 mutex_unlock(&mdev->data.mutex);
2159
2160 drbd_state_unlock(mdev);
2161 return ok;
2162}
2163
2164int drbd_send_state_req(struct drbd_conf *mdev,
2165 union drbd_state mask, union drbd_state val)
2166{
2167 struct p_req_state p;
2168
2169 p.mask = cpu_to_be32(mask.i);
2170 p.val = cpu_to_be32(val.i);
2171
2172 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002173 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002174}
2175
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002176int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002177{
2178 struct p_req_state_reply p;
2179
2180 p.retcode = cpu_to_be32(retcode);
2181
2182 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002183 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002184}
2185
2186int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2187 struct p_compressed_bm *p,
2188 struct bm_xfer_ctx *c)
2189{
2190 struct bitstream bs;
2191 unsigned long plain_bits;
2192 unsigned long tmp;
2193 unsigned long rl;
2194 unsigned len;
2195 unsigned toggle;
2196 int bits;
2197
2198 /* may we use this feature? */
2199 if ((mdev->sync_conf.use_rle == 0) ||
2200 (mdev->agreed_pro_version < 90))
2201 return 0;
2202
2203 if (c->bit_offset >= c->bm_bits)
2204 return 0; /* nothing to do. */
2205
2206 /* use at most thus many bytes */
2207 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2208 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2209 /* plain bits covered in this code string */
2210 plain_bits = 0;
2211
2212 /* p->encoding & 0x80 stores whether the first run length is set.
2213 * bit offset is implicit.
2214 * start with toggle == 2 to be able to tell the first iteration */
2215 toggle = 2;
2216
2217 /* see how much plain bits we can stuff into one packet
2218 * using RLE and VLI. */
2219 do {
2220 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2221 : _drbd_bm_find_next(mdev, c->bit_offset);
2222 if (tmp == -1UL)
2223 tmp = c->bm_bits;
2224 rl = tmp - c->bit_offset;
2225
2226 if (toggle == 2) { /* first iteration */
2227 if (rl == 0) {
2228 /* the first checked bit was set,
2229 * store start value, */
2230 DCBP_set_start(p, 1);
2231 /* but skip encoding of zero run length */
2232 toggle = !toggle;
2233 continue;
2234 }
2235 DCBP_set_start(p, 0);
2236 }
2237
2238 /* paranoia: catch zero runlength.
2239 * can only happen if bitmap is modified while we scan it. */
2240 if (rl == 0) {
2241 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2242 "t:%u bo:%lu\n", toggle, c->bit_offset);
2243 return -1;
2244 }
2245
2246 bits = vli_encode_bits(&bs, rl);
2247 if (bits == -ENOBUFS) /* buffer full */
2248 break;
2249 if (bits <= 0) {
2250 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2251 return 0;
2252 }
2253
2254 toggle = !toggle;
2255 plain_bits += rl;
2256 c->bit_offset = tmp;
2257 } while (c->bit_offset < c->bm_bits);
2258
2259 len = bs.cur.b - p->code + !!bs.cur.bit;
2260
2261 if (plain_bits < (len << 3)) {
2262 /* incompressible with this method.
2263 * we need to rewind both word and bit position. */
2264 c->bit_offset -= plain_bits;
2265 bm_xfer_ctx_bit_to_word_offset(c);
2266 c->bit_offset = c->word_offset * BITS_PER_LONG;
2267 return 0;
2268 }
2269
2270 /* RLE + VLI was able to compress it just fine.
2271 * update c->word_offset. */
2272 bm_xfer_ctx_bit_to_word_offset(c);
2273
2274 /* store pad_bits */
2275 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2276
2277 return len;
2278}
2279
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002280/**
2281 * send_bitmap_rle_or_plain
2282 *
2283 * Return 0 when done, 1 when another iteration is needed, and a negative error
2284 * code upon failure.
2285 */
2286static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002287send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002288 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002289{
2290 struct p_compressed_bm *p = (void*)h;
2291 unsigned long num_words;
2292 int len;
2293 int ok;
2294
2295 len = fill_bitmap_rle_bits(mdev, p, c);
2296
2297 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002298 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002299
2300 if (len) {
2301 DCBP_set_code(p, RLE_VLI_Bits);
2302 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2303 sizeof(*p) + len, 0);
2304
2305 c->packets[0]++;
2306 c->bytes[0] += sizeof(*p) + len;
2307
2308 if (c->bit_offset >= c->bm_bits)
2309 len = 0; /* DONE */
2310 } else {
2311 /* was not compressible.
2312 * send a buffer full of plain text bits instead. */
2313 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2314 len = num_words * sizeof(long);
2315 if (len)
2316 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2317 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002318 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002319 c->word_offset += num_words;
2320 c->bit_offset = c->word_offset * BITS_PER_LONG;
2321
2322 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002323 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002324
2325 if (c->bit_offset > c->bm_bits)
2326 c->bit_offset = c->bm_bits;
2327 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002328 if (ok) {
2329 if (len == 0) {
2330 INFO_bm_xfer_stats(mdev, "send", c);
2331 return 0;
2332 } else
2333 return 1;
2334 }
2335 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002336}
2337
2338/* See the comment at receive_bitmap() */
2339int _drbd_send_bitmap(struct drbd_conf *mdev)
2340{
2341 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002342 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002343 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002344
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002345 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002346
2347 /* maybe we should use some per thread scratch page,
2348 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002349 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002350 if (!p) {
2351 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002352 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002353 }
2354
2355 if (get_ldev(mdev)) {
2356 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2357 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2358 drbd_bm_set_all(mdev);
2359 if (drbd_bm_write(mdev)) {
2360 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2361 * but otherwise process as per normal - need to tell other
2362 * side that a full resync is required! */
2363 dev_err(DEV, "Failed to write bitmap to disk!\n");
2364 } else {
2365 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2366 drbd_md_sync(mdev);
2367 }
2368 }
2369 put_ldev(mdev);
2370 }
2371
2372 c = (struct bm_xfer_ctx) {
2373 .bm_bits = drbd_bm_bits(mdev),
2374 .bm_words = drbd_bm_words(mdev),
2375 };
2376
2377 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002378 err = send_bitmap_rle_or_plain(mdev, p, &c);
2379 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002380
2381 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002382 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002383}
2384
/*
 * Send the bitmap while holding the data socket.
 * Returns 0 on success, 1 if sending failed, and -1 if the data socket could
 * not be obtained.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int failed;

	if (!drbd_get_data_sock(mdev))
		return -1;

	failed = !_drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev);

	return failed;
}
2395
2396int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2397{
2398 int ok;
2399 struct p_barrier_ack p;
2400
2401 p.barrier = barrier_nr;
2402 p.set_size = cpu_to_be32(set_size);
2403
2404 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002405 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002406 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002407 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002408 return ok;
2409}
2410
2411/**
2412 * _drbd_send_ack() - Sends an ack packet
2413 * @mdev: DRBD device.
2414 * @cmd: Packet command code.
2415 * @sector: sector, needs to be in big endian byte order
2416 * @blksize: size in byte, needs to be in big endian byte order
2417 * @block_id: Id, big endian byte order
2418 */
2419static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2420 u64 sector,
2421 u32 blksize,
2422 u64 block_id)
2423{
2424 int ok;
2425 struct p_block_ack p;
2426
2427 p.sector = sector;
2428 p.block_id = block_id;
2429 p.blksize = blksize;
2430 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2431
2432 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002433 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002434 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002435 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002436 return ok;
2437}
2438
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002439/* dp->sector and dp->block_id already/still in network byte order,
2440 * data_size is payload size according to dp->head,
2441 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002442int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002443 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002444{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002445 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2446 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002447 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2448 dp->block_id);
2449}
2450
2451int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2452 struct p_block_req *rp)
2453{
2454 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2455}
2456
2457/**
2458 * drbd_send_ack() - Sends an ack packet
2459 * @mdev: DRBD device.
2460 * @cmd: Packet command code.
2461 * @e: Epoch entry.
2462 */
2463int drbd_send_ack(struct drbd_conf *mdev,
2464 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2465{
2466 return _drbd_send_ack(mdev, cmd,
Andreas Gruenbacher010f6e62011-01-14 20:59:35 +01002467 cpu_to_be64(e->i.sector),
2468 cpu_to_be32(e->i.size),
Philipp Reisnerb411b362009-09-25 16:07:19 -07002469 e->block_id);
2470}
2471
2472/* This function misuses the block_id field to signal if the blocks
2473 * are is sync or not. */
2474int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2475 sector_t sector, int blksize, u64 block_id)
2476{
2477 return _drbd_send_ack(mdev, cmd,
2478 cpu_to_be64(sector),
2479 cpu_to_be32(blksize),
2480 cpu_to_be64(block_id));
2481}
2482
2483int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2484 sector_t sector, int size, u64 block_id)
2485{
2486 int ok;
2487 struct p_block_req p;
2488
2489 p.sector = cpu_to_be64(sector);
2490 p.block_id = block_id;
2491 p.blksize = cpu_to_be32(size);
2492
2493 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002494 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002495 return ok;
2496}
2497
2498int drbd_send_drequest_csum(struct drbd_conf *mdev,
2499 sector_t sector, int size,
2500 void *digest, int digest_size,
2501 enum drbd_packets cmd)
2502{
2503 int ok;
2504 struct p_block_req p;
2505
2506 p.sector = cpu_to_be64(sector);
Andreas Gruenbacher9a8e7752011-01-11 14:04:09 +01002507 p.block_id = ID_SYNCER /* unused */;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002508 p.blksize = cpu_to_be32(size);
2509
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002510 p.head.magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002511 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002512 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002513
2514 mutex_lock(&mdev->data.mutex);
2515
2516 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2517 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2518
2519 mutex_unlock(&mdev->data.mutex);
2520
2521 return ok;
2522}
2523
2524int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2525{
2526 int ok;
2527 struct p_block_req p;
2528
2529 p.sector = cpu_to_be64(sector);
Andreas Gruenbacher9a8e7752011-01-11 14:04:09 +01002530 p.block_id = ID_SYNCER /* unused */;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002531 p.blksize = cpu_to_be32(size);
2532
2533 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002534 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002535 return ok;
2536}
2537
2538/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002539 * returns false if we should retry,
2540 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002541 */
2542static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2543{
2544 int drop_it;
2545 /* long elapsed = (long)(jiffies - mdev->last_received); */
2546
2547 drop_it = mdev->meta.socket == sock
2548 || !mdev->asender.task
2549 || get_t_state(&mdev->asender) != Running
2550 || mdev->state.conn < C_CONNECTED;
2551
2552 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002553 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002554
2555 drop_it = !--mdev->ko_count;
2556 if (!drop_it) {
2557 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2558 current->comm, current->pid, mdev->ko_count);
2559 request_ping(mdev);
2560 }
2561
2562 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2563}
2564
2565/* The idea of sendpage seems to be to put some kind of reference
2566 * to the page into the skb, and to hand it over to the NIC. In
2567 * this process get_page() gets called.
2568 *
2569 * As soon as the page was really sent over the network put_page()
2570 * gets called by some part of the network layer. [ NIC driver? ]
2571 *
2572 * [ get_page() / put_page() increment/decrement the count. If count
2573 * reaches 0 the page will be freed. ]
2574 *
2575 * This works nicely with pages from FSs.
2576 * But this means that in protocol A we might signal IO completion too early!
2577 *
2578 * In order not to corrupt data during a resync we must make sure
2579 * that we do not reuse our own buffer pages (EEs) to early, therefore
2580 * we have the net_ee list.
2581 *
2582 * XFS seems to have problems, still, it submits pages with page_count == 0!
2583 * As a workaround, we disable sendpage on pages
2584 * with page_count == 0 or PageSlab.
2585 */
2586static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002587 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002588{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002589 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002590 kunmap(page);
2591 if (sent == size)
2592 mdev->send_cnt += size>>9;
2593 return sent == size;
2594}
2595
2596static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002597 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002598{
2599 mm_segment_t oldfs = get_fs();
2600 int sent, ok;
2601 int len = size;
2602
2603 /* e.g. XFS meta- & log-data is in slab pages, which have a
2604 * page_count of 0 and/or have PageSlab() set.
2605 * we cannot use send_page for those, as that does get_page();
2606 * put_page(); and would cause either a VM_BUG directly, or
2607 * __page_cache_release a page that would actually still be referenced
2608 * by someone, leading to some obscure delayed Oops somewhere else. */
2609 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002610 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002611
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002612 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002613 drbd_update_congested(mdev);
2614 set_fs(KERNEL_DS);
2615 do {
2616 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2617 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002618 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002619 if (sent == -EAGAIN) {
2620 if (we_should_drop_the_connection(mdev,
2621 mdev->data.socket))
2622 break;
2623 else
2624 continue;
2625 }
2626 if (sent <= 0) {
2627 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2628 __func__, (int)size, len, sent);
2629 break;
2630 }
2631 len -= sent;
2632 offset += sent;
2633 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2634 set_fs(oldfs);
2635 clear_bit(NET_CONGESTED, &mdev->flags);
2636
2637 ok = (len == 0);
2638 if (likely(ok))
2639 mdev->send_cnt += size>>9;
2640 return ok;
2641}
2642
2643static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2644{
2645 struct bio_vec *bvec;
2646 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002647 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002648 __bio_for_each_segment(bvec, bio, i, 0) {
2649 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002650 bvec->bv_offset, bvec->bv_len,
2651 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002652 return 0;
2653 }
2654 return 1;
2655}
2656
2657static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2658{
2659 struct bio_vec *bvec;
2660 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002661 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002662 __bio_for_each_segment(bvec, bio, i, 0) {
2663 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002664 bvec->bv_offset, bvec->bv_len,
2665 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002666 return 0;
2667 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002668 return 1;
2669}
2670
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002671static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2672{
2673 struct page *page = e->pages;
Andreas Gruenbacher010f6e62011-01-14 20:59:35 +01002674 unsigned len = e->i.size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002675 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002676 page_chain_for_each(page) {
2677 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002678 if (!_drbd_send_page(mdev, page, 0, l,
2679 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002680 return 0;
2681 len -= l;
2682 }
2683 return 1;
2684}
2685
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002686static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2687{
2688 if (mdev->agreed_pro_version >= 95)
2689 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002690 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2691 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2692 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2693 else
Jens Axboe721a9602011-03-09 11:56:30 +01002694 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002695}
2696
Philipp Reisnerb411b362009-09-25 16:07:19 -07002697/* Used to send write requests
2698 * R_PRIMARY -> Peer (P_DATA)
2699 */
2700int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2701{
2702 int ok = 1;
2703 struct p_data p;
2704 unsigned int dp_flags = 0;
2705 void *dgb;
2706 int dgs;
2707
2708 if (!drbd_get_data_sock(mdev))
2709 return 0;
2710
2711 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2712 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2713
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002714 if (req->i.size <= DRBD_MAX_SIZE_H80_PACKET) {
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002715 p.head.h80.magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002716 p.head.h80.command = cpu_to_be16(P_DATA);
2717 p.head.h80.length =
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002718 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->i.size);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002719 } else {
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002720 p.head.h95.magic = cpu_to_be16(DRBD_MAGIC_BIG);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002721 p.head.h95.command = cpu_to_be16(P_DATA);
2722 p.head.h95.length =
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002723 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->i.size);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002724 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002725
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002726 p.sector = cpu_to_be64(req->i.sector);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002727 p.block_id = (unsigned long)req;
2728 p.seq_num = cpu_to_be32(req->seq_num =
2729 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002730
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002731 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2732
Philipp Reisnerb411b362009-09-25 16:07:19 -07002733 if (mdev->state.conn >= C_SYNC_SOURCE &&
2734 mdev->state.conn <= C_PAUSED_SYNC_T)
2735 dp_flags |= DP_MAY_SET_IN_SYNC;
2736
2737 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002738 set_bit(UNPLUG_REMOTE, &mdev->flags);
2739 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002740 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002741 if (ok && dgs) {
2742 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002743 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002744 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002745 }
2746 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002747 /* For protocol A, we have to memcpy the payload into
2748 * socket buffers, as we may complete right away
2749 * as soon as we handed it over to tcp, at which point the data
2750 * pages may become invalid.
2751 *
2752 * For data-integrity enabled, we copy it as well, so we can be
2753 * sure that even if the bio pages may still be modified, it
2754 * won't change the data on the wire, thus if the digest checks
2755 * out ok after sending on this side, but does not fit on the
2756 * receiving side, we sure have detected corruption elsewhere.
2757 */
2758 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002759 ok = _drbd_send_bio(mdev, req->master_bio);
2760 else
2761 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002762
2763 /* double check digest, sometimes buffers have been modified in flight. */
2764 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002765 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002766 * currently supported in kernel crypto. */
2767 unsigned char digest[64];
2768 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2769 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2770 dev_warn(DEV,
2771 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002772 (unsigned long long)req->i.sector, req->i.size);
Lars Ellenberg470be442010-11-10 10:36:52 +01002773 }
2774 } /* else if (dgs > 64) {
2775 ... Be noisy about digest too large ...
2776 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002777 }
2778
2779 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002780
Philipp Reisnerb411b362009-09-25 16:07:19 -07002781 return ok;
2782}
2783
2784/* answer packet, used to send data back for read requests:
2785 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2786 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2787 */
2788int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2789 struct drbd_epoch_entry *e)
2790{
2791 int ok;
2792 struct p_data p;
2793 void *dgb;
2794 int dgs;
2795
2796 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2797 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2798
Andreas Gruenbacher010f6e62011-01-14 20:59:35 +01002799 if (e->i.size <= DRBD_MAX_SIZE_H80_PACKET) {
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002800 p.head.h80.magic = cpu_to_be32(DRBD_MAGIC);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002801 p.head.h80.command = cpu_to_be16(cmd);
2802 p.head.h80.length =
Andreas Gruenbacher010f6e62011-01-14 20:59:35 +01002803 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->i.size);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002804 } else {
Andreas Gruenbacherca9bc122011-01-11 13:47:24 +01002805 p.head.h95.magic = cpu_to_be16(DRBD_MAGIC_BIG);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002806 p.head.h95.command = cpu_to_be16(cmd);
2807 p.head.h95.length =
Andreas Gruenbacher010f6e62011-01-14 20:59:35 +01002808 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->i.size);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002809 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002810
Andreas Gruenbacher010f6e62011-01-14 20:59:35 +01002811 p.sector = cpu_to_be64(e->i.sector);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002812 p.block_id = e->block_id;
2813 /* p.seq_num = 0; No sequence numbers here.. */
2814
2815 /* Only called by our kernel thread.
2816 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2817 * in response to admin command or module unload.
2818 */
2819 if (!drbd_get_data_sock(mdev))
2820 return 0;
2821
Philipp Reisner0b70a132010-08-20 13:36:10 +02002822 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002823 if (ok && dgs) {
2824 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002825 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002826 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002827 }
2828 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002829 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002830
2831 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002832
Philipp Reisnerb411b362009-09-25 16:07:19 -07002833 return ok;
2834}
2835
Philipp Reisner73a01a12010-10-27 14:33:00 +02002836int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2837{
2838 struct p_block_desc p;
2839
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002840 p.sector = cpu_to_be64(req->i.sector);
2841 p.blksize = cpu_to_be32(req->i.size);
Philipp Reisner73a01a12010-10-27 14:33:00 +02002842
2843 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2844}
2845
Philipp Reisnerb411b362009-09-25 16:07:19 -07002846/*
2847 drbd_send distinguishes two cases:
2848
2849 Packets sent via the data socket "sock"
2850 and packets sent via the meta data socket "msock"
2851
2852 sock msock
2853 -----------------+-------------------------+------------------------------
2854 timeout conf.timeout / 2 conf.timeout / 2
2855 timeout action send a ping via msock Abort communication
2856 and close all sockets
2857*/
2858
2859/*
2860 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2861 */
2862int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2863 void *buf, size_t size, unsigned msg_flags)
2864{
2865 struct kvec iov;
2866 struct msghdr msg;
2867 int rv, sent = 0;
2868
2869 if (!sock)
2870 return -1000;
2871
2872 /* THINK if (signal_pending) return ... ? */
2873
2874 iov.iov_base = buf;
2875 iov.iov_len = size;
2876
2877 msg.msg_name = NULL;
2878 msg.msg_namelen = 0;
2879 msg.msg_control = NULL;
2880 msg.msg_controllen = 0;
2881 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2882
2883 if (sock == mdev->data.socket) {
2884 mdev->ko_count = mdev->net_conf->ko_count;
2885 drbd_update_congested(mdev);
2886 }
2887 do {
2888 /* STRANGE
2889 * tcp_sendmsg does _not_ use its size parameter at all ?
2890 *
2891 * -EAGAIN on timeout, -EINTR on signal.
2892 */
2893/* THINK
2894 * do we need to block DRBD_SIG if sock == &meta.socket ??
2895 * otherwise wake_asender() might interrupt some send_*Ack !
2896 */
2897 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2898 if (rv == -EAGAIN) {
2899 if (we_should_drop_the_connection(mdev, sock))
2900 break;
2901 else
2902 continue;
2903 }
2904 D_ASSERT(rv != 0);
2905 if (rv == -EINTR) {
2906 flush_signals(current);
2907 rv = 0;
2908 }
2909 if (rv < 0)
2910 break;
2911 sent += rv;
2912 iov.iov_base += rv;
2913 iov.iov_len -= rv;
2914 } while (sent < size);
2915
2916 if (sock == mdev->data.socket)
2917 clear_bit(NET_CONGESTED, &mdev->flags);
2918
2919 if (rv <= 0) {
2920 if (rv != -EAGAIN) {
2921 dev_err(DEV, "%s_sendmsg returned %d\n",
2922 sock == mdev->meta.socket ? "msock" : "sock",
2923 rv);
2924 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2925 } else
2926 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2927 }
2928
2929 return sent;
2930}
2931
2932static int drbd_open(struct block_device *bdev, fmode_t mode)
2933{
2934 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2935 unsigned long flags;
2936 int rv = 0;
2937
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002938 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002939 spin_lock_irqsave(&mdev->req_lock, flags);
2940 /* to have a stable mdev->state.role
2941 * and no race with updating open_cnt */
2942
2943 if (mdev->state.role != R_PRIMARY) {
2944 if (mode & FMODE_WRITE)
2945 rv = -EROFS;
2946 else if (!allow_oos)
2947 rv = -EMEDIUMTYPE;
2948 }
2949
2950 if (!rv)
2951 mdev->open_cnt++;
2952 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002953 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002954
2955 return rv;
2956}
2957
2958static int drbd_release(struct gendisk *gd, fmode_t mode)
2959{
2960 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002961 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002962 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002963 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002964 return 0;
2965}
2966
Philipp Reisnerb411b362009-09-25 16:07:19 -07002967static void drbd_set_defaults(struct drbd_conf *mdev)
2968{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002969 /* This way we get a compile error when sync_conf grows,
2970 and we forgot to initialize it here */
2971 mdev->sync_conf = (struct syncer_conf) {
2972 /* .rate = */ DRBD_RATE_DEF,
2973 /* .after = */ DRBD_AFTER_DEF,
2974 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002975 /* .verify_alg = */ {}, 0,
2976 /* .cpu_mask = */ {}, 0,
2977 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002978 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002979 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2980 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2981 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2982 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002983 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2984 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002985 };
2986
2987 /* Have to use that way, because the layout differs between
2988 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002989 mdev->state = (union drbd_state) {
2990 { .role = R_SECONDARY,
2991 .peer = R_UNKNOWN,
2992 .conn = C_STANDALONE,
2993 .disk = D_DISKLESS,
2994 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002995 .susp = 0,
2996 .susp_nod = 0,
2997 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002998 } };
2999}
3000
3001void drbd_init_set_defaults(struct drbd_conf *mdev)
3002{
3003 /* the memset(,0,) did most of this.
3004 * note: only assignments, no allocation in here */
3005
3006 drbd_set_defaults(mdev);
3007
Philipp Reisnerb411b362009-09-25 16:07:19 -07003008 atomic_set(&mdev->ap_bio_cnt, 0);
3009 atomic_set(&mdev->ap_pending_cnt, 0);
3010 atomic_set(&mdev->rs_pending_cnt, 0);
3011 atomic_set(&mdev->unacked_cnt, 0);
3012 atomic_set(&mdev->local_cnt, 0);
3013 atomic_set(&mdev->net_cnt, 0);
3014 atomic_set(&mdev->packet_seq, 0);
3015 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003016 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02003017 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003018 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02003019 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003020
3021 mutex_init(&mdev->md_io_mutex);
3022 mutex_init(&mdev->data.mutex);
3023 mutex_init(&mdev->meta.mutex);
3024 sema_init(&mdev->data.work.s, 0);
3025 sema_init(&mdev->meta.work.s, 0);
3026 mutex_init(&mdev->state_mutex);
3027
3028 spin_lock_init(&mdev->data.work.q_lock);
3029 spin_lock_init(&mdev->meta.work.q_lock);
3030
3031 spin_lock_init(&mdev->al_lock);
3032 spin_lock_init(&mdev->req_lock);
3033 spin_lock_init(&mdev->peer_seq_lock);
3034 spin_lock_init(&mdev->epoch_lock);
3035
3036 INIT_LIST_HEAD(&mdev->active_ee);
3037 INIT_LIST_HEAD(&mdev->sync_ee);
3038 INIT_LIST_HEAD(&mdev->done_ee);
3039 INIT_LIST_HEAD(&mdev->read_ee);
3040 INIT_LIST_HEAD(&mdev->net_ee);
3041 INIT_LIST_HEAD(&mdev->resync_reads);
3042 INIT_LIST_HEAD(&mdev->data.work.q);
3043 INIT_LIST_HEAD(&mdev->meta.work.q);
3044 INIT_LIST_HEAD(&mdev->resync_work.list);
3045 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003046 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003047 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003048 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003049 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003050
Philipp Reisner794abb72010-12-27 11:51:23 +01003051 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003052 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003053 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003054 mdev->md_sync_work.cb = w_md_sync;
3055 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003056 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003057 init_timer(&mdev->resync_timer);
3058 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003059 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003060 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003061 mdev->resync_timer.function = resync_timer_fn;
3062 mdev->resync_timer.data = (unsigned long) mdev;
3063 mdev->md_sync_timer.function = md_sync_timer_fn;
3064 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003065 mdev->start_resync_timer.function = start_resync_timer_fn;
3066 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003067 mdev->request_timer.function = request_timer_fn;
3068 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003069
3070 init_waitqueue_head(&mdev->misc_wait);
3071 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003072 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003073 init_waitqueue_head(&mdev->ee_wait);
3074 init_waitqueue_head(&mdev->al_wait);
3075 init_waitqueue_head(&mdev->seq_wait);
3076
3077 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3078 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3079 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3080
3081 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003082 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003083 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003084 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3085 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003086}
3087
3088void drbd_mdev_cleanup(struct drbd_conf *mdev)
3089{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003090 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003091 if (mdev->receiver.t_state != None)
3092 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3093 mdev->receiver.t_state);
3094
3095 /* no need to lock it, I'm the only thread alive */
3096 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3097 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3098 mdev->al_writ_cnt =
3099 mdev->bm_writ_cnt =
3100 mdev->read_cnt =
3101 mdev->recv_cnt =
3102 mdev->send_cnt =
3103 mdev->writ_cnt =
3104 mdev->p_size =
3105 mdev->rs_start =
3106 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003107 mdev->rs_failed = 0;
3108 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003109 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003110 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3111 mdev->rs_mark_left[i] = 0;
3112 mdev->rs_mark_time[i] = 0;
3113 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003114 D_ASSERT(mdev->net_conf == NULL);
3115
3116 drbd_set_my_capacity(mdev, 0);
3117 if (mdev->bitmap) {
3118 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003119 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003120 drbd_bm_cleanup(mdev);
3121 }
3122
3123 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003124 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003125
3126 /*
3127 * currently we drbd_init_ee only on module load, so
3128 * we may do drbd_release_ee only on module unload!
3129 */
3130 D_ASSERT(list_empty(&mdev->active_ee));
3131 D_ASSERT(list_empty(&mdev->sync_ee));
3132 D_ASSERT(list_empty(&mdev->done_ee));
3133 D_ASSERT(list_empty(&mdev->read_ee));
3134 D_ASSERT(list_empty(&mdev->net_ee));
3135 D_ASSERT(list_empty(&mdev->resync_reads));
3136 D_ASSERT(list_empty(&mdev->data.work.q));
3137 D_ASSERT(list_empty(&mdev->meta.work.q));
3138 D_ASSERT(list_empty(&mdev->resync_work.list));
3139 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003140 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003141
3142 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003143}
3144
3145
3146static void drbd_destroy_mempools(void)
3147{
3148 struct page *page;
3149
3150 while (drbd_pp_pool) {
3151 page = drbd_pp_pool;
3152 drbd_pp_pool = (struct page *)page_private(page);
3153 __free_page(page);
3154 drbd_pp_vacant--;
3155 }
3156
3157 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3158
3159 if (drbd_ee_mempool)
3160 mempool_destroy(drbd_ee_mempool);
3161 if (drbd_request_mempool)
3162 mempool_destroy(drbd_request_mempool);
3163 if (drbd_ee_cache)
3164 kmem_cache_destroy(drbd_ee_cache);
3165 if (drbd_request_cache)
3166 kmem_cache_destroy(drbd_request_cache);
3167 if (drbd_bm_ext_cache)
3168 kmem_cache_destroy(drbd_bm_ext_cache);
3169 if (drbd_al_ext_cache)
3170 kmem_cache_destroy(drbd_al_ext_cache);
3171
3172 drbd_ee_mempool = NULL;
3173 drbd_request_mempool = NULL;
3174 drbd_ee_cache = NULL;
3175 drbd_request_cache = NULL;
3176 drbd_bm_ext_cache = NULL;
3177 drbd_al_ext_cache = NULL;
3178
3179 return;
3180}
3181
3182static int drbd_create_mempools(void)
3183{
3184 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003185 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003186 int i;
3187
3188 /* prepare our caches and mempools */
3189 drbd_request_mempool = NULL;
3190 drbd_ee_cache = NULL;
3191 drbd_request_cache = NULL;
3192 drbd_bm_ext_cache = NULL;
3193 drbd_al_ext_cache = NULL;
3194 drbd_pp_pool = NULL;
3195
3196 /* caches */
3197 drbd_request_cache = kmem_cache_create(
3198 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3199 if (drbd_request_cache == NULL)
3200 goto Enomem;
3201
3202 drbd_ee_cache = kmem_cache_create(
3203 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3204 if (drbd_ee_cache == NULL)
3205 goto Enomem;
3206
3207 drbd_bm_ext_cache = kmem_cache_create(
3208 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3209 if (drbd_bm_ext_cache == NULL)
3210 goto Enomem;
3211
3212 drbd_al_ext_cache = kmem_cache_create(
3213 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3214 if (drbd_al_ext_cache == NULL)
3215 goto Enomem;
3216
3217 /* mempools */
3218 drbd_request_mempool = mempool_create(number,
3219 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3220 if (drbd_request_mempool == NULL)
3221 goto Enomem;
3222
3223 drbd_ee_mempool = mempool_create(number,
3224 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003225 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003226 goto Enomem;
3227
3228 /* drbd's page pool */
3229 spin_lock_init(&drbd_pp_lock);
3230
3231 for (i = 0; i < number; i++) {
3232 page = alloc_page(GFP_HIGHUSER);
3233 if (!page)
3234 goto Enomem;
3235 set_page_private(page, (unsigned long)drbd_pp_pool);
3236 drbd_pp_pool = page;
3237 }
3238 drbd_pp_vacant = number;
3239
3240 return 0;
3241
3242Enomem:
3243 drbd_destroy_mempools(); /* in case we allocated some */
3244 return -ENOMEM;
3245}
3246
3247static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3248 void *unused)
3249{
3250 /* just so we have it. you never know what interesting things we
3251 * might want to do here some day...
3252 */
3253
3254 return NOTIFY_DONE;
3255}
3256
/* Notifier block whose callback is drbd_notify_sys(); where it gets
 * registered is not visible in this part of the file. */
static struct notifier_block drbd_notifier = {
	.notifier_call = drbd_notify_sys,
};
3260
3261static void drbd_release_ee_lists(struct drbd_conf *mdev)
3262{
3263 int rr;
3264
3265 rr = drbd_release_ee(mdev, &mdev->active_ee);
3266 if (rr)
3267 dev_err(DEV, "%d EEs in active list found!\n", rr);
3268
3269 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3270 if (rr)
3271 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3272
3273 rr = drbd_release_ee(mdev, &mdev->read_ee);
3274 if (rr)
3275 dev_err(DEV, "%d EEs in read list found!\n", rr);
3276
3277 rr = drbd_release_ee(mdev, &mdev->done_ee);
3278 if (rr)
3279 dev_err(DEV, "%d EEs in done list found!\n", rr);
3280
3281 rr = drbd_release_ee(mdev, &mdev->net_ee);
3282 if (rr)
3283 dev_err(DEV, "%d EEs in net list found!\n", rr);
3284}
3285
3286/* caution. no locking.
3287 * currently only used from module cleanup code. */
3288static void drbd_delete_device(unsigned int minor)
3289{
3290 struct drbd_conf *mdev = minor_to_mdev(minor);
3291
3292 if (!mdev)
3293 return;
3294
3295 /* paranoia asserts */
3296 if (mdev->open_cnt != 0)
3297 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3298 __FILE__ , __LINE__);
3299
3300 ERR_IF (!list_empty(&mdev->data.work.q)) {
3301 struct list_head *lp;
3302 list_for_each(lp, &mdev->data.work.q) {
3303 dev_err(DEV, "lp = %p\n", lp);
3304 }
3305 };
3306 /* end paranoia asserts */
3307
3308 del_gendisk(mdev->vdisk);
3309
3310 /* cleanup stuff that may have been allocated during
3311 * device (re-)configuration or state changes */
3312
3313 if (mdev->this_bdev)
3314 bdput(mdev->this_bdev);
3315
3316 drbd_free_resources(mdev);
3317
3318 drbd_release_ee_lists(mdev);
3319
Bart Van Assche24c48302011-05-21 18:32:29 +02003320 /* should be freed on disconnect? */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003321 kfree(mdev->ee_hash);
3322 /*
3323 mdev->ee_hash_s = 0;
3324 mdev->ee_hash = NULL;
3325 */
3326
3327 lc_destroy(mdev->act_log);
3328 lc_destroy(mdev->resync);
3329
3330 kfree(mdev->p_uuid);
3331 /* mdev->p_uuid = NULL; */
3332
3333 kfree(mdev->int_dig_out);
3334 kfree(mdev->int_dig_in);
3335 kfree(mdev->int_dig_vv);
3336
3337 /* cleanup the rest that has been
3338 * allocated from drbd_new_device
3339 * and actually free the mdev itself */
3340 drbd_free_mdev(mdev);
3341}
3342
3343static void drbd_cleanup(void)
3344{
3345 unsigned int i;
3346
3347 unregister_reboot_notifier(&drbd_notifier);
3348
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003349 /* first remove proc,
3350 * drbdsetup uses it's presence to detect
3351 * whether DRBD is loaded.
3352 * If we would get stuck in proc removal,
3353 * but have netlink already deregistered,
3354 * some drbdsetup commands may wait forever
3355 * for an answer.
3356 */
3357 if (drbd_proc)
3358 remove_proc_entry("drbd", NULL);
3359
Philipp Reisnerb411b362009-09-25 16:07:19 -07003360 drbd_nl_cleanup();
3361
3362 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003363 i = minor_count;
3364 while (i--)
3365 drbd_delete_device(i);
3366 drbd_destroy_mempools();
3367 }
3368
3369 kfree(minor_table);
3370
3371 unregister_blkdev(DRBD_MAJOR, "drbd");
3372
3373 printk(KERN_INFO "drbd: module cleanup done.\n");
3374}
3375
3376/**
3377 * drbd_congested() - Callback for pdflush
3378 * @congested_data: User data
3379 * @bdi_bits: Bits pdflush is currently interested in
3380 *
3381 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3382 */
3383static int drbd_congested(void *congested_data, int bdi_bits)
3384{
3385 struct drbd_conf *mdev = congested_data;
3386 struct request_queue *q;
3387 char reason = '-';
3388 int r = 0;
3389
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003390 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003391 /* DRBD has frozen IO */
3392 r = bdi_bits;
3393 reason = 'd';
3394 goto out;
3395 }
3396
3397 if (get_ldev(mdev)) {
3398 q = bdev_get_queue(mdev->ldev->backing_bdev);
3399 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3400 put_ldev(mdev);
3401 if (r)
3402 reason = 'b';
3403 }
3404
3405 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3406 r |= (1 << BDI_async_congested);
3407 reason = reason == 'b' ? 'a' : 'n';
3408 }
3409
3410out:
3411 mdev->congestion_reason = reason;
3412 return r;
3413}
3414
3415struct drbd_conf *drbd_new_device(unsigned int minor)
3416{
3417 struct drbd_conf *mdev;
3418 struct gendisk *disk;
3419 struct request_queue *q;
3420
3421 /* GFP_KERNEL, we are outside of all write-out paths */
3422 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3423 if (!mdev)
3424 return NULL;
3425 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3426 goto out_no_cpumask;
3427
3428 mdev->minor = minor;
3429
3430 drbd_init_set_defaults(mdev);
3431
3432 q = blk_alloc_queue(GFP_KERNEL);
3433 if (!q)
3434 goto out_no_q;
3435 mdev->rq_queue = q;
3436 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003437
3438 disk = alloc_disk(1);
3439 if (!disk)
3440 goto out_no_disk;
3441 mdev->vdisk = disk;
3442
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003443 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003444
3445 disk->queue = q;
3446 disk->major = DRBD_MAJOR;
3447 disk->first_minor = minor;
3448 disk->fops = &drbd_ops;
3449 sprintf(disk->disk_name, "drbd%d", minor);
3450 disk->private_data = mdev;
3451
3452 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3453 /* we have no partitions. we contain only ourselves. */
3454 mdev->this_bdev->bd_contains = mdev->this_bdev;
3455
3456 q->backing_dev_info.congested_fn = drbd_congested;
3457 q->backing_dev_info.congested_data = mdev;
3458
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003459 blk_queue_make_request(q, drbd_make_request);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003460 /* Setting the max_hw_sectors to an odd value of 8kibyte here
3461 This triggers a max_bio_size message upon first attach or connect */
3462 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003463 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3464 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003465 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003466
3467 mdev->md_io_page = alloc_page(GFP_KERNEL);
3468 if (!mdev->md_io_page)
3469 goto out_no_io_page;
3470
3471 if (drbd_bm_init(mdev))
3472 goto out_no_bitmap;
3473 /* no need to lock access, we are still initializing this minor device. */
3474 if (!tl_init(mdev))
3475 goto out_no_tl;
Andreas Gruenbacherdac13892011-01-21 17:18:39 +01003476 mdev->read_requests = RB_ROOT;
Andreas Gruenbacherde696712011-01-20 15:00:24 +01003477 mdev->write_requests = RB_ROOT;
Andreas Gruenbacher8b946252011-01-20 15:23:07 +01003478 mdev->epoch_entries = RB_ROOT;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003479
3480 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3481 if (!mdev->app_reads_hash)
3482 goto out_no_app_reads;
3483
3484 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3485 if (!mdev->current_epoch)
3486 goto out_no_epoch;
3487
3488 INIT_LIST_HEAD(&mdev->current_epoch->list);
3489 mdev->epochs = 1;
3490
3491 return mdev;
3492
3493/* out_whatever_else:
3494 kfree(mdev->current_epoch); */
3495out_no_epoch:
3496 kfree(mdev->app_reads_hash);
3497out_no_app_reads:
3498 tl_cleanup(mdev);
3499out_no_tl:
3500 drbd_bm_cleanup(mdev);
3501out_no_bitmap:
3502 __free_page(mdev->md_io_page);
3503out_no_io_page:
3504 put_disk(disk);
3505out_no_disk:
3506 blk_cleanup_queue(q);
3507out_no_q:
3508 free_cpumask_var(mdev->cpu_mask);
3509out_no_cpumask:
3510 kfree(mdev);
3511 return NULL;
3512}
3513
3514/* counterpart of drbd_new_device.
3515 * last part of drbd_delete_device. */
3516void drbd_free_mdev(struct drbd_conf *mdev)
3517{
3518 kfree(mdev->current_epoch);
3519 kfree(mdev->app_reads_hash);
3520 tl_cleanup(mdev);
3521 if (mdev->bitmap) /* should no longer be there. */
3522 drbd_bm_cleanup(mdev);
3523 __free_page(mdev->md_io_page);
3524 put_disk(mdev->vdisk);
3525 blk_cleanup_queue(mdev->rq_queue);
3526 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003527 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003528 kfree(mdev);
3529}
3530
3531
3532int __init drbd_init(void)
3533{
3534 int err;
3535
3536 if (sizeof(struct p_handshake) != 80) {
3537 printk(KERN_ERR
3538 "drbd: never change the size or layout "
3539 "of the HandShake packet.\n");
3540 return -EINVAL;
3541 }
3542
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003543 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003544 printk(KERN_ERR
3545 "drbd: invalid minor_count (%d)\n", minor_count);
3546#ifdef MODULE
3547 return -EINVAL;
3548#else
3549 minor_count = 8;
3550#endif
3551 }
3552
3553 err = drbd_nl_init();
3554 if (err)
3555 return err;
3556
3557 err = register_blkdev(DRBD_MAJOR, "drbd");
3558 if (err) {
3559 printk(KERN_ERR
3560 "drbd: unable to register block device major %d\n",
3561 DRBD_MAJOR);
3562 return err;
3563 }
3564
3565 register_reboot_notifier(&drbd_notifier);
3566
3567 /*
3568 * allocate all necessary structs
3569 */
3570 err = -ENOMEM;
3571
3572 init_waitqueue_head(&drbd_pp_wait);
3573
3574 drbd_proc = NULL; /* play safe for drbd_cleanup */
3575 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3576 GFP_KERNEL);
3577 if (!minor_table)
3578 goto Enomem;
3579
3580 err = drbd_create_mempools();
3581 if (err)
3582 goto Enomem;
3583
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003584 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003585 if (!drbd_proc) {
3586 printk(KERN_ERR "drbd: unable to register proc file\n");
3587 goto Enomem;
3588 }
3589
3590 rwlock_init(&global_state_lock);
3591
3592 printk(KERN_INFO "drbd: initialized. "
3593 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3594 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3595 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3596 printk(KERN_INFO "drbd: registered as block device major %d\n",
3597 DRBD_MAJOR);
3598 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3599
3600 return 0; /* Success! */
3601
3602Enomem:
3603 drbd_cleanup();
3604 if (err == -ENOMEM)
3605 /* currently always the case */
3606 printk(KERN_ERR "drbd: ran out of memory\n");
3607 else
3608 printk(KERN_ERR "drbd: initialization failure\n");
3609 return err;
3610}
3611
3612void drbd_free_bc(struct drbd_backing_dev *ldev)
3613{
3614 if (ldev == NULL)
3615 return;
3616
Tejun Heoe525fd82010-11-13 11:55:17 +01003617 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3618 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003619
3620 kfree(ldev);
3621}
3622
3623void drbd_free_sock(struct drbd_conf *mdev)
3624{
3625 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003626 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003627 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3628 sock_release(mdev->data.socket);
3629 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003630 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003631 }
3632 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003633 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003634 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3635 sock_release(mdev->meta.socket);
3636 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003637 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003638 }
3639}
3640
3641
3642void drbd_free_resources(struct drbd_conf *mdev)
3643{
3644 crypto_free_hash(mdev->csums_tfm);
3645 mdev->csums_tfm = NULL;
3646 crypto_free_hash(mdev->verify_tfm);
3647 mdev->verify_tfm = NULL;
3648 crypto_free_hash(mdev->cram_hmac_tfm);
3649 mdev->cram_hmac_tfm = NULL;
3650 crypto_free_hash(mdev->integrity_w_tfm);
3651 mdev->integrity_w_tfm = NULL;
3652 crypto_free_hash(mdev->integrity_r_tfm);
3653 mdev->integrity_r_tfm = NULL;
3654
3655 drbd_free_sock(mdev);
3656
3657 __no_warn(local,
3658 drbd_free_bc(mdev->ldev);
3659 mdev->ldev = NULL;);
3660}
3661
3662/* meta data management */
3663
3664struct meta_data_on_disk {
3665 u64 la_size; /* last agreed size. */
3666 u64 uuid[UI_SIZE]; /* UUIDs. */
3667 u64 device_uuid;
3668 u64 reserved_u64_1;
3669 u32 flags; /* MDF */
3670 u32 magic;
3671 u32 md_size_sect;
3672 u32 al_offset; /* offset to this block */
3673 u32 al_nr_extents; /* important for restoring the AL */
3674 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3675 u32 bm_offset; /* offset to the bitmap, from here */
3676 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003677 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3678 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003679
3680} __packed;
3681
3682/**
3683 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3684 * @mdev: DRBD device.
3685 */
3686void drbd_md_sync(struct drbd_conf *mdev)
3687{
3688 struct meta_data_on_disk *buffer;
3689 sector_t sector;
3690 int i;
3691
Lars Ellenbergee15b032010-09-03 10:00:09 +02003692 del_timer(&mdev->md_sync_timer);
3693 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003694 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3695 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003696
3697 /* We use here D_FAILED and not D_ATTACHING because we try to write
3698 * metadata even if we detach due to a disk failure! */
3699 if (!get_ldev_if_state(mdev, D_FAILED))
3700 return;
3701
Philipp Reisnerb411b362009-09-25 16:07:19 -07003702 mutex_lock(&mdev->md_io_mutex);
3703 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3704 memset(buffer, 0, 512);
3705
3706 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3707 for (i = UI_CURRENT; i < UI_SIZE; i++)
3708 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3709 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3710 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3711
3712 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3713 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3714 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3715 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3716 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3717
3718 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003719 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003720
3721 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3722 sector = mdev->ldev->md.md_offset;
3723
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003724 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003725 /* this was a try anyways ... */
3726 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003727 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003728 }
3729
3730 /* Update mdev->ldev->md.la_size_sect,
3731 * since we updated it on metadata. */
3732 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3733
3734 mutex_unlock(&mdev->md_io_mutex);
3735 put_ldev(mdev);
3736}
3737
3738/**
3739 * drbd_md_read() - Reads in the meta data super block
3740 * @mdev: DRBD device.
3741 * @bdev: Device from which the meta data should be read in.
3742 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003743 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003744 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3745 */
3746int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3747{
3748 struct meta_data_on_disk *buffer;
3749 int i, rv = NO_ERROR;
3750
3751 if (!get_ldev_if_state(mdev, D_ATTACHING))
3752 return ERR_IO_MD_DISK;
3753
Philipp Reisnerb411b362009-09-25 16:07:19 -07003754 mutex_lock(&mdev->md_io_mutex);
3755 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3756
3757 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003758 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003759 called BEFORE disk is attached */
3760 dev_err(DEV, "Error while reading metadata.\n");
3761 rv = ERR_IO_MD_DISK;
3762 goto err;
3763 }
3764
Andreas Gruenbachere7fad8a2011-01-11 13:54:02 +01003765 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003766 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3767 rv = ERR_MD_INVALID;
3768 goto err;
3769 }
3770 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3771 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3772 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3773 rv = ERR_MD_INVALID;
3774 goto err;
3775 }
3776 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3777 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3778 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3779 rv = ERR_MD_INVALID;
3780 goto err;
3781 }
3782 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3783 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3784 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3785 rv = ERR_MD_INVALID;
3786 goto err;
3787 }
3788
3789 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3790 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3791 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3792 rv = ERR_MD_INVALID;
3793 goto err;
3794 }
3795
3796 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3797 for (i = UI_CURRENT; i < UI_SIZE; i++)
3798 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3799 bdev->md.flags = be32_to_cpu(buffer->flags);
3800 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3801 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3802
Philipp Reisner99432fc2011-05-20 16:39:13 +02003803 spin_lock_irq(&mdev->req_lock);
3804 if (mdev->state.conn < C_CONNECTED) {
3805 int peer;
3806 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3807 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3808 mdev->peer_max_bio_size = peer;
3809 }
3810 spin_unlock_irq(&mdev->req_lock);
3811
Philipp Reisnerb411b362009-09-25 16:07:19 -07003812 if (mdev->sync_conf.al_extents < 7)
3813 mdev->sync_conf.al_extents = 127;
3814
3815 err:
3816 mutex_unlock(&mdev->md_io_mutex);
3817 put_ldev(mdev);
3818
3819 return rv;
3820}
3821
3822/**
3823 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3824 * @mdev: DRBD device.
3825 *
3826 * Call this function if you change anything that should be written to
3827 * the meta-data super block. This function sets MD_DIRTY, and starts a
3828 * timer that ensures that within five seconds you have to call drbd_md_sync().
3829 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003830#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003831void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3832{
3833 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3834 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3835 mdev->last_md_mark_dirty.line = line;
3836 mdev->last_md_mark_dirty.func = func;
3837 }
3838}
3839#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003840void drbd_md_mark_dirty(struct drbd_conf *mdev)
3841{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003842 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003843 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003844}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003845#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07003846
3847static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3848{
3849 int i;
3850
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003851 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003852 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003853}
3854
3855void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3856{
3857 if (idx == UI_CURRENT) {
3858 if (mdev->state.role == R_PRIMARY)
3859 val |= 1;
3860 else
3861 val &= ~((u64)1);
3862
3863 drbd_set_ed_uuid(mdev, val);
3864 }
3865
3866 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003867 drbd_md_mark_dirty(mdev);
3868}
3869
3870
3871void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3872{
3873 if (mdev->ldev->md.uuid[idx]) {
3874 drbd_uuid_move_history(mdev);
3875 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003876 }
3877 _drbd_uuid_set(mdev, idx, val);
3878}
3879
3880/**
3881 * drbd_uuid_new_current() - Creates a new current UUID
3882 * @mdev: DRBD device.
3883 *
3884 * Creates a new current UUID, and rotates the old current UUID into
3885 * the bitmap slot. Causes an incremental resync upon next connect.
3886 */
3887void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3888{
3889 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003890 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003891
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003892 if (bm_uuid)
3893 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3894
Philipp Reisnerb411b362009-09-25 16:07:19 -07003895 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003896
3897 get_random_bytes(&val, sizeof(u64));
3898 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003899 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003900 /* get it to stable storage _now_ */
3901 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003902}
3903
3904void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3905{
3906 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3907 return;
3908
3909 if (val == 0) {
3910 drbd_uuid_move_history(mdev);
3911 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3912 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003913 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003914 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3915 if (bm_uuid)
3916 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003917
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003918 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003919 }
3920 drbd_md_mark_dirty(mdev);
3921}
3922
3923/**
3924 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3925 * @mdev: DRBD device.
3926 *
3927 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3928 */
3929int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3930{
3931 int rv = -EIO;
3932
3933 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3934 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3935 drbd_md_sync(mdev);
3936 drbd_bm_set_all(mdev);
3937
3938 rv = drbd_bm_write(mdev);
3939
3940 if (!rv) {
3941 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3942 drbd_md_sync(mdev);
3943 }
3944
3945 put_ldev(mdev);
3946 }
3947
3948 return rv;
3949}
3950
3951/**
3952 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3953 * @mdev: DRBD device.
3954 *
3955 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3956 */
3957int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3958{
3959 int rv = -EIO;
3960
Philipp Reisner07782862010-08-31 12:00:50 +02003961 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003962 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3963 drbd_bm_clear_all(mdev);
3964 rv = drbd_bm_write(mdev);
3965 put_ldev(mdev);
3966 }
3967
3968 return rv;
3969}
3970
3971static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3972{
3973 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003974 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003975
3976 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3977
Lars Ellenberg02851e92010-12-16 14:47:39 +01003978 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003979 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003980 rv = work->io_fn(mdev);
3981 drbd_bm_unlock(mdev);
3982 put_ldev(mdev);
3983 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003984
3985 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003986 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07003987 wake_up(&mdev->misc_wait);
3988
3989 if (work->done)
3990 work->done(mdev, rv);
3991
3992 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3993 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003994 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003995
3996 return 1;
3997}
3998
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003999void drbd_ldev_destroy(struct drbd_conf *mdev)
4000{
4001 lc_destroy(mdev->resync);
4002 mdev->resync = NULL;
4003 lc_destroy(mdev->act_log);
4004 mdev->act_log = NULL;
4005 __no_warn(local,
4006 drbd_free_bc(mdev->ldev);
4007 mdev->ldev = NULL;);
4008
4009 if (mdev->md_io_tmpp) {
4010 __free_page(mdev->md_io_tmpp);
4011 mdev->md_io_tmpp = NULL;
4012 }
4013 clear_bit(GO_DISKLESS, &mdev->flags);
4014}
4015
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004016static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4017{
4018 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02004019 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4020 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004021 * the protected members anymore, though, so once put_ldev reaches zero
4022 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004023 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004024 return 1;
4025}
4026
4027void drbd_go_diskless(struct drbd_conf *mdev)
4028{
4029 D_ASSERT(mdev->state.disk == D_FAILED);
4030 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02004031 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004032}
4033
Philipp Reisnerb411b362009-09-25 16:07:19 -07004034/**
4035 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4036 * @mdev: DRBD device.
4037 * @io_fn: IO callback to be called when bitmap IO is possible
4038 * @done: callback to be called after the bitmap IO was performed
4039 * @why: Descriptive text of the reason for doing the IO
4040 *
4041 * While IO on the bitmap happens we freeze application IO thus we ensure
4042 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
4043 * called from worker context. It MUST NOT be used while a previous such
4044 * work is still pending!
4045 */
4046void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4047 int (*io_fn)(struct drbd_conf *),
4048 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004049 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004050{
4051 D_ASSERT(current == mdev->worker.task);
4052
4053 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4054 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4055 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4056 if (mdev->bm_io_work.why)
4057 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4058 why, mdev->bm_io_work.why);
4059
4060 mdev->bm_io_work.io_fn = io_fn;
4061 mdev->bm_io_work.done = done;
4062 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004063 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004064
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004065 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004066 set_bit(BITMAP_IO, &mdev->flags);
4067 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004068 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004069 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004070 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004071 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004072}
4073
4074/**
4075 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4076 * @mdev: DRBD device.
4077 * @io_fn: IO callback to be called when bitmap IO is possible
4078 * @why: Descriptive text of the reason for doing the IO
4079 *
4080 * freezes application IO while that the actual IO operations runs. This
4081 * functions MAY NOT be called from worker context.
4082 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004083int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4084 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004085{
4086 int rv;
4087
4088 D_ASSERT(current != mdev->worker.task);
4089
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004090 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4091 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004092
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004093 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004094 rv = io_fn(mdev);
4095 drbd_bm_unlock(mdev);
4096
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004097 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4098 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004099
4100 return rv;
4101}
4102
4103void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4104{
4105 if ((mdev->ldev->md.flags & flag) != flag) {
4106 drbd_md_mark_dirty(mdev);
4107 mdev->ldev->md.flags |= flag;
4108 }
4109}
4110
4111void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4112{
4113 if ((mdev->ldev->md.flags & flag) != 0) {
4114 drbd_md_mark_dirty(mdev);
4115 mdev->ldev->md.flags &= ~flag;
4116 }
4117}
4118int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4119{
4120 return (bdev->md.flags & flag) != 0;
4121}
4122
4123static void md_sync_timer_fn(unsigned long data)
4124{
4125 struct drbd_conf *mdev = (struct drbd_conf *) data;
4126
4127 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4128}
4129
4130static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4131{
4132 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004133#ifdef DEBUG
4134 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4135 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4136#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004137 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004138 return 1;
4139}
4140
4141#ifdef CONFIG_DRBD_FAULT_INJECTION
/*
 * Fault insertion support.  The random number generator is shamelessly
 * stolen from kernel/rcutorture.c.
 */
struct fault_random_state {
	unsigned long state;	/* current generator value */
	unsigned long count;	/* calls left until the next re-seed */
};
4148
/* Parameters of the generator in _drbd_fault_random(). */
#define FAULT_RANDOM_MULT	39916801	/* prime */
#define FAULT_RANDOM_ADD	479001701	/* prime */
#define FAULT_RANDOM_REFRESH	10000		/* calls between re-seeds */
4152
4153/*
4154 * Crude but fast random-number generator. Uses a linear congruential
4155 * generator, with occasional help from get_random_bytes().
4156 */
4157static unsigned long
4158_drbd_fault_random(struct fault_random_state *rsp)
4159{
4160 long refresh;
4161
Roel Kluin49829ea2009-12-15 22:55:44 +01004162 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004163 get_random_bytes(&refresh, sizeof(refresh));
4164 rsp->state += refresh;
4165 rsp->count = FAULT_RANDOM_REFRESH;
4166 }
4167 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4168 return swahw32(rsp->state);
4169}
4170
4171static char *
4172_drbd_fault_str(unsigned int type) {
4173 static char *_faults[] = {
4174 [DRBD_FAULT_MD_WR] = "Meta-data write",
4175 [DRBD_FAULT_MD_RD] = "Meta-data read",
4176 [DRBD_FAULT_RS_WR] = "Resync write",
4177 [DRBD_FAULT_RS_RD] = "Resync read",
4178 [DRBD_FAULT_DT_WR] = "Data write",
4179 [DRBD_FAULT_DT_RD] = "Data read",
4180 [DRBD_FAULT_DT_RA] = "Data read ahead",
4181 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004182 [DRBD_FAULT_AL_EE] = "EE allocation",
4183 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004184 };
4185
4186 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4187}
4188
4189unsigned int
4190_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4191{
4192 static struct fault_random_state rrs = {0, 0};
4193
4194 unsigned int ret = (
4195 (fault_devs == 0 ||
4196 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4197 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4198
4199 if (ret) {
4200 fault_count++;
4201
Lars Ellenberg73835062010-05-27 11:51:56 +02004202 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004203 dev_warn(DEV, "***Simulating %s failure\n",
4204 _drbd_fault_str(type));
4205 }
4206
4207 return ret;
4208}
4209#endif
4210
/**
 * drbd_buildtag() - Return a string identifying this build of DRBD
 *
 * DRBD built from external sources has here a reference to the
 * git hash of the source code.
 *
 * The static buffer starts out as "\0uilt-in" and is completed on the
 * first call: with CONFIG_MODULES and a non-NULL THIS_MODULE it is
 * overwritten with "srcversion: <srcversion>", otherwise the leading NUL is
 * replaced by 'b', which yields "built-in".
 *
 * Returns a pointer to that static buffer; the caller must not free it.
 */
const char *drbd_buildtag(void)
{
	static char buildtag[38] = "\0uilt-in";

	/* A leading NUL byte means "not initialized yet". */
	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/*
			 * "%-24s" only sets a minimum field width, so bound
			 * the write to the buffer instead of trusting the
			 * length of srcversion.
			 */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
4229
4230module_init(drbd_init)
4231module_exit(drbd_cleanup)
4232
Philipp Reisnerb411b362009-09-25 16:07:19 -07004233EXPORT_SYMBOL(drbd_conn_str);
4234EXPORT_SYMBOL(drbd_role_str);
4235EXPORT_SYMBOL(drbd_disk_str);
4236EXPORT_SYMBOL(drbd_set_st_err_str);