/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */
#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
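/*
 * Illustrative only (example values, not defaults): when drbd is built into
 * the kernel, the macros above turn minor_count into the boot parameter
 * "drbd.minor_count=8" on the kernel command line; built as a module, the
 * same knob is set with "modprobe drbd minor_count=8".
 */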

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;	/* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;
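/*
 * Sketch of the list threading described above (illustrative only, not the
 * driver's actual pool helpers): pushing a page onto drbd_pp_pool is roughly
 *
 *	set_page_private(page, (unsigned long)drbd_pp_pool);
 *	drbd_pp_pool = page;
 *
 * and popping reverses it via page_private(page), all under drbd_pp_lock.
 */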

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, if this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
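/*
 * Rough picture (illustration only): the ->next links run from the oldest
 * epoch towards the newest one, and each epoch carries its own request list.
 *
 *   mdev->oldest_tle --> [epoch] --next--> [epoch] --next--> [epoch] <-- mdev->newest_tle
 *                           |                 |                 |
 *                        requests          requests          requests
 */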
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
	INIT_LIST_HEAD(&mdev->barrier_acked_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}
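/*
 * Typical call pattern (sketch, under req_lock): allocate a fresh
 * struct drbd_tl_epoch and hand it to _tl_add_barrier() so it becomes the
 * new newest_tle; tl_release() below recycles the just-acked epoch the same
 * way when the CREATE_BARRIER flag was set.
 */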

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the lists head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_splice_init(&b->requests, &mdev->barrier_acked_requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}


/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io, abort_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (what == abort_disk_io) {
			/* Only walk the TL, leave barrier objects in place */
			b = tmp;
			continue;
		}

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}

	/* Actions operating on the disk state, also want to work on
	   requests that got barrier acked. */
	switch (what) {
	case abort_disk_io:
	case fail_frozen_disk_io:
	case restart_frozen_disk_io:
		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			_req_mod(req, what);
		}

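		/* fall through: nothing more to do for these events here */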
	case connection_lost_while_pending:
	case resend:
		break;
	default:
		dev_err(DEV, "what = %d in _tl_restart()\n", what);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}
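/*
 * Note: CS_HARD makes __drbd_set_state() below skip the is_valid_state()
 * and is_valid_state_transition() checks, which is what allows e.g.
 * drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)) in tl_release() above
 * to impose a state we did not choose ourselves.
 */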

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						     union drbd_state,
						     union drbd_state);
enum sanitize_state_warnings {
	NO_WARNING,
	ABORTED_ONLINE_VERIFY,
	ABORTED_RESYNC,
	CONNECTION_LOST_NEGOTIATING,
	IMPLICITLY_UPGRADED_DISK,
	IMPLICITLY_UPGRADED_PDSK,
};
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform a possibly cluster-wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}
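/*
 * Typical use (sketch): callers go through the request-state wrappers, e.g.
 * _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE) as done in
 * abw_start_sync() further down; drbd_req_state() itself is the internal
 * engine that handles the optional cluster-wide handshake with the peer.
 */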

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
		  union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
		rv = SS_CONNECTED_OUTDATES;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
{
	static const char *msg_table[] = {
		[NO_WARNING] = "",
		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
		[ABORTED_RESYNC] = "Resync aborted.",
		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
	};

	if (warn != NO_WARNING)
		dev_warn(DEV, "%s\n", msg_table[warn]);
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:	old state.
 * @ns:	new state.
 * @warn:	optional pointer that receives an enum sanitize_state_warnings code.
 *
 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	if (warn)
		*warn = NO_WARNING;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow Network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn)
			*warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			if (warn)
				*warn = CONNECTION_LOST_NEGOTIATING;
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection stat on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_DISK;
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_PDSK;
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data) */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}
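/*
 * Worked example of the rules above (illustrative, not exhaustive):
 * os = { conn:C_CONNECTED, role:R_PRIMARY, pdsk:D_UP_TO_DATE }, fencing is
 * FP_STONITH, and the requested ns.conn is C_NETWORK_FAILURE.
 * sanitize_state() then forces ns.peer = R_UNKNOWN and ns.pdsk = D_UNKNOWN
 * (the peer is gone), and because we stay Primary without a connection while
 * pdsk > D_OUTDATED, it also sets ns.susp_fen = 1 until the fence-peer
 * handler has run.
 */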

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	enum sanitize_state_warnings ssw;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &ssw);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/* pre-state-change checks ; only look at ns */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	print_sanitize_warnings(mdev, ssw);

	{
		char *pbp, pb[300];
		pbp = pb;
		*pbp = 0;
		if (ns.role != os.role)
			pbp += sprintf(pbp, "role( %s -> %s ) ",
				       drbd_role_str(os.role),
				       drbd_role_str(ns.role));
		if (ns.peer != os.peer)
			pbp += sprintf(pbp, "peer( %s -> %s ) ",
				       drbd_role_str(os.peer),
				       drbd_role_str(ns.peer));
		if (ns.conn != os.conn)
			pbp += sprintf(pbp, "conn( %s -> %s ) ",
				       drbd_conn_str(os.conn),
				       drbd_conn_str(ns.conn));
		if (ns.disk != os.disk)
			pbp += sprintf(pbp, "disk( %s -> %s ) ",
				       drbd_disk_str(os.disk),
				       drbd_disk_str(ns.disk));
		if (ns.pdsk != os.pdsk)
			pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
				       drbd_disk_str(os.pdsk),
				       drbd_disk_str(ns.pdsk));
		if (is_susp(ns) != is_susp(os))
			pbp += sprintf(pbp, "susp( %d -> %d ) ",
				       is_susp(os),
				       is_susp(ns));
		if (ns.aftr_isp != os.aftr_isp)
			pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
				       os.aftr_isp,
				       ns.aftr_isp);
		if (ns.peer_isp != os.peer_isp)
			pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
				       os.peer_isp,
				       ns.peer_isp);
		if (ns.user_isp != os.user_isp)
			pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
				       os.user_isp,
				       ns.user_isp);
		dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
				 (unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}
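/*
 * Flow summary (for orientation): the state-change entry points above all end
 * up committing the new state in __drbd_set_state() under req_lock, which
 * queues an after_state_chg_work; the worker then runs w_after_state_ch()
 * below, which in turn calls after_state_ch() for everything that may sleep.
 */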

static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
			       int (*io_fn)(struct drbd_conf *),
			       char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current == mdev->worker.task);

	/* open coded non-blocking drbd_suspend_io(mdev); */
	set_bit(SUSPEND_IO, &mdev->flags);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
		mod_timer(&mdev->request_timer, jiffies + HZ);

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
			what = resend;

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = restart_frozen_disk_io;

		if (what != nothing)
			nsm.susp_nod = 0;
	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}

	/* Became sync source. With protocol >= 96, we still need to send out
	 * the sync uuid now. Need to do that before any drbd_send_state, or
	 * the other side may go "paused sync" before receiving the sync uuids,
	 * which is unexpected. */
	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
		drbd_gen_and_send_sync_uuid(mdev);
		put_ldev(mdev);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	/* No point in queuing send_bitmap if we don't have a connection
	 * anymore, so check also the _current_ state, not only the new state
	 * at the time this work was queued. */
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
	    mdev->state.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
				"send_bitmap (WFBitMapS)",
				BM_LOCKED_TEST_ALLOWED);

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					set_bit(NEW_CUR_UUID, &mdev->flags);
				} else {
					drbd_uuid_new_current(mdev);
					drbd_send_uuids(mdev);
				}
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
			drbd_uuid_new_current(mdev);
			drbd_send_uuids(mdev);
		}

		/* D_DISKLESS Peer becomes secondary */
		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
			/* We may still be Primary ourselves.
			 * No harm done if the bitmap still changes,
			 * redirtied pages will follow later. */
			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
		put_ldev(mdev);
	}

	/* Write out all changed bits on demote.
1513	 * Though, no need to do that just yet
1514 * if there is a resync going on still */
1515 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1516 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001517 /* No changes to the bitmap expected this time, so assert that,
1518	 * even though no harm would be done if it did change. */
1519 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1520 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001521 put_ldev(mdev);
1522 }
1523
1524 /* Last part of the attaching process ... */
1525 if (ns.conn >= C_CONNECTED &&
1526 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001527 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001528 drbd_send_uuids(mdev);
1529 drbd_send_state(mdev);
1530 }
1531
1532 /* We want to pause/continue resync, tell peer. */
1533 if (ns.conn >= C_CONNECTED &&
1534 ((os.aftr_isp != ns.aftr_isp) ||
1535 (os.user_isp != ns.user_isp)))
1536 drbd_send_state(mdev);
1537
1538 /* In case one of the isp bits got set, suspend other devices. */
1539 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1540 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1541 suspend_other_sg(mdev);
1542
1543	 /* Make sure the peer gets informed about any state
1544 changes (ISP bits) while we were in WFReportParams. */
1545 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1546 drbd_send_state(mdev);
1547
Philipp Reisner67531712010-10-27 12:21:30 +02001548 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1549 drbd_send_state(mdev);
1550
Philipp Reisnerb411b362009-09-25 16:07:19 -07001551	/* We are in the process of starting a full sync... */
1552 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1553 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001554 /* no other bitmap changes expected during this phase */
1555 drbd_queue_bitmap_io(mdev,
1556 &drbd_bmio_set_n_write, &abw_start_sync,
1557 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001558
1559	 /* We are invalidating ourselves... */
1560 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1561 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001562 /* other bitmap operation expected during this phase */
1563 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1564 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001565
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001566 /* first half of local IO error, failure to attach,
1567 * or administrative detach */
1568 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1569 enum drbd_io_error_p eh;
1570 int was_io_error;
1571 /* corresponding get_ldev was in __drbd_set_state, to serialize
1572 * our cleanup here with the transition to D_DISKLESS,
1573	 * so it is safe to dereference ldev here. */
1574 eh = mdev->ldev->dc.on_io_error;
1575 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1576
Philipp Reisner2b4dd362011-03-14 13:01:50 +01001577	/* Immediately allow completion of all application IO that waits
1578 for completion from the local disk. */
1579 tl_restart(mdev, abort_disk_io);
1580
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001581 /* current state still has to be D_FAILED,
1582 * there is only one way out: to D_DISKLESS,
1583 * and that may only happen after our put_ldev below. */
1584 if (mdev->state.disk != D_FAILED)
1585 dev_err(DEV,
1586 "ASSERT FAILED: disk is %s during detach\n",
1587 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001588
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001589 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001590 dev_info(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001591
1592 drbd_rs_cancel_all(mdev);
1593
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001594 /* In case we want to get something to stable storage still,
1595 * this may be the last chance.
1596 * Following put_ldev may transition to D_DISKLESS. */
1597 drbd_md_sync(mdev);
1598 put_ldev(mdev);
1599
1600 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001601 drbd_khelper(mdev, "local-io-error");
1602 }
1603
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001604 /* second half of local IO error, failure to attach,
1605 * or administrative detach,
1606 * after local_cnt references have reached zero again */
1607 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1608 /* We must still be diskless,
1609 * re-attach has to be serialized with this! */
1610 if (mdev->state.disk != D_DISKLESS)
1611 dev_err(DEV,
1612 "ASSERT FAILED: disk is %s while going diskless\n",
1613 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001614
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001615 mdev->rs_total = 0;
1616 mdev->rs_failed = 0;
1617 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001618
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001619 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001620 dev_info(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001621 /* corresponding get_ldev in __drbd_set_state
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001622 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001623 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001624 }
1625
Philipp Reisner738a84b2011-03-03 00:21:30 +01001626	/* Notify peer that I had a local IO error, and did not detach. */
1627 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1628 drbd_send_state(mdev);
1629
Philipp Reisnerb411b362009-09-25 16:07:19 -07001630 /* Disks got bigger while they were detached */
1631 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1632 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1633 if (ns.conn == C_CONNECTED)
1634 resync_after_online_grow(mdev);
1635 }
1636
1637 /* A resync finished or aborted, wake paused devices... */
1638 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1639 (os.peer_isp && !ns.peer_isp) ||
1640 (os.user_isp && !ns.user_isp))
1641 resume_next_sg(mdev);
1642
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001643 /* sync target done with resync. Explicitly notify peer, even though
1644 * it should (at least for non-empty resyncs) already know itself. */
1645 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1646 drbd_send_state(mdev);
1647
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001648 /* This triggers bitmap writeout of potentially still unwritten pages
1649 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001650 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001651 * For resync aborted because of local disk failure, we cannot do
1652 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001653 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001654 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001655 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1656 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1657 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001658 put_ldev(mdev);
1659 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001660
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001661	/* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001662 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001663 drbd_free_tl_hash(mdev);
1664
Philipp Reisnerb411b362009-09-25 16:07:19 -07001665 /* Upon network connection, we need to start the receiver */
1666 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1667 drbd_thread_start(&mdev->receiver);
1668
1669 /* Terminate worker thread if we are unconfigured - it will be
1670 restarted as needed... */
1671 if (ns.disk == D_DISKLESS &&
1672 ns.conn == C_STANDALONE &&
1673 ns.role == R_SECONDARY) {
1674 if (os.aftr_isp != ns.aftr_isp)
1675 resume_next_sg(mdev);
1676 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1677 if (test_bit(DEVICE_DYING, &mdev->flags))
1678 drbd_thread_stop_nowait(&mdev->worker);
1679 }
1680
1681 drbd_md_sync(mdev);
1682}
1683
1684
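/* Common kthread entry point for the receiver, worker and asender threads.
 * Runs thi->function() until it returns; if drbd_thread_start() switched the
 * state to Restarting in the meantime, it loops and runs it again.  On exit
 * it completes &thi->stop and drops the module reference taken at start. */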
1685static int drbd_thread_setup(void *arg)
1686{
1687 struct drbd_thread *thi = (struct drbd_thread *) arg;
1688 struct drbd_conf *mdev = thi->mdev;
1689 unsigned long flags;
1690 int retval;
1691
1692restart:
1693 retval = thi->function(thi);
1694
1695 spin_lock_irqsave(&thi->t_lock, flags);
1696
1697 /* if the receiver has been "Exiting", the last thing it did
1698 * was set the conn state to "StandAlone",
1699 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1700 * and receiver thread will be "started".
1701 * drbd_thread_start needs to set "Restarting" in that case.
1702 * t_state check and assignment needs to be within the same spinlock,
1703 * so either thread_start sees Exiting, and can remap to Restarting,
1704	 * or thread_start sees None, and can proceed as normal.
1705 */
1706
1707 if (thi->t_state == Restarting) {
1708 dev_info(DEV, "Restarting %s\n", current->comm);
1709 thi->t_state = Running;
1710 spin_unlock_irqrestore(&thi->t_lock, flags);
1711 goto restart;
1712 }
1713
1714 thi->task = NULL;
1715 thi->t_state = None;
1716 smp_mb();
1717 complete(&thi->stop);
1718 spin_unlock_irqrestore(&thi->t_lock, flags);
1719
1720 dev_info(DEV, "Terminating %s\n", current->comm);
1721
1722 /* Release mod reference taken when thread was started */
1723 module_put(THIS_MODULE);
1724 return retval;
1725}
1726
1727static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1728 int (*func) (struct drbd_thread *))
1729{
1730 spin_lock_init(&thi->t_lock);
1731 thi->task = NULL;
1732 thi->t_state = None;
1733 thi->function = func;
1734 thi->mdev = mdev;
1735}
1736
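/* Start one of the per-device threads (receiver, worker or asender), or
 * request a restart if it is currently Exiting.  Takes a module reference
 * and creates the kthread as "drbd<minor>_<name>".  Returns true on success
 * or if the thread is already running, false if the module reference or
 * kthread creation fails. */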
1737int drbd_thread_start(struct drbd_thread *thi)
1738{
1739 struct drbd_conf *mdev = thi->mdev;
1740 struct task_struct *nt;
1741 unsigned long flags;
1742
1743 const char *me =
1744 thi == &mdev->receiver ? "receiver" :
1745 thi == &mdev->asender ? "asender" :
1746 thi == &mdev->worker ? "worker" : "NONSENSE";
1747
1748 /* is used from state engine doing drbd_thread_stop_nowait,
1749 * while holding the req lock irqsave */
1750 spin_lock_irqsave(&thi->t_lock, flags);
1751
1752 switch (thi->t_state) {
1753 case None:
1754 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1755 me, current->comm, current->pid);
1756
1757 /* Get ref on module for thread - this is released when thread exits */
1758 if (!try_module_get(THIS_MODULE)) {
1759 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1760 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001761 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001762 }
1763
1764 init_completion(&thi->stop);
1765 D_ASSERT(thi->task == NULL);
1766 thi->reset_cpu_mask = 1;
1767 thi->t_state = Running;
1768 spin_unlock_irqrestore(&thi->t_lock, flags);
1769 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1770
1771 nt = kthread_create(drbd_thread_setup, (void *) thi,
1772 "drbd%d_%s", mdev_to_minor(mdev), me);
1773
1774 if (IS_ERR(nt)) {
1775 dev_err(DEV, "Couldn't start thread\n");
1776
1777 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001778 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001779 }
1780 spin_lock_irqsave(&thi->t_lock, flags);
1781 thi->task = nt;
1782 thi->t_state = Running;
1783 spin_unlock_irqrestore(&thi->t_lock, flags);
1784 wake_up_process(nt);
1785 break;
1786 case Exiting:
1787 thi->t_state = Restarting;
1788 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1789 me, current->comm, current->pid);
1790 /* fall through */
1791 case Running:
1792 case Restarting:
1793 default:
1794 spin_unlock_irqrestore(&thi->t_lock, flags);
1795 break;
1796 }
1797
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001798 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001799}
1800
1801
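/* Ask a thread to terminate, or to restart if @restart is set.  If it is
 * not running and @restart is set, it is simply started.  Otherwise the new
 * state is set under t_lock, the task is signalled with DRBD_SIGKILL unless
 * we are that task ourselves, and with @wait we block on &thi->stop until
 * the thread is gone. */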
1802void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1803{
1804 unsigned long flags;
1805
1806 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1807
1808 /* may be called from state engine, holding the req lock irqsave */
1809 spin_lock_irqsave(&thi->t_lock, flags);
1810
1811 if (thi->t_state == None) {
1812 spin_unlock_irqrestore(&thi->t_lock, flags);
1813 if (restart)
1814 drbd_thread_start(thi);
1815 return;
1816 }
1817
1818 if (thi->t_state != ns) {
1819 if (thi->task == NULL) {
1820 spin_unlock_irqrestore(&thi->t_lock, flags);
1821 return;
1822 }
1823
1824 thi->t_state = ns;
1825 smp_mb();
1826 init_completion(&thi->stop);
1827 if (thi->task != current)
1828 force_sig(DRBD_SIGKILL, thi->task);
1829
1830 }
1831
1832 spin_unlock_irqrestore(&thi->t_lock, flags);
1833
1834 if (wait)
1835 wait_for_completion(&thi->stop);
1836}
1837
1838#ifdef CONFIG_SMP
1839/**
1840 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1841 * @mdev: DRBD device.
1842 *
1843 * Forces all threads of a device onto the same CPU. This is beneficial for
1844	 * DRBD's performance. May be overridden by the user's configuration.
1845 */
1846void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1847{
1848 int ord, cpu;
1849
1850 /* user override. */
1851 if (cpumask_weight(mdev->cpu_mask))
1852 return;
1853
1854 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1855 for_each_online_cpu(cpu) {
1856 if (ord-- == 0) {
1857 cpumask_set_cpu(cpu, mdev->cpu_mask);
1858 return;
1859 }
1860 }
1861 /* should not be reached */
1862 cpumask_setall(mdev->cpu_mask);
1863}
1864
1865/**
1866 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1867 * @mdev: DRBD device.
1868 *
1869 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1870 * prematurely.
1871 */
1872void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1873{
1874 struct task_struct *p = current;
1875 struct drbd_thread *thi =
1876 p == mdev->asender.task ? &mdev->asender :
1877 p == mdev->receiver.task ? &mdev->receiver :
1878 p == mdev->worker.task ? &mdev->worker :
1879 NULL;
1880 ERR_IF(thi == NULL)
1881 return;
1882 if (!thi->reset_cpu_mask)
1883 return;
1884 thi->reset_cpu_mask = 0;
1885 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1886}
1887#endif
1888
1889/* the appropriate socket mutex must be held already */
1890int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001891 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001892 size_t size, unsigned msg_flags)
1893{
1894 int sent, ok;
1895
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001896 ERR_IF(!h) return false;
1897 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001898
1899 h->magic = BE_DRBD_MAGIC;
1900 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001901 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001902
Philipp Reisnerb411b362009-09-25 16:07:19 -07001903 sent = drbd_send(mdev, sock, h, size, msg_flags);
1904
1905 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001906 if (!ok && !signal_pending(current))
1907 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001908 cmdname(cmd), (int)size, sent);
1909 return ok;
1910}
1911
1912/* don't pass the socket. we may only look at it
1913 * when we hold the appropriate socket mutex.
1914 */
1915int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001916 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001917{
1918 int ok = 0;
1919 struct socket *sock;
1920
1921 if (use_data_socket) {
1922 mutex_lock(&mdev->data.mutex);
1923 sock = mdev->data.socket;
1924 } else {
1925 mutex_lock(&mdev->meta.mutex);
1926 sock = mdev->meta.socket;
1927 }
1928
1929 /* drbd_disconnect() could have called drbd_free_sock()
1930 * while we were waiting in down()... */
1931 if (likely(sock != NULL))
1932 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1933
1934 if (use_data_socket)
1935 mutex_unlock(&mdev->data.mutex);
1936 else
1937 mutex_unlock(&mdev->meta.mutex);
1938 return ok;
1939}
1940
1941int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1942 size_t size)
1943{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001944 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001945 int ok;
1946
1947 h.magic = BE_DRBD_MAGIC;
1948 h.command = cpu_to_be16(cmd);
1949 h.length = cpu_to_be16(size);
1950
1951 if (!drbd_get_data_sock(mdev))
1952 return 0;
1953
Philipp Reisnerb411b362009-09-25 16:07:19 -07001954 ok = (sizeof(h) ==
1955 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1956 ok = ok && (size ==
1957 drbd_send(mdev, mdev->data.socket, data, size, 0));
1958
1959 drbd_put_data_sock(mdev);
1960
1961 return ok;
1962}
1963
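/* Send the syncer configuration to the peer (P_SYNC_PARAM, or P_SYNC_PARAM89
 * from protocol version 89 on).  The on-the-wire size depends on the agreed
 * protocol version: verify_alg is included from apv 88, csums_alg from
 * apv 89, and the resync controller settings from apv 95 on. */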
1964int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1965{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001966 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001967 struct socket *sock;
1968 int size, rv;
1969 const int apv = mdev->agreed_pro_version;
1970
1971 size = apv <= 87 ? sizeof(struct p_rs_param)
1972 : apv == 88 ? sizeof(struct p_rs_param)
1973 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001974 : apv <= 94 ? sizeof(struct p_rs_param_89)
1975 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001976
1977 /* used from admin command context and receiver/worker context.
1978 * to avoid kmalloc, grab the socket right here,
1979 * then use the pre-allocated sbuf there */
1980 mutex_lock(&mdev->data.mutex);
1981 sock = mdev->data.socket;
1982
1983 if (likely(sock != NULL)) {
1984 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1985
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001986 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001987
1988 /* initialize verify_alg and csums_alg */
1989 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1990
1991 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001992 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1993 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1994 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1995 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001996
1997 if (apv >= 88)
1998 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1999 if (apv >= 89)
2000 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2001
2002 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2003 } else
2004 rv = 0; /* not ok */
2005
2006 mutex_unlock(&mdev->data.mutex);
2007
2008 return rv;
2009}
2010
2011int drbd_send_protocol(struct drbd_conf *mdev)
2012{
2013 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002014 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002015
2016 size = sizeof(struct p_protocol);
2017
2018 if (mdev->agreed_pro_version >= 87)
2019 size += strlen(mdev->net_conf->integrity_alg) + 1;
2020
2021 /* we must not recurse into our own queue,
2022 * as that is blocked during handshake */
2023 p = kmalloc(size, GFP_NOIO);
2024 if (p == NULL)
2025 return 0;
2026
2027 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2028 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2029 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2030 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002031 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2032
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002033 cf = 0;
2034 if (mdev->net_conf->want_lose)
2035 cf |= CF_WANT_LOSE;
2036 if (mdev->net_conf->dry_run) {
2037 if (mdev->agreed_pro_version >= 92)
2038 cf |= CF_DRY_RUN;
2039 else {
2040 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02002041 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002042 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002043 }
2044 }
2045 p->conn_flags = cpu_to_be32(cf);
2046
Philipp Reisnerb411b362009-09-25 16:07:19 -07002047 if (mdev->agreed_pro_version >= 87)
2048 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2049
2050 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002051 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002052 kfree(p);
2053 return rv;
2054}
2055
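/* Send our UUID set to the peer (P_UUIDS), together with the current number
 * of set bits in the bitmap and a flag word: bit 0 = want_lose ("discard my
 * data"), bit 1 = crashed primary, bit 2 = to-be disk state is
 * D_INCONSISTENT.  drbd_send_uuids() passes no extra flags;
 * drbd_send_uuids_skip_initial_sync() additionally sets bit 3. */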
2056int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2057{
2058 struct p_uuids p;
2059 int i;
2060
2061 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2062 return 1;
2063
2064 for (i = UI_CURRENT; i < UI_SIZE; i++)
2065 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2066
2067 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2068 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2069 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2070 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2071 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2072 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2073
2074 put_ldev(mdev);
2075
2076 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002077 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002078}
2079
2080int drbd_send_uuids(struct drbd_conf *mdev)
2081{
2082 return _drbd_send_uuids(mdev, 0);
2083}
2084
2085int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2086{
2087 return _drbd_send_uuids(mdev, 8);
2088}
2089
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002090void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2091{
2092 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2093 u64 *uuid = mdev->ldev->md.uuid;
2094 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2095 text,
2096 (unsigned long long)uuid[UI_CURRENT],
2097 (unsigned long long)uuid[UI_BITMAP],
2098 (unsigned long long)uuid[UI_HISTORY_START],
2099 (unsigned long long)uuid[UI_HISTORY_END]);
2100 put_ldev(mdev);
2101 } else {
2102 dev_info(DEV, "%s effective data uuid: %016llX\n",
2103 text,
2104 (unsigned long long)mdev->ed_uuid);
2105 }
2106}
2107
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002108int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002109{
2110 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002111 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002112
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002113 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2114
Philipp Reisner4a23f262011-01-11 17:42:17 +01002115 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002116 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002117 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002118 drbd_md_sync(mdev);
2119 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002120
2121 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002122 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002123}
2124
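/* Send our disk geometry to the peer (P_SIZES): backing device capacity,
 * user-configured size, current device capacity, maximum bio size and queue
 * order type.  For peers up to protocol version 94 the maximum bio size is
 * capped at DRBD_MAX_SIZE_H80_PACKET (32 KiB). */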
Philipp Reisnere89b5912010-03-24 17:11:33 +01002125int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002126{
2127 struct p_sizes p;
2128 sector_t d_size, u_size;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002129 int q_order_type, max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002130 int ok;
2131
2132 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2133 D_ASSERT(mdev->ldev->backing_bdev);
2134 d_size = drbd_get_max_capacity(mdev->ldev);
2135 u_size = mdev->ldev->dc.disk_size;
2136 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002137 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2138 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002139 put_ldev(mdev);
2140 } else {
2141 d_size = 0;
2142 u_size = 0;
2143 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002144 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002145 }
2146
Philipp Reisner68093842011-06-30 15:43:06 +02002147 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2148 if (mdev->agreed_pro_version <= 94)
2149 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2150
Philipp Reisnerb411b362009-09-25 16:07:19 -07002151 p.d_size = cpu_to_be64(d_size);
2152 p.u_size = cpu_to_be64(u_size);
2153 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002154 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002155 p.queue_order_type = cpu_to_be16(q_order_type);
2156 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002157
2158 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002159 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002160 return ok;
2161}
2162
2163/**
2164 * drbd_send_state() - Sends the drbd state to the peer
2165 * @mdev: DRBD device.
2166 */
2167int drbd_send_state(struct drbd_conf *mdev)
2168{
2169 struct socket *sock;
2170 struct p_state p;
2171 int ok = 0;
2172
2173	 /* Grab state lock so we won't send state if we're in the middle
2174	 * of a cluster-wide state change on another thread */
2175 drbd_state_lock(mdev);
2176
2177 mutex_lock(&mdev->data.mutex);
2178
2179 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2180 sock = mdev->data.socket;
2181
2182 if (likely(sock != NULL)) {
2183 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002184 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002185 }
2186
2187 mutex_unlock(&mdev->data.mutex);
2188
2189 drbd_state_unlock(mdev);
2190 return ok;
2191}
2192
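/* Ask the peer to apply a state change: P_STATE_CHG_REQ carries the mask of
 * state bits to be changed and their requested new values. */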
2193int drbd_send_state_req(struct drbd_conf *mdev,
2194 union drbd_state mask, union drbd_state val)
2195{
2196 struct p_req_state p;
2197
2198 p.mask = cpu_to_be32(mask.i);
2199 p.val = cpu_to_be32(val.i);
2200
2201 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002202 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002203}
2204
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002205int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002206{
2207 struct p_req_state_reply p;
2208
2209 p.retcode = cpu_to_be32(retcode);
2210
2211 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002212 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002213}
2214
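/* Run-length encode a chunk of the bitmap into p->code, using
 * vli_encode_bits() for the run lengths.  Returns the number of code bytes
 * produced, 0 if RLE is disabled, not applicable, or did not compress this
 * chunk, and -1 if a zero run length is seen (bitmap changed under us). */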
2215int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2216 struct p_compressed_bm *p,
2217 struct bm_xfer_ctx *c)
2218{
2219 struct bitstream bs;
2220 unsigned long plain_bits;
2221 unsigned long tmp;
2222 unsigned long rl;
2223 unsigned len;
2224 unsigned toggle;
2225 int bits;
2226
2227 /* may we use this feature? */
2228 if ((mdev->sync_conf.use_rle == 0) ||
2229 (mdev->agreed_pro_version < 90))
2230 return 0;
2231
2232 if (c->bit_offset >= c->bm_bits)
2233 return 0; /* nothing to do. */
2234
2235 /* use at most thus many bytes */
2236 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2237 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2238 /* plain bits covered in this code string */
2239 plain_bits = 0;
2240
2241 /* p->encoding & 0x80 stores whether the first run length is set.
2242 * bit offset is implicit.
2243 * start with toggle == 2 to be able to tell the first iteration */
2244 toggle = 2;
2245
2246	 /* see how many plain bits we can stuff into one packet
2247 * using RLE and VLI. */
2248 do {
2249 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2250 : _drbd_bm_find_next(mdev, c->bit_offset);
2251 if (tmp == -1UL)
2252 tmp = c->bm_bits;
2253 rl = tmp - c->bit_offset;
2254
2255 if (toggle == 2) { /* first iteration */
2256 if (rl == 0) {
2257 /* the first checked bit was set,
2258 * store start value, */
2259 DCBP_set_start(p, 1);
2260 /* but skip encoding of zero run length */
2261 toggle = !toggle;
2262 continue;
2263 }
2264 DCBP_set_start(p, 0);
2265 }
2266
2267 /* paranoia: catch zero runlength.
2268 * can only happen if bitmap is modified while we scan it. */
2269 if (rl == 0) {
2270 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2271 "t:%u bo:%lu\n", toggle, c->bit_offset);
2272 return -1;
2273 }
2274
2275 bits = vli_encode_bits(&bs, rl);
2276 if (bits == -ENOBUFS) /* buffer full */
2277 break;
2278 if (bits <= 0) {
2279 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2280 return 0;
2281 }
2282
2283 toggle = !toggle;
2284 plain_bits += rl;
2285 c->bit_offset = tmp;
2286 } while (c->bit_offset < c->bm_bits);
2287
2288 len = bs.cur.b - p->code + !!bs.cur.bit;
2289
2290 if (plain_bits < (len << 3)) {
2291 /* incompressible with this method.
2292 * we need to rewind both word and bit position. */
2293 c->bit_offset -= plain_bits;
2294 bm_xfer_ctx_bit_to_word_offset(c);
2295 c->bit_offset = c->word_offset * BITS_PER_LONG;
2296 return 0;
2297 }
2298
2299 /* RLE + VLI was able to compress it just fine.
2300 * update c->word_offset. */
2301 bm_xfer_ctx_bit_to_word_offset(c);
2302
2303 /* store pad_bits */
2304 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2305
2306 return len;
2307}
2308
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002309/**
2310 * send_bitmap_rle_or_plain
2311 *
2312 * Return 0 when done, 1 when another iteration is needed, and a negative error
2313 * code upon failure.
2314 */
2315static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002316send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002317 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002318{
2319 struct p_compressed_bm *p = (void*)h;
2320 unsigned long num_words;
2321 int len;
2322 int ok;
2323
2324 len = fill_bitmap_rle_bits(mdev, p, c);
2325
2326 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002327 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002328
2329 if (len) {
2330 DCBP_set_code(p, RLE_VLI_Bits);
2331 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2332 sizeof(*p) + len, 0);
2333
2334 c->packets[0]++;
2335 c->bytes[0] += sizeof(*p) + len;
2336
2337 if (c->bit_offset >= c->bm_bits)
2338 len = 0; /* DONE */
2339 } else {
2340 /* was not compressible.
2341 * send a buffer full of plain text bits instead. */
2342 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2343 len = num_words * sizeof(long);
2344 if (len)
2345 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2346 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002347 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002348 c->word_offset += num_words;
2349 c->bit_offset = c->word_offset * BITS_PER_LONG;
2350
2351 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002352 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002353
2354 if (c->bit_offset > c->bm_bits)
2355 c->bit_offset = c->bm_bits;
2356 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002357 if (ok) {
2358 if (len == 0) {
2359 INFO_bm_xfer_stats(mdev, "send", c);
2360 return 0;
2361 } else
2362 return 1;
2363 }
2364 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002365}
2366
2367/* See the comment at receive_bitmap() */
2368int _drbd_send_bitmap(struct drbd_conf *mdev)
2369{
2370 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002371 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002372 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002373
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002374 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002375
2376 /* maybe we should use some per thread scratch page,
2377 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002378 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002379 if (!p) {
2380 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002381 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002382 }
2383
2384 if (get_ldev(mdev)) {
2385 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2386 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2387 drbd_bm_set_all(mdev);
2388 if (drbd_bm_write(mdev)) {
2389 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2390 * but otherwise process as per normal - need to tell other
2391 * side that a full resync is required! */
2392 dev_err(DEV, "Failed to write bitmap to disk!\n");
2393 } else {
2394 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2395 drbd_md_sync(mdev);
2396 }
2397 }
2398 put_ldev(mdev);
2399 }
2400
2401 c = (struct bm_xfer_ctx) {
2402 .bm_bits = drbd_bm_bits(mdev),
2403 .bm_words = drbd_bm_words(mdev),
2404 };
2405
2406 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002407 err = send_bitmap_rle_or_plain(mdev, p, &c);
2408 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002409
2410 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002411 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002412}
2413
2414int drbd_send_bitmap(struct drbd_conf *mdev)
2415{
2416 int err;
2417
2418 if (!drbd_get_data_sock(mdev))
2419 return -1;
2420 err = !_drbd_send_bitmap(mdev);
2421 drbd_put_data_sock(mdev);
2422 return err;
2423}
2424
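/* Acknowledge a write barrier to the peer (P_BARRIER_ACK), reporting the
 * barrier number and the size (set_size) of the corresponding epoch. */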
2425int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2426{
2427 int ok;
2428 struct p_barrier_ack p;
2429
2430 p.barrier = barrier_nr;
2431 p.set_size = cpu_to_be32(set_size);
2432
2433 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002434 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002435 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002436 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002437 return ok;
2438}
2439
2440/**
2441 * _drbd_send_ack() - Sends an ack packet
2442 * @mdev: DRBD device.
2443 * @cmd: Packet command code.
2444 * @sector: sector, needs to be in big endian byte order
2445 * @blksize: size in byte, needs to be in big endian byte order
2446 * @block_id: Id, big endian byte order
2447 */
2448static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2449 u64 sector,
2450 u32 blksize,
2451 u64 block_id)
2452{
2453 int ok;
2454 struct p_block_ack p;
2455
2456 p.sector = sector;
2457 p.block_id = block_id;
2458 p.blksize = blksize;
2459 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2460
2461 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002462 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002463 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002464 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002465 return ok;
2466}
2467
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002468/* dp->sector and dp->block_id already/still in network byte order,
2469 * data_size is payload size according to dp->head,
2470 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002471int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002472 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002473{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002474 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2475 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002476 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2477 dp->block_id);
2478}
2479
2480int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2481 struct p_block_req *rp)
2482{
2483 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2484}
2485
2486/**
2487 * drbd_send_ack() - Sends an ack packet
2488 * @mdev: DRBD device.
2489 * @cmd: Packet command code.
2490 * @e: Epoch entry.
2491 */
2492int drbd_send_ack(struct drbd_conf *mdev,
2493 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2494{
2495 return _drbd_send_ack(mdev, cmd,
2496 cpu_to_be64(e->sector),
2497 cpu_to_be32(e->size),
2498 e->block_id);
2499}
2500
2501/* This function misuses the block_id field to signal if the blocks
2502	 * are in sync or not. */
2503int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2504 sector_t sector, int blksize, u64 block_id)
2505{
2506 return _drbd_send_ack(mdev, cmd,
2507 cpu_to_be64(sector),
2508 cpu_to_be32(blksize),
2509 cpu_to_be64(block_id));
2510}
2511
2512int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2513 sector_t sector, int size, u64 block_id)
2514{
2515 int ok;
2516 struct p_block_req p;
2517
2518 p.sector = cpu_to_be64(sector);
2519 p.block_id = block_id;
2520 p.blksize = cpu_to_be32(size);
2521
2522 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002523 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002524 return ok;
2525}
2526
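/* Like drbd_send_drequest(), but with a checksum digest appended to the
 * block request.  The header is built by hand so that the digest can be
 * sent in the same packet on the data socket. */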
2527int drbd_send_drequest_csum(struct drbd_conf *mdev,
2528 sector_t sector, int size,
2529 void *digest, int digest_size,
2530 enum drbd_packets cmd)
2531{
2532 int ok;
2533 struct p_block_req p;
2534
2535 p.sector = cpu_to_be64(sector);
2536 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2537 p.blksize = cpu_to_be32(size);
2538
2539 p.head.magic = BE_DRBD_MAGIC;
2540 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002541 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002542
2543 mutex_lock(&mdev->data.mutex);
2544
2545 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2546 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2547
2548 mutex_unlock(&mdev->data.mutex);
2549
2550 return ok;
2551}
2552
2553int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2554{
2555 int ok;
2556 struct p_block_req p;
2557
2558 p.sector = cpu_to_be64(sector);
2559 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2560 p.blksize = cpu_to_be32(size);
2561
2562 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002563 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002564 return ok;
2565}
2566
2567/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002568 * returns false if we should retry,
2569 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002570 */
2571static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2572{
2573 int drop_it;
2574 /* long elapsed = (long)(jiffies - mdev->last_received); */
2575
2576 drop_it = mdev->meta.socket == sock
2577 || !mdev->asender.task
2578 || get_t_state(&mdev->asender) != Running
2579 || mdev->state.conn < C_CONNECTED;
2580
2581 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002582 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002583
2584 drop_it = !--mdev->ko_count;
2585 if (!drop_it) {
2586 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2587 current->comm, current->pid, mdev->ko_count);
2588 request_ping(mdev);
2589 }
2590
2591 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2592}
2593
2594/* The idea of sendpage seems to be to put some kind of reference
2595 * to the page into the skb, and to hand it over to the NIC. In
2596 * this process get_page() gets called.
2597 *
2598 * As soon as the page was really sent over the network put_page()
2599 * gets called by some part of the network layer. [ NIC driver? ]
2600 *
2601 * [ get_page() / put_page() increment/decrement the count. If count
2602 * reaches 0 the page will be freed. ]
2603 *
2604 * This works nicely with pages from FSs.
2605 * But this means that in protocol A we might signal IO completion too early!
2606 *
2607 * In order not to corrupt data during a resync we must make sure
2608	 * that we do not reuse our own buffer pages (EEs) too early, therefore
2609 * we have the net_ee list.
2610 *
2611	 * XFS still seems to have problems: it submits pages with page_count == 0!
2612 * As a workaround, we disable sendpage on pages
2613 * with page_count == 0 or PageSlab.
2614 */
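/* Fallback path: kmap() the page and push it through drbd_send(), i.e. a
 * plain copying sendmsg, for pages where sendpage() must not be used. */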
2615static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002616 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002617{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002618 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002619 kunmap(page);
2620 if (sent == size)
2621 mdev->send_cnt += size>>9;
2622 return sent == size;
2623}
2624
2625static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002626 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002627{
2628 mm_segment_t oldfs = get_fs();
2629 int sent, ok;
2630 int len = size;
2631
2632 /* e.g. XFS meta- & log-data is in slab pages, which have a
2633 * page_count of 0 and/or have PageSlab() set.
2634 * we cannot use send_page for those, as that does get_page();
2635 * put_page(); and would cause either a VM_BUG directly, or
2636 * __page_cache_release a page that would actually still be referenced
2637 * by someone, leading to some obscure delayed Oops somewhere else. */
2638 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002639 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002640
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002641 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002642 drbd_update_congested(mdev);
2643 set_fs(KERNEL_DS);
2644 do {
2645 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2646 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002647 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002648 if (sent == -EAGAIN) {
2649 if (we_should_drop_the_connection(mdev,
2650 mdev->data.socket))
2651 break;
2652 else
2653 continue;
2654 }
2655 if (sent <= 0) {
2656 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2657 __func__, (int)size, len, sent);
2658 break;
2659 }
2660 len -= sent;
2661 offset += sent;
2662 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2663 set_fs(oldfs);
2664 clear_bit(NET_CONGESTED, &mdev->flags);
2665
2666 ok = (len == 0);
2667 if (likely(ok))
2668 mdev->send_cnt += size>>9;
2669 return ok;
2670}
2671
2672static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2673{
2674 struct bio_vec *bvec;
2675 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002676 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002677 __bio_for_each_segment(bvec, bio, i, 0) {
2678 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002679 bvec->bv_offset, bvec->bv_len,
2680 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002681 return 0;
2682 }
2683 return 1;
2684}
2685
2686static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2687{
2688 struct bio_vec *bvec;
2689 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002690 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002691 __bio_for_each_segment(bvec, bio, i, 0) {
2692 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002693 bvec->bv_offset, bvec->bv_len,
2694 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002695 return 0;
2696 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002697 return 1;
2698}
2699
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002700static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2701{
2702 struct page *page = e->pages;
2703 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002704 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002705 page_chain_for_each(page) {
2706 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002707 if (!_drbd_send_page(mdev, page, 0, l,
2708 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002709 return 0;
2710 len -= l;
2711 }
2712 return 1;
2713}
2714
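/* Translate the bio rw flags of a request into DP_* flags for the wire.
 * Peers with protocol version < 95 only understand DP_RW_SYNC. */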
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002715static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2716{
2717 if (mdev->agreed_pro_version >= 95)
2718 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002719 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2720 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2721 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2722 else
Jens Axboe721a9602011-03-09 11:56:30 +01002723 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002724}
2725
Philipp Reisnerb411b362009-09-25 16:07:19 -07002726/* Used to send write requests
2727 * R_PRIMARY -> Peer (P_DATA)
2728 */
2729int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2730{
2731 int ok = 1;
2732 struct p_data p;
2733 unsigned int dp_flags = 0;
2734 void *dgb;
2735 int dgs;
2736
2737 if (!drbd_get_data_sock(mdev))
2738 return 0;
2739
2740 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2741 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2742
Philipp Reisnerd5373382010-08-23 15:18:33 +02002743 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002744 p.head.h80.magic = BE_DRBD_MAGIC;
2745 p.head.h80.command = cpu_to_be16(P_DATA);
2746 p.head.h80.length =
2747 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2748 } else {
2749 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2750 p.head.h95.command = cpu_to_be16(P_DATA);
2751 p.head.h95.length =
2752 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2753 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002754
2755 p.sector = cpu_to_be64(req->sector);
2756 p.block_id = (unsigned long)req;
2757 p.seq_num = cpu_to_be32(req->seq_num =
2758 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002759
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002760 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2761
Philipp Reisnerb411b362009-09-25 16:07:19 -07002762 if (mdev->state.conn >= C_SYNC_SOURCE &&
2763 mdev->state.conn <= C_PAUSED_SYNC_T)
2764 dp_flags |= DP_MAY_SET_IN_SYNC;
2765
2766 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002767 set_bit(UNPLUG_REMOTE, &mdev->flags);
2768 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002769 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002770 if (ok && dgs) {
2771 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002772 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002773 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002774 }
2775 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002776 /* For protocol A, we have to memcpy the payload into
2777 * socket buffers, as we may complete right away
2778 * as soon as we handed it over to tcp, at which point the data
2779 * pages may become invalid.
2780 *
2781	 * With data integrity enabled, we copy it as well, so we can be
2782	 * sure that even if the bio pages are still being modified, that
2783	 * won't change the data on the wire; thus if the digest checks
2784	 * out ok after sending on this side, but does not match on the
2785	 * receiving side, we have certainly detected corruption elsewhere.
2786 */
2787 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002788 ok = _drbd_send_bio(mdev, req->master_bio);
2789 else
2790 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002791
2792 /* double check digest, sometimes buffers have been modified in flight. */
2793 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002794 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002795 * currently supported in kernel crypto. */
2796 unsigned char digest[64];
2797 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2798 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2799 dev_warn(DEV,
2800 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2801 (unsigned long long)req->sector, req->size);
2802 }
2803 } /* else if (dgs > 64) {
2804 ... Be noisy about digest too large ...
2805 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002806 }
2807
2808 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002809
Philipp Reisnerb411b362009-09-25 16:07:19 -07002810 return ok;
2811}
2812
2813/* answer packet, used to send data back for read requests:
2814 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2815 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2816 */
2817int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2818 struct drbd_epoch_entry *e)
2819{
2820 int ok;
2821 struct p_data p;
2822 void *dgb;
2823 int dgs;
2824
2825 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2826 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2827
Philipp Reisnerd5373382010-08-23 15:18:33 +02002828 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002829 p.head.h80.magic = BE_DRBD_MAGIC;
2830 p.head.h80.command = cpu_to_be16(cmd);
2831 p.head.h80.length =
2832 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2833 } else {
2834 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2835 p.head.h95.command = cpu_to_be16(cmd);
2836 p.head.h95.length =
2837 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2838 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002839
2840 p.sector = cpu_to_be64(e->sector);
2841 p.block_id = e->block_id;
2842 /* p.seq_num = 0; No sequence numbers here.. */
2843
2844 /* Only called by our kernel thread.
2845 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2846 * in response to admin command or module unload.
2847 */
2848 if (!drbd_get_data_sock(mdev))
2849 return 0;
2850
Philipp Reisner0b70a132010-08-20 13:36:10 +02002851 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002852 if (ok && dgs) {
2853 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002854 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002855 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002856 }
2857 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002858 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002859
2860 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002861
Philipp Reisnerb411b362009-09-25 16:07:19 -07002862 return ok;
2863}
2864
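/* Report the sector range of @req as out of sync to the peer
 * (P_OUT_OF_SYNC); only sector and block size are sent, not the data. */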
Philipp Reisner73a01a12010-10-27 14:33:00 +02002865int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2866{
2867 struct p_block_desc p;
2868
2869 p.sector = cpu_to_be64(req->sector);
2870 p.blksize = cpu_to_be32(req->size);
2871
2872 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2873}
2874
Philipp Reisnerb411b362009-09-25 16:07:19 -07002875/*
2876 drbd_send distinguishes two cases:
2877
2878 Packets sent via the data socket "sock"
2879 and packets sent via the meta data socket "msock"
2880
2881	                     sock                      msock
2882	 -----------------+-------------------------+------------------------------
2883	  timeout           conf.timeout / 2          conf.timeout / 2
2884	  timeout action    send a ping via msock     Abort communication
2885	                                              and close all sockets
2886*/
2887
2888/*
2889 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2890 */
2891int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2892 void *buf, size_t size, unsigned msg_flags)
2893{
2894 struct kvec iov;
2895 struct msghdr msg;
2896 int rv, sent = 0;
2897
2898 if (!sock)
2899 return -1000;
2900
2901 /* THINK if (signal_pending) return ... ? */
2902
2903 iov.iov_base = buf;
2904 iov.iov_len = size;
2905
2906 msg.msg_name = NULL;
2907 msg.msg_namelen = 0;
2908 msg.msg_control = NULL;
2909 msg.msg_controllen = 0;
2910 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2911
2912 if (sock == mdev->data.socket) {
2913 mdev->ko_count = mdev->net_conf->ko_count;
2914 drbd_update_congested(mdev);
2915 }
2916 do {
2917 /* STRANGE
2918 * tcp_sendmsg does _not_ use its size parameter at all ?
2919 *
2920 * -EAGAIN on timeout, -EINTR on signal.
2921 */
2922/* THINK
2923 * do we need to block DRBD_SIG if sock == &meta.socket ??
2924 * otherwise wake_asender() might interrupt some send_*Ack !
2925 */
2926 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2927 if (rv == -EAGAIN) {
2928 if (we_should_drop_the_connection(mdev, sock))
2929 break;
2930 else
2931 continue;
2932 }
2933 D_ASSERT(rv != 0);
2934 if (rv == -EINTR) {
2935 flush_signals(current);
2936 rv = 0;
2937 }
2938 if (rv < 0)
2939 break;
2940 sent += rv;
2941 iov.iov_base += rv;
2942 iov.iov_len -= rv;
2943 } while (sent < size);
2944
2945 if (sock == mdev->data.socket)
2946 clear_bit(NET_CONGESTED, &mdev->flags);
2947
2948 if (rv <= 0) {
2949 if (rv != -EAGAIN) {
2950 dev_err(DEV, "%s_sendmsg returned %d\n",
2951 sock == mdev->meta.socket ? "msock" : "sock",
2952 rv);
2953 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2954 } else
2955 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2956 }
2957
2958 return sent;
2959}
2960
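/*
 * Illustrative sketch, not part of the driver: how a caller is expected to
 * use drbd_send() on the data socket.  As the comment above drbd_send()
 * says, the matching [m]sock mutex must already be held; the helper below
 * is hypothetical and only shows the locking pattern.
 */
#if 0
static int example_send_on_data_sock(struct drbd_conf *mdev, void *buf, size_t size)
{
	int sent;

	mutex_lock(&mdev->data.mutex);	/* serialize users of mdev->data.socket */
	sent = drbd_send(mdev, mdev->data.socket, buf, size, 0);
	mutex_unlock(&mdev->data.mutex);

	return sent == (int)size;	/* drbd_send() returns the byte count, or < 0 */
}
#endif
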
2961static int drbd_open(struct block_device *bdev, fmode_t mode)
2962{
2963 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2964 unsigned long flags;
2965 int rv = 0;
2966
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002967 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002968 spin_lock_irqsave(&mdev->req_lock, flags);
2969 /* to have a stable mdev->state.role
2970 * and no race with updating open_cnt */
2971
2972 if (mdev->state.role != R_PRIMARY) {
2973 if (mode & FMODE_WRITE)
2974 rv = -EROFS;
2975 else if (!allow_oos)
2976 rv = -EMEDIUMTYPE;
2977 }
2978
2979 if (!rv)
2980 mdev->open_cnt++;
2981 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002982 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002983
2984 return rv;
2985}
2986
2987static int drbd_release(struct gendisk *gd, fmode_t mode)
2988{
2989 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002990 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002991 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002992 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002993 return 0;
2994}
2995
Philipp Reisnerb411b362009-09-25 16:07:19 -07002996static void drbd_set_defaults(struct drbd_conf *mdev)
2997{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002998	/* This way we get a compile error when sync_conf grows,
2999	   and we forget to initialize it here */
3000 mdev->sync_conf = (struct syncer_conf) {
3001 /* .rate = */ DRBD_RATE_DEF,
3002 /* .after = */ DRBD_AFTER_DEF,
3003 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02003004 /* .verify_alg = */ {}, 0,
3005 /* .cpu_mask = */ {}, 0,
3006 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02003007 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02003008 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3009 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3010 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3011 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003012 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3013 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02003014 };
3015
3016	/* Have to do it this way, because the layout differs between
3017	   big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003018 mdev->state = (union drbd_state) {
3019 { .role = R_SECONDARY,
3020 .peer = R_UNKNOWN,
3021 .conn = C_STANDALONE,
3022 .disk = D_DISKLESS,
3023 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02003024 .susp = 0,
3025 .susp_nod = 0,
3026 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07003027 } };
3028}
3029
3030void drbd_init_set_defaults(struct drbd_conf *mdev)
3031{
3032 /* the memset(,0,) did most of this.
3033 * note: only assignments, no allocation in here */
3034
3035 drbd_set_defaults(mdev);
3036
Philipp Reisnerb411b362009-09-25 16:07:19 -07003037 atomic_set(&mdev->ap_bio_cnt, 0);
3038 atomic_set(&mdev->ap_pending_cnt, 0);
3039 atomic_set(&mdev->rs_pending_cnt, 0);
3040 atomic_set(&mdev->unacked_cnt, 0);
3041 atomic_set(&mdev->local_cnt, 0);
3042 atomic_set(&mdev->net_cnt, 0);
3043 atomic_set(&mdev->packet_seq, 0);
3044 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003045 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02003046 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003047 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02003048 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnere1711732011-06-27 11:51:46 +02003049 atomic_set(&mdev->md_io_in_use, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003050
Philipp Reisnerb411b362009-09-25 16:07:19 -07003051 mutex_init(&mdev->data.mutex);
3052 mutex_init(&mdev->meta.mutex);
3053 sema_init(&mdev->data.work.s, 0);
3054 sema_init(&mdev->meta.work.s, 0);
3055 mutex_init(&mdev->state_mutex);
3056
3057 spin_lock_init(&mdev->data.work.q_lock);
3058 spin_lock_init(&mdev->meta.work.q_lock);
3059
3060 spin_lock_init(&mdev->al_lock);
3061 spin_lock_init(&mdev->req_lock);
3062 spin_lock_init(&mdev->peer_seq_lock);
3063 spin_lock_init(&mdev->epoch_lock);
3064
3065 INIT_LIST_HEAD(&mdev->active_ee);
3066 INIT_LIST_HEAD(&mdev->sync_ee);
3067 INIT_LIST_HEAD(&mdev->done_ee);
3068 INIT_LIST_HEAD(&mdev->read_ee);
3069 INIT_LIST_HEAD(&mdev->net_ee);
3070 INIT_LIST_HEAD(&mdev->resync_reads);
3071 INIT_LIST_HEAD(&mdev->data.work.q);
3072 INIT_LIST_HEAD(&mdev->meta.work.q);
3073 INIT_LIST_HEAD(&mdev->resync_work.list);
3074 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003075 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003076 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003077 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003078 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003079
Philipp Reisner794abb72010-12-27 11:51:23 +01003080 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003081 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003082 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003083 mdev->md_sync_work.cb = w_md_sync;
3084 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003085 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003086 init_timer(&mdev->resync_timer);
3087 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003088 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003089 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003090 mdev->resync_timer.function = resync_timer_fn;
3091 mdev->resync_timer.data = (unsigned long) mdev;
3092 mdev->md_sync_timer.function = md_sync_timer_fn;
3093 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003094 mdev->start_resync_timer.function = start_resync_timer_fn;
3095 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003096 mdev->request_timer.function = request_timer_fn;
3097 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003098
3099 init_waitqueue_head(&mdev->misc_wait);
3100 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003101 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003102 init_waitqueue_head(&mdev->ee_wait);
3103 init_waitqueue_head(&mdev->al_wait);
3104 init_waitqueue_head(&mdev->seq_wait);
3105
3106 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3107 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3108 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3109
3110 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003111 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003112 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003113 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3114 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003115}
3116
3117void drbd_mdev_cleanup(struct drbd_conf *mdev)
3118{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003119 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003120 if (mdev->receiver.t_state != None)
3121 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3122 mdev->receiver.t_state);
3123
3124 /* no need to lock it, I'm the only thread alive */
3125 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3126 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3127 mdev->al_writ_cnt =
3128 mdev->bm_writ_cnt =
3129 mdev->read_cnt =
3130 mdev->recv_cnt =
3131 mdev->send_cnt =
3132 mdev->writ_cnt =
3133 mdev->p_size =
3134 mdev->rs_start =
3135 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003136 mdev->rs_failed = 0;
3137 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003138 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003139 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3140 mdev->rs_mark_left[i] = 0;
3141 mdev->rs_mark_time[i] = 0;
3142 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003143 D_ASSERT(mdev->net_conf == NULL);
3144
3145 drbd_set_my_capacity(mdev, 0);
3146 if (mdev->bitmap) {
3147 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003148 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003149 drbd_bm_cleanup(mdev);
3150 }
3151
3152 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003153 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003154
3155 /*
3156	 * currently we call drbd_init_ee only on module load, so
3157	 * we may call drbd_release_ee only on module unload!
3158 */
3159 D_ASSERT(list_empty(&mdev->active_ee));
3160 D_ASSERT(list_empty(&mdev->sync_ee));
3161 D_ASSERT(list_empty(&mdev->done_ee));
3162 D_ASSERT(list_empty(&mdev->read_ee));
3163 D_ASSERT(list_empty(&mdev->net_ee));
3164 D_ASSERT(list_empty(&mdev->resync_reads));
3165 D_ASSERT(list_empty(&mdev->data.work.q));
3166 D_ASSERT(list_empty(&mdev->meta.work.q));
3167 D_ASSERT(list_empty(&mdev->resync_work.list));
3168 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003169 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003170
3171 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003172}
3173
3174
3175static void drbd_destroy_mempools(void)
3176{
3177 struct page *page;
3178
3179 while (drbd_pp_pool) {
3180 page = drbd_pp_pool;
3181 drbd_pp_pool = (struct page *)page_private(page);
3182 __free_page(page);
3183 drbd_pp_vacant--;
3184 }
3185
3186 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3187
3188 if (drbd_ee_mempool)
3189 mempool_destroy(drbd_ee_mempool);
3190 if (drbd_request_mempool)
3191 mempool_destroy(drbd_request_mempool);
3192 if (drbd_ee_cache)
3193 kmem_cache_destroy(drbd_ee_cache);
3194 if (drbd_request_cache)
3195 kmem_cache_destroy(drbd_request_cache);
3196 if (drbd_bm_ext_cache)
3197 kmem_cache_destroy(drbd_bm_ext_cache);
3198 if (drbd_al_ext_cache)
3199 kmem_cache_destroy(drbd_al_ext_cache);
3200
3201 drbd_ee_mempool = NULL;
3202 drbd_request_mempool = NULL;
3203 drbd_ee_cache = NULL;
3204 drbd_request_cache = NULL;
3205 drbd_bm_ext_cache = NULL;
3206 drbd_al_ext_cache = NULL;
3207
3208 return;
3209}
3210
3211static int drbd_create_mempools(void)
3212{
3213 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003214 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003215 int i;
3216
3217 /* prepare our caches and mempools */
3218 drbd_request_mempool = NULL;
3219 drbd_ee_cache = NULL;
3220 drbd_request_cache = NULL;
3221 drbd_bm_ext_cache = NULL;
3222 drbd_al_ext_cache = NULL;
3223 drbd_pp_pool = NULL;
3224
3225 /* caches */
3226 drbd_request_cache = kmem_cache_create(
3227 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3228 if (drbd_request_cache == NULL)
3229 goto Enomem;
3230
3231 drbd_ee_cache = kmem_cache_create(
3232 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3233 if (drbd_ee_cache == NULL)
3234 goto Enomem;
3235
3236 drbd_bm_ext_cache = kmem_cache_create(
3237 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3238 if (drbd_bm_ext_cache == NULL)
3239 goto Enomem;
3240
3241 drbd_al_ext_cache = kmem_cache_create(
3242 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3243 if (drbd_al_ext_cache == NULL)
3244 goto Enomem;
3245
3246 /* mempools */
3247 drbd_request_mempool = mempool_create(number,
3248 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3249 if (drbd_request_mempool == NULL)
3250 goto Enomem;
3251
3252 drbd_ee_mempool = mempool_create(number,
3253 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003254 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003255 goto Enomem;
3256
3257 /* drbd's page pool */
3258 spin_lock_init(&drbd_pp_lock);
3259
3260 for (i = 0; i < number; i++) {
3261 page = alloc_page(GFP_HIGHUSER);
3262 if (!page)
3263 goto Enomem;
3264 set_page_private(page, (unsigned long)drbd_pp_pool);
3265 drbd_pp_pool = page;
3266 }
3267 drbd_pp_vacant = number;
3268
3269 return 0;
3270
3271Enomem:
3272 drbd_destroy_mempools(); /* in case we allocated some */
3273 return -ENOMEM;
3274}
3275
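/*
 * Illustrative sketch, not part of the driver: what the mempools created
 * above provide.  A mempool keeps a reserve of "number" preallocated
 * objects so allocations in the IO path can make progress under memory
 * pressure.  example_alloc_ee()/example_free_ee() are hypothetical names.
 */
#if 0
static struct drbd_epoch_entry *example_alloc_ee(gfp_t gfp_mask)
{
	struct drbd_epoch_entry *e;

	/* falls back to the preallocated reserve once the slab cache is exhausted */
	e = mempool_alloc(drbd_ee_mempool, gfp_mask);
	if (!e)
		return NULL;
	memset(e, 0, sizeof(*e));
	return e;
}

static void example_free_ee(struct drbd_epoch_entry *e)
{
	/* returns the element to the reserve if it is depleted, otherwise to the slab cache */
	mempool_free(e, drbd_ee_mempool);
}
#endif
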
3276static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3277 void *unused)
3278{
3279 /* just so we have it. you never know what interesting things we
3280 * might want to do here some day...
3281 */
3282
3283 return NOTIFY_DONE;
3284}
3285
3286static struct notifier_block drbd_notifier = {
3287 .notifier_call = drbd_notify_sys,
3288};
3289
3290static void drbd_release_ee_lists(struct drbd_conf *mdev)
3291{
3292 int rr;
3293
3294 rr = drbd_release_ee(mdev, &mdev->active_ee);
3295 if (rr)
3296 dev_err(DEV, "%d EEs in active list found!\n", rr);
3297
3298 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3299 if (rr)
3300 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3301
3302 rr = drbd_release_ee(mdev, &mdev->read_ee);
3303 if (rr)
3304 dev_err(DEV, "%d EEs in read list found!\n", rr);
3305
3306 rr = drbd_release_ee(mdev, &mdev->done_ee);
3307 if (rr)
3308 dev_err(DEV, "%d EEs in done list found!\n", rr);
3309
3310 rr = drbd_release_ee(mdev, &mdev->net_ee);
3311 if (rr)
3312 dev_err(DEV, "%d EEs in net list found!\n", rr);
3313}
3314
3315/* caution. no locking.
3316 * currently only used from module cleanup code. */
3317static void drbd_delete_device(unsigned int minor)
3318{
3319 struct drbd_conf *mdev = minor_to_mdev(minor);
3320
3321 if (!mdev)
3322 return;
3323
Philipp Reisnerdfa8bed2011-06-29 14:06:08 +02003324 del_timer_sync(&mdev->request_timer);
3325
Philipp Reisnerb411b362009-09-25 16:07:19 -07003326 /* paranoia asserts */
3327 if (mdev->open_cnt != 0)
3328 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3329 __FILE__ , __LINE__);
3330
3331 ERR_IF (!list_empty(&mdev->data.work.q)) {
3332 struct list_head *lp;
3333 list_for_each(lp, &mdev->data.work.q) {
3334 dev_err(DEV, "lp = %p\n", lp);
3335 }
3336 };
3337 /* end paranoia asserts */
3338
3339 del_gendisk(mdev->vdisk);
3340
3341 /* cleanup stuff that may have been allocated during
3342 * device (re-)configuration or state changes */
3343
3344 if (mdev->this_bdev)
3345 bdput(mdev->this_bdev);
3346
3347 drbd_free_resources(mdev);
3348
3349 drbd_release_ee_lists(mdev);
3350
Bart Van Assche24c48302011-05-21 18:32:29 +02003351 /* should be freed on disconnect? */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003352 kfree(mdev->ee_hash);
3353 /*
3354 mdev->ee_hash_s = 0;
3355 mdev->ee_hash = NULL;
3356 */
3357
3358 lc_destroy(mdev->act_log);
3359 lc_destroy(mdev->resync);
3360
3361 kfree(mdev->p_uuid);
3362 /* mdev->p_uuid = NULL; */
3363
3364 kfree(mdev->int_dig_out);
3365 kfree(mdev->int_dig_in);
3366 kfree(mdev->int_dig_vv);
3367
3368 /* cleanup the rest that has been
3369 * allocated from drbd_new_device
3370 * and actually free the mdev itself */
3371 drbd_free_mdev(mdev);
3372}
3373
3374static void drbd_cleanup(void)
3375{
3376 unsigned int i;
3377
3378 unregister_reboot_notifier(&drbd_notifier);
3379
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003380 /* first remove proc,
3381	 * drbdsetup uses its presence to detect
3382 * whether DRBD is loaded.
3383	 * If we got stuck in proc removal,
3384	 * but had netlink already deregistered,
3385 * some drbdsetup commands may wait forever
3386 * for an answer.
3387 */
3388 if (drbd_proc)
3389 remove_proc_entry("drbd", NULL);
3390
Philipp Reisnerb411b362009-09-25 16:07:19 -07003391 drbd_nl_cleanup();
3392
3393 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003394 i = minor_count;
3395 while (i--)
3396 drbd_delete_device(i);
3397 drbd_destroy_mempools();
3398 }
3399
3400 kfree(minor_table);
3401
3402 unregister_blkdev(DRBD_MAJOR, "drbd");
3403
3404 printk(KERN_INFO "drbd: module cleanup done.\n");
3405}
3406
3407/**
3408 * drbd_congested() - Callback for pdflush
3409 * @congested_data: User data
3410 * @bdi_bits: Bits pdflush is currently interested in
3411 *
3412 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3413 */
3414static int drbd_congested(void *congested_data, int bdi_bits)
3415{
3416 struct drbd_conf *mdev = congested_data;
3417 struct request_queue *q;
3418 char reason = '-';
3419 int r = 0;
3420
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003421 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003422 /* DRBD has frozen IO */
3423 r = bdi_bits;
3424 reason = 'd';
3425 goto out;
3426 }
3427
3428 if (get_ldev(mdev)) {
3429 q = bdev_get_queue(mdev->ldev->backing_bdev);
3430 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3431 put_ldev(mdev);
3432 if (r)
3433 reason = 'b';
3434 }
3435
3436 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3437 r |= (1 << BDI_async_congested);
3438 reason = reason == 'b' ? 'a' : 'n';
3439 }
3440
3441out:
3442 mdev->congestion_reason = reason;
3443 return r;
3444}
3445
3446struct drbd_conf *drbd_new_device(unsigned int minor)
3447{
3448 struct drbd_conf *mdev;
3449 struct gendisk *disk;
3450 struct request_queue *q;
3451
3452 /* GFP_KERNEL, we are outside of all write-out paths */
3453 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3454 if (!mdev)
3455 return NULL;
3456 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3457 goto out_no_cpumask;
3458
3459 mdev->minor = minor;
3460
3461 drbd_init_set_defaults(mdev);
3462
3463 q = blk_alloc_queue(GFP_KERNEL);
3464 if (!q)
3465 goto out_no_q;
3466 mdev->rq_queue = q;
3467 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003468
3469 disk = alloc_disk(1);
3470 if (!disk)
3471 goto out_no_disk;
3472 mdev->vdisk = disk;
3473
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003474 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003475
3476 disk->queue = q;
3477 disk->major = DRBD_MAJOR;
3478 disk->first_minor = minor;
3479 disk->fops = &drbd_ops;
3480 sprintf(disk->disk_name, "drbd%d", minor);
3481 disk->private_data = mdev;
3482
3483 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3484 /* we have no partitions. we contain only ourselves. */
3485 mdev->this_bdev->bd_contains = mdev->this_bdev;
3486
3487 q->backing_dev_info.congested_fn = drbd_congested;
3488 q->backing_dev_info.congested_data = mdev;
3489
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003490 blk_queue_make_request(q, drbd_make_request);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003491	/* Setting max_hw_sectors to an odd value of 8 KiB here
3492	   triggers a max_bio_size message upon first attach or connect */
3493 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003494 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3495 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003496 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003497
3498 mdev->md_io_page = alloc_page(GFP_KERNEL);
3499 if (!mdev->md_io_page)
3500 goto out_no_io_page;
3501
3502 if (drbd_bm_init(mdev))
3503 goto out_no_bitmap;
3504 /* no need to lock access, we are still initializing this minor device. */
3505 if (!tl_init(mdev))
3506 goto out_no_tl;
3507
3508 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3509 if (!mdev->app_reads_hash)
3510 goto out_no_app_reads;
3511
3512 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3513 if (!mdev->current_epoch)
3514 goto out_no_epoch;
3515
3516 INIT_LIST_HEAD(&mdev->current_epoch->list);
3517 mdev->epochs = 1;
3518
3519 return mdev;
3520
3521/* out_whatever_else:
3522 kfree(mdev->current_epoch); */
3523out_no_epoch:
3524 kfree(mdev->app_reads_hash);
3525out_no_app_reads:
3526 tl_cleanup(mdev);
3527out_no_tl:
3528 drbd_bm_cleanup(mdev);
3529out_no_bitmap:
3530 __free_page(mdev->md_io_page);
3531out_no_io_page:
3532 put_disk(disk);
3533out_no_disk:
3534 blk_cleanup_queue(q);
3535out_no_q:
3536 free_cpumask_var(mdev->cpu_mask);
3537out_no_cpumask:
3538 kfree(mdev);
3539 return NULL;
3540}
3541
3542/* counterpart of drbd_new_device.
3543 * last part of drbd_delete_device. */
3544void drbd_free_mdev(struct drbd_conf *mdev)
3545{
3546 kfree(mdev->current_epoch);
3547 kfree(mdev->app_reads_hash);
3548 tl_cleanup(mdev);
3549 if (mdev->bitmap) /* should no longer be there. */
3550 drbd_bm_cleanup(mdev);
3551 __free_page(mdev->md_io_page);
3552 put_disk(mdev->vdisk);
3553 blk_cleanup_queue(mdev->rq_queue);
3554 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003555 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003556 kfree(mdev);
3557}
3558
3559
3560int __init drbd_init(void)
3561{
3562 int err;
3563
3564 if (sizeof(struct p_handshake) != 80) {
3565 printk(KERN_ERR
3566 "drbd: never change the size or layout "
3567 "of the HandShake packet.\n");
3568 return -EINVAL;
3569 }
3570
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003571 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003572 printk(KERN_ERR
3573 "drbd: invalid minor_count (%d)\n", minor_count);
3574#ifdef MODULE
3575 return -EINVAL;
3576#else
3577 minor_count = 8;
3578#endif
3579 }
3580
3581 err = drbd_nl_init();
3582 if (err)
3583 return err;
3584
3585 err = register_blkdev(DRBD_MAJOR, "drbd");
3586 if (err) {
3587 printk(KERN_ERR
3588 "drbd: unable to register block device major %d\n",
3589 DRBD_MAJOR);
3590 return err;
3591 }
3592
3593 register_reboot_notifier(&drbd_notifier);
3594
3595 /*
3596 * allocate all necessary structs
3597 */
3598 err = -ENOMEM;
3599
3600 init_waitqueue_head(&drbd_pp_wait);
3601
3602 drbd_proc = NULL; /* play safe for drbd_cleanup */
3603 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3604 GFP_KERNEL);
3605 if (!minor_table)
3606 goto Enomem;
3607
3608 err = drbd_create_mempools();
3609 if (err)
3610 goto Enomem;
3611
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003612 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003613 if (!drbd_proc) {
3614 printk(KERN_ERR "drbd: unable to register proc file\n");
3615 goto Enomem;
3616 }
3617
3618 rwlock_init(&global_state_lock);
3619
3620 printk(KERN_INFO "drbd: initialized. "
3621 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3622 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3623 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3624 printk(KERN_INFO "drbd: registered as block device major %d\n",
3625 DRBD_MAJOR);
3626 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3627
3628 return 0; /* Success! */
3629
3630Enomem:
3631 drbd_cleanup();
3632 if (err == -ENOMEM)
3633 /* currently always the case */
3634 printk(KERN_ERR "drbd: ran out of memory\n");
3635 else
3636 printk(KERN_ERR "drbd: initialization failure\n");
3637 return err;
3638}
3639
3640void drbd_free_bc(struct drbd_backing_dev *ldev)
3641{
3642 if (ldev == NULL)
3643 return;
3644
Tejun Heoe525fd82010-11-13 11:55:17 +01003645 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3646 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003647
3648 kfree(ldev);
3649}
3650
3651void drbd_free_sock(struct drbd_conf *mdev)
3652{
3653 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003654 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003655 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3656 sock_release(mdev->data.socket);
3657 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003658 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003659 }
3660 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003661 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003662 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3663 sock_release(mdev->meta.socket);
3664 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003665 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003666 }
3667}
3668
3669
3670void drbd_free_resources(struct drbd_conf *mdev)
3671{
3672 crypto_free_hash(mdev->csums_tfm);
3673 mdev->csums_tfm = NULL;
3674 crypto_free_hash(mdev->verify_tfm);
3675 mdev->verify_tfm = NULL;
3676 crypto_free_hash(mdev->cram_hmac_tfm);
3677 mdev->cram_hmac_tfm = NULL;
3678 crypto_free_hash(mdev->integrity_w_tfm);
3679 mdev->integrity_w_tfm = NULL;
3680 crypto_free_hash(mdev->integrity_r_tfm);
3681 mdev->integrity_r_tfm = NULL;
3682
3683 drbd_free_sock(mdev);
3684
3685 __no_warn(local,
3686 drbd_free_bc(mdev->ldev);
3687 mdev->ldev = NULL;);
3688}
3689
3690/* meta data management */
3691
3692struct meta_data_on_disk {
3693 u64 la_size; /* last agreed size. */
3694 u64 uuid[UI_SIZE]; /* UUIDs. */
3695 u64 device_uuid;
3696 u64 reserved_u64_1;
3697 u32 flags; /* MDF */
3698 u32 magic;
3699 u32 md_size_sect;
3700 u32 al_offset; /* offset to this block */
3701 u32 al_nr_extents; /* important for restoring the AL */
3702 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3703 u32 bm_offset; /* offset to the bitmap, from here */
3704 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003705 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3706 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003707
3708} __packed;
3709
3710/**
3711 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3712 * @mdev: DRBD device.
3713 */
3714void drbd_md_sync(struct drbd_conf *mdev)
3715{
3716 struct meta_data_on_disk *buffer;
3717 sector_t sector;
3718 int i;
3719
Lars Ellenbergee15b032010-09-03 10:00:09 +02003720 del_timer(&mdev->md_sync_timer);
3721 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003722 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3723 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003724
3725	/* We use D_FAILED here and not D_ATTACHING because we try to write
3726	 * metadata even if we detach due to a disk failure! */
3727 if (!get_ldev_if_state(mdev, D_FAILED))
3728 return;
3729
Philipp Reisnere1711732011-06-27 11:51:46 +02003730 buffer = drbd_md_get_buffer(mdev);
3731 if (!buffer)
3732 goto out;
3733
Philipp Reisnerb411b362009-09-25 16:07:19 -07003734 memset(buffer, 0, 512);
3735
3736 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3737 for (i = UI_CURRENT; i < UI_SIZE; i++)
3738 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3739 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3740 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3741
3742 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3743 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3744 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3745 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3746 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3747
3748 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003749 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003750
3751 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3752 sector = mdev->ldev->md.md_offset;
3753
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003754 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003755 /* this was a try anyways ... */
3756 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003757 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003758 }
3759
3760 /* Update mdev->ldev->md.la_size_sect,
3761 * since we updated it on metadata. */
3762 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3763
Philipp Reisnere1711732011-06-27 11:51:46 +02003764 drbd_md_put_buffer(mdev);
3765out:
Philipp Reisnerb411b362009-09-25 16:07:19 -07003766 put_ldev(mdev);
3767}
3768
3769/**
3770 * drbd_md_read() - Reads in the meta data super block
3771 * @mdev: DRBD device.
3772 * @bdev: Device from which the meta data should be read in.
3773 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003774 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003775 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3776 */
3777int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3778{
3779 struct meta_data_on_disk *buffer;
3780 int i, rv = NO_ERROR;
3781
3782 if (!get_ldev_if_state(mdev, D_ATTACHING))
3783 return ERR_IO_MD_DISK;
3784
Philipp Reisnere1711732011-06-27 11:51:46 +02003785 buffer = drbd_md_get_buffer(mdev);
3786 if (!buffer)
3787 goto out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003788
3789 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003790 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003791 called BEFORE disk is attached */
3792 dev_err(DEV, "Error while reading metadata.\n");
3793 rv = ERR_IO_MD_DISK;
3794 goto err;
3795 }
3796
3797 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3798 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3799 rv = ERR_MD_INVALID;
3800 goto err;
3801 }
3802 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3803 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3804 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3805 rv = ERR_MD_INVALID;
3806 goto err;
3807 }
3808 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3809 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3810 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3811 rv = ERR_MD_INVALID;
3812 goto err;
3813 }
3814 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3815 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3816 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3817 rv = ERR_MD_INVALID;
3818 goto err;
3819 }
3820
3821 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3822 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3823 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3824 rv = ERR_MD_INVALID;
3825 goto err;
3826 }
3827
3828 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3829 for (i = UI_CURRENT; i < UI_SIZE; i++)
3830 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3831 bdev->md.flags = be32_to_cpu(buffer->flags);
3832 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3833 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3834
Philipp Reisner99432fc2011-05-20 16:39:13 +02003835 spin_lock_irq(&mdev->req_lock);
3836 if (mdev->state.conn < C_CONNECTED) {
3837 int peer;
3838 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3839 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3840 mdev->peer_max_bio_size = peer;
3841 }
3842 spin_unlock_irq(&mdev->req_lock);
3843
Philipp Reisnerb411b362009-09-25 16:07:19 -07003844 if (mdev->sync_conf.al_extents < 7)
3845 mdev->sync_conf.al_extents = 127;
3846
3847 err:
Philipp Reisnere1711732011-06-27 11:51:46 +02003848 drbd_md_put_buffer(mdev);
3849 out:
Philipp Reisnerb411b362009-09-25 16:07:19 -07003850 put_ldev(mdev);
3851
3852 return rv;
3853}
3854
3855/**
3856 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3857 * @mdev: DRBD device.
3858 *
3859 * Call this function if you change anything that should be written to
3860 * the meta-data super block. This function sets MD_DIRTY, and starts a
3861 * timer that ensures that within five seconds you have to call drbd_md_sync().
3862 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003863#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003864void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3865{
3866 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3867 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3868 mdev->last_md_mark_dirty.line = line;
3869 mdev->last_md_mark_dirty.func = func;
3870 }
3871}
3872#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003873void drbd_md_mark_dirty(struct drbd_conf *mdev)
3874{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003875 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003876 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003877}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003878#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07003879
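/*
 * Illustrative sketch, not part of the driver: the intended MD_DIRTY usage
 * pattern.  A change to on-disk meta data state goes through
 * drbd_md_mark_dirty() (here via drbd_md_set_flag()); the caller may sync
 * explicitly, otherwise the md_sync_timer armed above forces drbd_md_sync()
 * through w_md_sync() within a few seconds.
 */
#if 0
static void example_update_md(struct drbd_conf *mdev)
{
	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		drbd_md_set_flag(mdev, MDF_FULL_SYNC);	/* sets MD_DIRTY, arms the timer */
		drbd_md_sync(mdev);			/* write the super block out now */
		put_ldev(mdev);
	}
}
#endif
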
3880static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3881{
3882 int i;
3883
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003884 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003885 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003886}
3887
3888void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3889{
3890 if (idx == UI_CURRENT) {
3891 if (mdev->state.role == R_PRIMARY)
3892 val |= 1;
3893 else
3894 val &= ~((u64)1);
3895
3896 drbd_set_ed_uuid(mdev, val);
3897 }
3898
3899 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003900 drbd_md_mark_dirty(mdev);
3901}
3902
3903
3904void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3905{
3906 if (mdev->ldev->md.uuid[idx]) {
3907 drbd_uuid_move_history(mdev);
3908 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003909 }
3910 _drbd_uuid_set(mdev, idx, val);
3911}
3912
3913/**
3914 * drbd_uuid_new_current() - Creates a new current UUID
3915 * @mdev: DRBD device.
3916 *
3917 * Creates a new current UUID, and rotates the old current UUID into
3918 * the bitmap slot. Causes an incremental resync upon next connect.
3919 */
3920void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3921{
3922 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003923 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003924
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003925 if (bm_uuid)
3926 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3927
Philipp Reisnerb411b362009-09-25 16:07:19 -07003928 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003929
3930 get_random_bytes(&val, sizeof(u64));
3931 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003932 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003933 /* get it to stable storage _now_ */
3934 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003935}
3936
3937void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3938{
3939 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3940 return;
3941
3942 if (val == 0) {
3943 drbd_uuid_move_history(mdev);
3944 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3945 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003946 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003947 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3948 if (bm_uuid)
3949 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003950
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003951 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003952 }
3953 drbd_md_mark_dirty(mdev);
3954}
3955
3956/**
3957 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3958 * @mdev: DRBD device.
3959 *
3960 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3961 */
3962int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3963{
3964 int rv = -EIO;
3965
3966 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3967 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3968 drbd_md_sync(mdev);
3969 drbd_bm_set_all(mdev);
3970
3971 rv = drbd_bm_write(mdev);
3972
3973 if (!rv) {
3974 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3975 drbd_md_sync(mdev);
3976 }
3977
3978 put_ldev(mdev);
3979 }
3980
3981 return rv;
3982}
3983
3984/**
3985 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3986 * @mdev: DRBD device.
3987 *
3988 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3989 */
3990int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3991{
3992 int rv = -EIO;
3993
Philipp Reisner07782862010-08-31 12:00:50 +02003994 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003995 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3996 drbd_bm_clear_all(mdev);
3997 rv = drbd_bm_write(mdev);
3998 put_ldev(mdev);
3999 }
4000
4001 return rv;
4002}
4003
4004static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4005{
4006 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01004007 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004008
4009 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4010
Lars Ellenberg02851e92010-12-16 14:47:39 +01004011 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004012 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01004013 rv = work->io_fn(mdev);
4014 drbd_bm_unlock(mdev);
4015 put_ldev(mdev);
4016 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004017
4018 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01004019 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07004020 wake_up(&mdev->misc_wait);
4021
4022 if (work->done)
4023 work->done(mdev, rv);
4024
4025 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4026 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004027 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004028
4029 return 1;
4030}
4031
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004032void drbd_ldev_destroy(struct drbd_conf *mdev)
4033{
4034 lc_destroy(mdev->resync);
4035 mdev->resync = NULL;
4036 lc_destroy(mdev->act_log);
4037 mdev->act_log = NULL;
4038 __no_warn(local,
4039 drbd_free_bc(mdev->ldev);
4040 mdev->ldev = NULL;);
4041
4042 if (mdev->md_io_tmpp) {
4043 __free_page(mdev->md_io_tmpp);
4044 mdev->md_io_tmpp = NULL;
4045 }
4046 clear_bit(GO_DISKLESS, &mdev->flags);
4047}
4048
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004049static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4050{
4051 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02004052 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4053 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004054 * the protected members anymore, though, so once put_ldev reaches zero
4055 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004056 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004057 return 1;
4058}
4059
4060void drbd_go_diskless(struct drbd_conf *mdev)
4061{
4062 D_ASSERT(mdev->state.disk == D_FAILED);
4063 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02004064 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004065}
4066
Philipp Reisnerb411b362009-09-25 16:07:19 -07004067/**
4068 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4069 * @mdev: DRBD device.
4070 * @io_fn: IO callback to be called when bitmap IO is possible
4071 * @done: callback to be called after the bitmap IO was performed
4072 * @why: Descriptive text of the reason for doing the IO
4073 *
4074	 * While IO on the bitmap happens we freeze application IO, thus ensuring
4075	 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4076 * called from worker context. It MUST NOT be used while a previous such
4077 * work is still pending!
4078 */
4079void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4080 int (*io_fn)(struct drbd_conf *),
4081 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004082 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004083{
4084 D_ASSERT(current == mdev->worker.task);
4085
4086 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4087 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4088 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4089 if (mdev->bm_io_work.why)
4090 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4091 why, mdev->bm_io_work.why);
4092
4093 mdev->bm_io_work.io_fn = io_fn;
4094 mdev->bm_io_work.done = done;
4095 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004096 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004097
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004098 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004099 set_bit(BITMAP_IO, &mdev->flags);
4100 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004101 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004102 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004103 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004104 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004105}
4106
4107/**
4108 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4109 * @mdev: DRBD device.
4110 * @io_fn: IO callback to be called when bitmap IO is possible
4111 * @why: Descriptive text of the reason for doing the IO
4112 *
4113	 * Freezes application IO while the actual IO operation runs. This
4114	 * function MAY NOT be called from worker context.
4115 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004116int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4117 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004118{
4119 int rv;
4120
4121 D_ASSERT(current != mdev->worker.task);
4122
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004123 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4124 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004125
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004126 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004127 rv = io_fn(mdev);
4128 drbd_bm_unlock(mdev);
4129
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004130 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4131 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004132
4133 return rv;
4134}
4135
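/*
 * Illustrative sketch, not part of the driver: calling the synchronous
 * variant from non-worker context with one of the io_fn helpers defined
 * above.  The flag choice is only an example.
 */
#if 0
static int example_full_sync_bitmap(struct drbd_conf *mdev)
{
	/* With BM_LOCKED_SET_ALLOWED set, drbd_bitmap_io() skips the
	 * drbd_suspend_io()/drbd_resume_io() bracket and only takes the
	 * bitmap lock around drbd_bmio_set_n_write(). */
	return drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
			      "example full sync", BM_LOCKED_SET_ALLOWED);
}
#endif
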
4136void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4137{
4138 if ((mdev->ldev->md.flags & flag) != flag) {
4139 drbd_md_mark_dirty(mdev);
4140 mdev->ldev->md.flags |= flag;
4141 }
4142}
4143
4144void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4145{
4146 if ((mdev->ldev->md.flags & flag) != 0) {
4147 drbd_md_mark_dirty(mdev);
4148 mdev->ldev->md.flags &= ~flag;
4149 }
4150}
4151int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4152{
4153 return (bdev->md.flags & flag) != 0;
4154}
4155
4156static void md_sync_timer_fn(unsigned long data)
4157{
4158 struct drbd_conf *mdev = (struct drbd_conf *) data;
4159
4160 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4161}
4162
4163static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4164{
4165 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004166#ifdef DEBUG
4167 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4168 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4169#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004170 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004171 return 1;
4172}
4173
4174#ifdef CONFIG_DRBD_FAULT_INJECTION
4175/* Fault insertion support including random number generator shamelessly
4176 * stolen from kernel/rcutorture.c */
4177struct fault_random_state {
4178 unsigned long state;
4179 unsigned long count;
4180};
4181
4182#define FAULT_RANDOM_MULT 39916801 /* prime */
4183#define FAULT_RANDOM_ADD 479001701 /* prime */
4184#define FAULT_RANDOM_REFRESH 10000
4185
4186/*
4187 * Crude but fast random-number generator. Uses a linear congruential
4188 * generator, with occasional help from get_random_bytes().
4189 */
4190static unsigned long
4191_drbd_fault_random(struct fault_random_state *rsp)
4192{
4193 long refresh;
4194
Roel Kluin49829ea2009-12-15 22:55:44 +01004195 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004196 get_random_bytes(&refresh, sizeof(refresh));
4197 rsp->state += refresh;
4198 rsp->count = FAULT_RANDOM_REFRESH;
4199 }
4200 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4201 return swahw32(rsp->state);
4202}
4203
4204static char *
4205_drbd_fault_str(unsigned int type) {
4206 static char *_faults[] = {
4207 [DRBD_FAULT_MD_WR] = "Meta-data write",
4208 [DRBD_FAULT_MD_RD] = "Meta-data read",
4209 [DRBD_FAULT_RS_WR] = "Resync write",
4210 [DRBD_FAULT_RS_RD] = "Resync read",
4211 [DRBD_FAULT_DT_WR] = "Data write",
4212 [DRBD_FAULT_DT_RD] = "Data read",
4213 [DRBD_FAULT_DT_RA] = "Data read ahead",
4214 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004215 [DRBD_FAULT_AL_EE] = "EE allocation",
4216 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004217 };
4218
4219 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4220}
4221
4222unsigned int
4223_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4224{
4225 static struct fault_random_state rrs = {0, 0};
4226
4227 unsigned int ret = (
4228 (fault_devs == 0 ||
4229 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4230 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4231
4232 if (ret) {
4233 fault_count++;
4234
Lars Ellenberg73835062010-05-27 11:51:56 +02004235 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004236 dev_warn(DEV, "***Simulating %s failure\n",
4237 _drbd_fault_str(type));
4238 }
4239
4240 return ret;
4241}
4242#endif
4243
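/*
 * Illustrative sketch, not part of the driver: how a fault injection check
 * is typically placed in an IO path.  fault_rate is a percentage, fault_devs
 * a bitmask of minor numbers (0 means all devices).  Real call sites usually
 * go through a small wrapper; the direct call and the bio handling below are
 * only for illustration.
 */
#if 0
static void example_md_write(struct drbd_conf *mdev, struct bio *bio)
{
	if (_drbd_insert_fault(mdev, DRBD_FAULT_MD_WR)) {
		bio_endio(bio, -EIO);	/* pretend the meta data write failed */
		return;
	}
	submit_bio(WRITE, bio);
}
#endif
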
4244const char *drbd_buildtag(void)
4245{
4246	/* DRBD built from external sources has a reference here to the
4247	   git hash of the source code. */
4248
4249 static char buildtag[38] = "\0uilt-in";
4250
4251 if (buildtag[0] == 0) {
4252#ifdef CONFIG_MODULES
4253 if (THIS_MODULE != NULL)
4254 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4255 else
4256#endif
4257 buildtag[0] = 'b';
4258 }
4259
4260 return buildtag;
4261}
4262
4263module_init(drbd_init)
4264module_exit(drbd_cleanup)
4265
Philipp Reisnerb411b362009-09-25 16:07:19 -07004266EXPORT_SYMBOL(drbd_conn_str);
4267EXPORT_SYMBOL(drbd_role_str);
4268EXPORT_SYMBOL(drbd_disk_str);
4269EXPORT_SYMBOL(drbd_set_st_err_str);