[BTRFS][UBTRFS][SHELLBTRFS] Upgrade to 1.7.2
[reactos.git] / drivers / filesystems / btrfs / balance.c
1 /* Copyright (c) Mark Harmstone 2016-17
2 *
3 * This file is part of WinBtrfs.
4 *
5 * WinBtrfs is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public Licence as published by
7 * the Free Software Foundation, either version 3 of the Licence, or
8 * (at your option) any later version.
9 *
10 * WinBtrfs is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public Licence for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public Licence
16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include "crc32c.h"
21 #include <ntddstor.h>
22
23 typedef struct {
24 uint64_t address;
25 uint64_t new_address;
26 tree_header* data;
27 EXTENT_ITEM* ei;
28 tree* t;
29 bool system;
30 LIST_ENTRY refs;
31 LIST_ENTRY list_entry;
32 } metadata_reloc;
33
34 typedef struct {
35 uint8_t type;
36 uint64_t hash;
37
38 union {
39 TREE_BLOCK_REF tbr;
40 SHARED_BLOCK_REF sbr;
41 };
42
43 metadata_reloc* parent;
44 bool top;
45 LIST_ENTRY list_entry;
46 } metadata_reloc_ref;
47
48 typedef struct {
49 uint64_t address;
50 uint64_t size;
51 uint64_t new_address;
52 chunk* newchunk;
53 EXTENT_ITEM* ei;
54 LIST_ENTRY refs;
55 LIST_ENTRY list_entry;
56 } data_reloc;
57
58 typedef struct {
59 uint8_t type;
60 uint64_t hash;
61
62 union {
63 EXTENT_DATA_REF edr;
64 SHARED_DATA_REF sdr;
65 };
66
67 metadata_reloc* parent;
68 LIST_ENTRY list_entry;
69 } data_reloc_ref;
70
71 #ifndef _MSC_VER // not in mingw yet
72 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
73 #endif
74
75 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
76
77 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
78 bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
79 NTSTATUS Status;
80 metadata_reloc* mr;
81 EXTENT_ITEM* ei;
82 uint16_t len;
83 uint64_t inline_rc;
84 uint8_t* ptr;
85
86 mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
87 if (!mr) {
88 ERR("out of memory\n");
89 return STATUS_INSUFFICIENT_RESOURCES;
90 }
91
92 mr->address = tp->item->key.obj_id;
93 mr->data = NULL;
94 mr->ei = (EXTENT_ITEM*)tp->item->data;
95 mr->system = false;
96 InitializeListHead(&mr->refs);
97
98 Status = delete_tree_item(Vcb, tp);
99 if (!NT_SUCCESS(Status)) {
100 ERR("delete_tree_item returned %08lx\n", Status);
101 ExFreePool(mr);
102 return Status;
103 }
104
105 if (!c)
106 c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
107
108 if (c) {
109 acquire_chunk_lock(c, Vcb);
110
111 c->used -= Vcb->superblock.node_size;
112
113 space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
114
115 release_chunk_lock(c, Vcb);
116 }
117
118 ei = (EXTENT_ITEM*)tp->item->data;
119 inline_rc = 0;
120
121 len = tp->item->size - sizeof(EXTENT_ITEM);
122 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
123 if (!skinny) {
124 len -= sizeof(EXTENT_ITEM2);
125 ptr += sizeof(EXTENT_ITEM2);
126 }
127
128 while (len > 0) {
129 uint8_t secttype = *ptr;
130 uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
131 metadata_reloc_ref* ref;
132
133 len--;
134
135 if (sectlen > len) {
136 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
137 return STATUS_INTERNAL_ERROR;
138 }
139
140 if (sectlen == 0) {
141 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
142 return STATUS_INTERNAL_ERROR;
143 }
144
145 ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
146 if (!ref) {
147 ERR("out of memory\n");
148 return STATUS_INSUFFICIENT_RESOURCES;
149 }
150
151 if (secttype == TYPE_TREE_BLOCK_REF) {
152 ref->type = TYPE_TREE_BLOCK_REF;
153 RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
154 inline_rc++;
155 } else if (secttype == TYPE_SHARED_BLOCK_REF) {
156 ref->type = TYPE_SHARED_BLOCK_REF;
157 RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
158 inline_rc++;
159 } else {
160 ERR("unexpected tree type %x\n", secttype);
161 ExFreePool(ref);
162 return STATUS_INTERNAL_ERROR;
163 }
164
165 ref->parent = NULL;
166 ref->top = false;
167 InsertTailList(&mr->refs, &ref->list_entry);
168
169 len -= sectlen;
170 ptr += sizeof(uint8_t) + sectlen;
171 }
172
173 if (inline_rc < ei->refcount) { // look for non-inline entries
174 traverse_ptr tp2 = *tp, next_tp;
175
176 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
177 tp2 = next_tp;
178
179 if (tp2.item->key.obj_id == tp->item->key.obj_id) {
180 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
181 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
182 if (!ref) {
183 ERR("out of memory\n");
184 return STATUS_INSUFFICIENT_RESOURCES;
185 }
186
187 ref->type = TYPE_TREE_BLOCK_REF;
188 ref->tbr.offset = tp2.item->key.offset;
189 ref->parent = NULL;
190 ref->top = false;
191 InsertTailList(&mr->refs, &ref->list_entry);
192
193 Status = delete_tree_item(Vcb, &tp2);
194 if (!NT_SUCCESS(Status)) {
195 ERR("delete_tree_item returned %08lx\n", Status);
196 return Status;
197 }
198 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
199 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
200 if (!ref) {
201 ERR("out of memory\n");
202 return STATUS_INSUFFICIENT_RESOURCES;
203 }
204
205 ref->type = TYPE_SHARED_BLOCK_REF;
206 ref->sbr.offset = tp2.item->key.offset;
207 ref->parent = NULL;
208 ref->top = false;
209 InsertTailList(&mr->refs, &ref->list_entry);
210
211 Status = delete_tree_item(Vcb, &tp2);
212 if (!NT_SUCCESS(Status)) {
213 ERR("delete_tree_item returned %08lx\n", Status);
214 return Status;
215 }
216 }
217 } else
218 break;
219 }
220 }
221
222 InsertTailList(items, &mr->list_entry);
223
224 if (mr2)
225 *mr2 = mr;
226
227 return STATUS_SUCCESS;
228 }
229
230 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
231 uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
232 LIST_ENTRY* le;
233 KEY searchkey;
234 traverse_ptr tp;
235 bool skinny = false;
236 NTSTATUS Status;
237
238 le = items->Flink;
239 while (le != items) {
240 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
241
242 if (mr->address == address) {
243 *mr2 = mr;
244 return STATUS_SUCCESS;
245 }
246
247 le = le->Flink;
248 }
249
250 searchkey.obj_id = address;
251 searchkey.obj_type = TYPE_METADATA_ITEM;
252 searchkey.offset = 0xffffffffffffffff;
253
254 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
255 if (!NT_SUCCESS(Status)) {
256 ERR("find_item returned %08lx\n", Status);
257 return Status;
258 }
259
260 if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
261 skinny = true;
262 else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
263 tp.item->size >= sizeof(EXTENT_ITEM)) {
264 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
265
266 if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
267 ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
268 return STATUS_INTERNAL_ERROR;
269 }
270 } else {
271 ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
272 return STATUS_INTERNAL_ERROR;
273 }
274
275 Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
276 if (!NT_SUCCESS(Status)) {
277 ERR("add_metadata_reloc returned %08lx\n", Status);
278 return Status;
279 }
280
281 return STATUS_SUCCESS;
282 }
283
284 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
285 LIST_ENTRY newlist, *le;
286
287 if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
288 return;
289
290 // insertion sort
291
292 InitializeListHead(&newlist);
293
294 while (!IsListEmpty(&mr->refs)) {
295 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
296 bool inserted = false;
297
298 if (ref->type == TYPE_TREE_BLOCK_REF)
299 ref->hash = ref->tbr.offset;
300 else if (ref->type == TYPE_SHARED_BLOCK_REF)
301 ref->hash = ref->parent->new_address;
302
303 le = newlist.Flink;
304 while (le != &newlist) {
305 metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
306
307 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
308 InsertHeadList(le->Blink, &ref->list_entry);
309 inserted = true;
310 break;
311 }
312
313 le = le->Flink;
314 }
315
316 if (!inserted)
317 InsertTailList(&newlist, &ref->list_entry);
318 }
319
320 newlist.Flink->Blink = &mr->refs;
321 newlist.Blink->Flink = &mr->refs;
322 mr->refs.Flink = newlist.Flink;
323 mr->refs.Blink = newlist.Blink;
324 }
325
326 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
327 NTSTATUS Status;
328 LIST_ENTRY* le;
329 uint64_t rc = 0;
330 uint16_t inline_len;
331 bool all_inline = true;
332 metadata_reloc_ref* first_noninline = NULL;
333 EXTENT_ITEM* ei;
334 uint8_t* ptr;
335
336 inline_len = sizeof(EXTENT_ITEM);
337 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
338 inline_len += sizeof(EXTENT_ITEM2);
339
340 sort_metadata_reloc_refs(mr);
341
342 le = mr->refs.Flink;
343 while (le != &mr->refs) {
344 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
345 uint16_t extlen = 0;
346
347 rc++;
348
349 if (ref->type == TYPE_TREE_BLOCK_REF)
350 extlen += sizeof(TREE_BLOCK_REF);
351 else if (ref->type == TYPE_SHARED_BLOCK_REF)
352 extlen += sizeof(SHARED_BLOCK_REF);
353
354 if (all_inline) {
355 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
356 all_inline = false;
357 first_noninline = ref;
358 } else
359 inline_len += extlen + 1;
360 }
361
362 le = le->Flink;
363 }
364
365 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
366 if (!ei) {
367 ERR("out of memory\n");
368 return STATUS_INSUFFICIENT_RESOURCES;
369 }
370
371 ei->refcount = rc;
372 ei->generation = mr->ei->generation;
373 ei->flags = mr->ei->flags;
374 ptr = (uint8_t*)&ei[1];
375
376 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
377 EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
378
379 ei2->firstitem = *(KEY*)&mr->data[1];
380 ei2->level = mr->data->level;
381
382 ptr += sizeof(EXTENT_ITEM2);
383 }
384
385 le = mr->refs.Flink;
386 while (le != &mr->refs) {
387 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
388
389 if (ref == first_noninline)
390 break;
391
392 *ptr = ref->type;
393 ptr++;
394
395 if (ref->type == TYPE_TREE_BLOCK_REF) {
396 TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
397
398 tbr->offset = ref->tbr.offset;
399
400 ptr += sizeof(TREE_BLOCK_REF);
401 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
402 SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
403
404 sbr->offset = ref->parent->new_address;
405
406 ptr += sizeof(SHARED_BLOCK_REF);
407 }
408
409 le = le->Flink;
410 }
411
412 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
413 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
414 else
415 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
416
417 if (!NT_SUCCESS(Status)) {
418 ERR("insert_tree_item returned %08lx\n", Status);
419 ExFreePool(ei);
420 return Status;
421 }
422
423 if (!all_inline) {
424 le = &first_noninline->list_entry;
425
426 while (le != &mr->refs) {
427 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
428
429 if (ref->type == TYPE_TREE_BLOCK_REF) {
430 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
431 if (!NT_SUCCESS(Status)) {
432 ERR("insert_tree_item returned %08lx\n", Status);
433 return Status;
434 }
435 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
436 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
437 if (!NT_SUCCESS(Status)) {
438 ERR("insert_tree_item returned %08lx\n", Status);
439 return Status;
440 }
441 }
442
443 le = le->Flink;
444 }
445 }
446
447 if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
448 if (mr->data->level > 0) {
449 uint16_t i;
450 internal_node* in = (internal_node*)&mr->data[1];
451
452 for (i = 0; i < mr->data->num_items; i++) {
453 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
454
455 if (sbrrc > 0) {
456 SHARED_BLOCK_REF sbr;
457
458 sbr.offset = mr->new_address;
459
460 Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
461 if (!NT_SUCCESS(Status)) {
462 ERR("increase_extent_refcount returned %08lx\n", Status);
463 return Status;
464 }
465
466 sbr.offset = mr->address;
467
468 Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
469 sbr.offset, false, NULL);
470 if (!NT_SUCCESS(Status)) {
471 ERR("decrease_extent_refcount returned %08lx\n", Status);
472 return Status;
473 }
474 }
475 }
476 } else {
477 uint16_t i;
478 leaf_node* ln = (leaf_node*)&mr->data[1];
479
480 for (i = 0; i < mr->data->num_items; i++) {
481 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
482 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
483
484 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
485 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
486
487 if (ed2->size > 0) { // not sparse
488 uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
489
490 if (sdrrc > 0) {
491 SHARED_DATA_REF sdr;
492 chunk* c;
493
494 sdr.offset = mr->new_address;
495 sdr.count = sdrrc;
496
497 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
498 if (!NT_SUCCESS(Status)) {
499 ERR("increase_extent_refcount returned %08lx\n", Status);
500 return Status;
501 }
502
503 sdr.offset = mr->address;
504
505 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
506 sdr.offset, false, NULL);
507 if (!NT_SUCCESS(Status)) {
508 ERR("decrease_extent_refcount returned %08lx\n", Status);
509 return Status;
510 }
511
512 c = get_chunk_from_address(Vcb, ed2->address);
513
514 if (c) {
515 // check changed_extents
516
517 ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
518
519 le = c->changed_extents.Flink;
520
521 while (le != &c->changed_extents) {
522 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
523
524 if (ce->address == ed2->address) {
525 LIST_ENTRY* le2;
526
527 le2 = ce->refs.Flink;
528 while (le2 != &ce->refs) {
529 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
530
531 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
532 cer->sdr.offset = mr->new_address;
533 break;
534 }
535
536 le2 = le2->Flink;
537 }
538
539 le2 = ce->old_refs.Flink;
540 while (le2 != &ce->old_refs) {
541 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
542
543 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
544 cer->sdr.offset = mr->new_address;
545 break;
546 }
547
548 le2 = le2->Flink;
549 }
550
551 break;
552 }
553
554 le = le->Flink;
555 }
556
557 ExReleaseResourceLite(&c->changed_extents_lock);
558 }
559 }
560 }
561 }
562 }
563 }
564 }
565 }
566
567 return STATUS_SUCCESS;
568 }
569
570 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
571 LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
572 LIST_ENTRY tree_writes, *le;
573 NTSTATUS Status;
574 traverse_ptr tp;
575 uint8_t level, max_level = 0;
576 chunk* newchunk = NULL;
577
578 InitializeListHead(&tree_writes);
579
580 le = items->Flink;
581 while (le != items) {
582 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
583 LIST_ENTRY* le2;
584 chunk* pc;
585
586 mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
587 if (!mr->data) {
588 ERR("out of memory\n");
589 return STATUS_INSUFFICIENT_RESOURCES;
590 }
591
592 Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
593 c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
594 if (!NT_SUCCESS(Status)) {
595 ERR("read_data returned %08lx\n", Status);
596 return Status;
597 }
598
599 if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
600 mr->system = true;
601
602 if (data_items && mr->data->level == 0) {
603 le2 = data_items->Flink;
604 while (le2 != data_items) {
605 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
606 leaf_node* ln = (leaf_node*)&mr->data[1];
607 uint16_t i;
608
609 for (i = 0; i < mr->data->num_items; i++) {
610 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
611 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
612
613 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
614 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
615
616 if (ed2->address == dr->address)
617 ed2->address = dr->new_address;
618 }
619 }
620 }
621
622 le2 = le2->Flink;
623 }
624 }
625
626 if (mr->data->level > max_level)
627 max_level = mr->data->level;
628
629 le2 = mr->refs.Flink;
630 while (le2 != &mr->refs) {
631 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
632
633 if (ref->type == TYPE_TREE_BLOCK_REF) {
634 KEY* firstitem;
635 root* r = NULL;
636 LIST_ENTRY* le3;
637 tree* t;
638
639 firstitem = (KEY*)&mr->data[1];
640
641 le3 = Vcb->roots.Flink;
642 while (le3 != &Vcb->roots) {
643 root* r2 = CONTAINING_RECORD(le3, root, list_entry);
644
645 if (r2->id == ref->tbr.offset) {
646 r = r2;
647 break;
648 }
649
650 le3 = le3->Flink;
651 }
652
653 if (!r) {
654 ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
655 return STATUS_INTERNAL_ERROR;
656 }
657
658 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
659 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
660 ERR("find_item_to_level returned %08lx\n", Status);
661 return Status;
662 }
663
664 t = tp.tree;
665 while (t && t->header.level < mr->data->level + 1) {
666 t = t->parent;
667 }
668
669 if (!t)
670 ref->top = true;
671 else {
672 metadata_reloc* mr2;
673
674 Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
675 if (!NT_SUCCESS(Status)) {
676 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
677 return Status;
678 }
679
680 ref->parent = mr2;
681 }
682 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
683 metadata_reloc* mr2;
684
685 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
686 if (!NT_SUCCESS(Status)) {
687 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
688 return Status;
689 }
690
691 ref->parent = mr2;
692 }
693
694 le2 = le2->Flink;
695 }
696
697 le = le->Flink;
698 }
699
700 le = items->Flink;
701 while (le != items) {
702 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
703 LIST_ENTRY* le2;
704 uint32_t hash;
705
706 mr->t = NULL;
707
708 hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
709
710 le2 = Vcb->trees_ptrs[hash >> 24];
711
712 if (le2) {
713 while (le2 != &Vcb->trees_hash) {
714 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
715
716 if (t->header.address == mr->address) {
717 mr->t = t;
718 break;
719 } else if (t->hash > hash)
720 break;
721
722 le2 = le2->Flink;
723 }
724 }
725
726 le = le->Flink;
727 }
728
729 for (level = 0; level <= max_level; level++) {
730 le = items->Flink;
731 while (le != items) {
732 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
733
734 if (mr->data->level == level) {
735 bool done = false;
736 LIST_ENTRY* le2;
737 tree_write* tw;
738 uint64_t flags;
739 tree* t3;
740
741 if (mr->system)
742 flags = Vcb->system_flags;
743 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
744 flags = Vcb->data_flags;
745 else
746 flags = Vcb->metadata_flags;
747
748 if (newchunk) {
749 acquire_chunk_lock(newchunk, Vcb);
750
751 if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
752 newchunk->used += Vcb->superblock.node_size;
753 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
754 done = true;
755 }
756
757 release_chunk_lock(newchunk, Vcb);
758 }
759
760 if (!done) {
761 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
762
763 le2 = Vcb->chunks.Flink;
764 while (le2 != &Vcb->chunks) {
765 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
766
767 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
768 acquire_chunk_lock(c2, Vcb);
769
770 if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
771 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
772 c2->used += Vcb->superblock.node_size;
773 space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback);
774 release_chunk_lock(c2, Vcb);
775 newchunk = c2;
776 done = true;
777 break;
778 }
779 }
780
781 release_chunk_lock(c2, Vcb);
782 }
783
784 le2 = le2->Flink;
785 }
786
787 // allocate new chunk if necessary
788 if (!done) {
789 Status = alloc_chunk(Vcb, flags, &newchunk, false);
790
791 if (!NT_SUCCESS(Status)) {
792 ERR("alloc_chunk returned %08lx\n", Status);
793 ExReleaseResourceLite(&Vcb->chunk_lock);
794 goto end;
795 }
796
797 acquire_chunk_lock(newchunk, Vcb);
798
799 newchunk->balance_num = Vcb->balance.balance_num;
800
801 if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
802 release_chunk_lock(newchunk, Vcb);
803 ExReleaseResourceLite(&Vcb->chunk_lock);
804 ERR("could not find address in new chunk\n");
805 Status = STATUS_DISK_FULL;
806 goto end;
807 } else {
808 newchunk->used += Vcb->superblock.node_size;
809 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
810 }
811
812 release_chunk_lock(newchunk, Vcb);
813 }
814
815 ExReleaseResourceLite(&Vcb->chunk_lock);
816 }
817
818 // update parents
819 le2 = mr->refs.Flink;
820 while (le2 != &mr->refs) {
821 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
822
823 if (ref->parent) {
824 uint16_t i;
825 internal_node* in = (internal_node*)&ref->parent->data[1];
826
827 for (i = 0; i < ref->parent->data->num_items; i++) {
828 if (in[i].address == mr->address) {
829 in[i].address = mr->new_address;
830 break;
831 }
832 }
833
834 if (ref->parent->t) {
835 LIST_ENTRY* le3;
836
837 le3 = ref->parent->t->itemlist.Flink;
838 while (le3 != &ref->parent->t->itemlist) {
839 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
840
841 if (!td->inserted && td->treeholder.address == mr->address)
842 td->treeholder.address = mr->new_address;
843
844 le3 = le3->Flink;
845 }
846 }
847 } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
848 LIST_ENTRY* le3;
849 root* r = NULL;
850
851 // alter ROOT_ITEM
852
853 le3 = Vcb->roots.Flink;
854 while (le3 != &Vcb->roots) {
855 root* r2 = CONTAINING_RECORD(le3, root, list_entry);
856
857 if (r2->id == ref->tbr.offset) {
858 r = r2;
859 break;
860 }
861
862 le3 = le3->Flink;
863 }
864
865 if (r) {
866 r->treeholder.address = mr->new_address;
867
868 if (r == Vcb->root_root)
869 Vcb->superblock.root_tree_addr = mr->new_address;
870 else if (r == Vcb->chunk_root)
871 Vcb->superblock.chunk_tree_addr = mr->new_address;
872 else if (r->root_item.block_number == mr->address) {
873 KEY searchkey;
874 ROOT_ITEM* ri;
875
876 r->root_item.block_number = mr->new_address;
877
878 searchkey.obj_id = r->id;
879 searchkey.obj_type = TYPE_ROOT_ITEM;
880 searchkey.offset = 0xffffffffffffffff;
881
882 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
883 if (!NT_SUCCESS(Status)) {
884 ERR("find_item returned %08lx\n", Status);
885 goto end;
886 }
887
888 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
889 ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
890 Status = STATUS_INTERNAL_ERROR;
891 goto end;
892 }
893
894 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
895 if (!ri) {
896 ERR("out of memory\n");
897 Status = STATUS_INSUFFICIENT_RESOURCES;
898 goto end;
899 }
900
901 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
902
903 Status = delete_tree_item(Vcb, &tp);
904 if (!NT_SUCCESS(Status)) {
905 ERR("delete_tree_item returned %08lx\n", Status);
906 goto end;
907 }
908
909 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
910 if (!NT_SUCCESS(Status)) {
911 ERR("insert_tree_item returned %08lx\n", Status);
912 goto end;
913 }
914 }
915 }
916 }
917
918 le2 = le2->Flink;
919 }
920
921 mr->data->address = mr->new_address;
922
923 t3 = mr->t;
924
925 while (t3) {
926 uint8_t h;
927 bool inserted;
928 tree* t4 = NULL;
929
930 // check if tree loaded more than once
931 if (t3->list_entry.Flink != &Vcb->trees_hash) {
932 tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
933
934 if (nt->header.address == t3->header.address)
935 t4 = nt;
936 }
937
938 t3->header.address = mr->new_address;
939
940 h = t3->hash >> 24;
941
942 if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
943 if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
944 Vcb->trees_ptrs[h] = NULL;
945 else {
946 tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
947
948 if (t2->hash >> 24 == h)
949 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
950 else
951 Vcb->trees_ptrs[h] = NULL;
952 }
953 }
954
955 RemoveEntryList(&t3->list_entry_hash);
956
957 t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
958 h = t3->hash >> 24;
959
960 if (!Vcb->trees_ptrs[h]) {
961 uint8_t h2 = h;
962
963 le2 = Vcb->trees_hash.Flink;
964
965 if (h2 > 0) {
966 h2--;
967 do {
968 if (Vcb->trees_ptrs[h2]) {
969 le2 = Vcb->trees_ptrs[h2];
970 break;
971 }
972
973 h2--;
974 } while (h2 > 0);
975 }
976 } else
977 le2 = Vcb->trees_ptrs[h];
978
979 inserted = false;
980 while (le2 != &Vcb->trees_hash) {
981 tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
982
983 if (t2->hash >= t3->hash) {
984 InsertHeadList(le2->Blink, &t3->list_entry_hash);
985 inserted = true;
986 break;
987 }
988
989 le2 = le2->Flink;
990 }
991
992 if (!inserted)
993 InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
994
995 if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
996 Vcb->trees_ptrs[h] = &t3->list_entry_hash;
997
998 if (data_items && level == 0) {
999 le2 = data_items->Flink;
1000
1001 while (le2 != data_items) {
1002 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1003 LIST_ENTRY* le3 = t3->itemlist.Flink;
1004
1005 while (le3 != &t3->itemlist) {
1006 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1007
1008 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1009 EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1010
1011 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1012 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1013
1014 if (ed2->address == dr->address)
1015 ed2->address = dr->new_address;
1016 }
1017 }
1018
1019 le3 = le3->Flink;
1020 }
1021
1022 le2 = le2->Flink;
1023 }
1024 }
1025
1026 t3 = t4;
1027 }
1028
1029 calc_tree_checksum(Vcb, mr->data);
1030
1031 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1032 if (!tw) {
1033 ERR("out of memory\n");
1034 Status = STATUS_INSUFFICIENT_RESOURCES;
1035 goto end;
1036 }
1037
1038 tw->address = mr->new_address;
1039 tw->length = Vcb->superblock.node_size;
1040 tw->data = (uint8_t*)mr->data;
1041 tw->allocated = false;
1042
1043 if (IsListEmpty(&tree_writes))
1044 InsertTailList(&tree_writes, &tw->list_entry);
1045 else {
1046 bool inserted = false;
1047
1048 le2 = tree_writes.Flink;
1049 while (le2 != &tree_writes) {
1050 tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1051
1052 if (tw2->address > tw->address) {
1053 InsertHeadList(le2->Blink, &tw->list_entry);
1054 inserted = true;
1055 break;
1056 }
1057
1058 le2 = le2->Flink;
1059 }
1060
1061 if (!inserted)
1062 InsertTailList(&tree_writes, &tw->list_entry);
1063 }
1064 }
1065
1066 le = le->Flink;
1067 }
1068 }
1069
1070 Status = do_tree_writes(Vcb, &tree_writes, true);
1071 if (!NT_SUCCESS(Status)) {
1072 ERR("do_tree_writes returned %08lx\n", Status);
1073 goto end;
1074 }
1075
1076 le = items->Flink;
1077 while (le != items) {
1078 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1079
1080 Status = add_metadata_reloc_extent_item(Vcb, mr);
1081 if (!NT_SUCCESS(Status)) {
1082 ERR("add_metadata_reloc_extent_item returned %08lx\n", Status);
1083 goto end;
1084 }
1085
1086 le = le->Flink;
1087 }
1088
1089 Status = STATUS_SUCCESS;
1090
1091 end:
1092 while (!IsListEmpty(&tree_writes)) {
1093 tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1094
1095 if (tw->allocated)
1096 ExFreePool(tw->data);
1097
1098 ExFreePool(tw);
1099 }
1100
1101 return Status;
1102 }
1103
1104 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1105 KEY searchkey;
1106 traverse_ptr tp;
1107 NTSTATUS Status;
1108 bool b;
1109 LIST_ENTRY items, rollback;
1110 uint32_t loaded = 0;
1111
1112 TRACE("chunk %I64x\n", c->offset);
1113
1114 InitializeListHead(&rollback);
1115 InitializeListHead(&items);
1116
1117 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1118
1119 searchkey.obj_id = c->offset;
1120 searchkey.obj_type = TYPE_METADATA_ITEM;
1121 searchkey.offset = 0xffffffffffffffff;
1122
1123 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1124 if (!NT_SUCCESS(Status)) {
1125 ERR("find_item returned %08lx\n", Status);
1126 goto end;
1127 }
1128
1129 do {
1130 traverse_ptr next_tp;
1131
1132 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1133 break;
1134
1135 if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1136 bool tree = false, skinny = false;
1137
1138 if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1139 tree = true;
1140 skinny = true;
1141 } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1142 tp.item->size >= sizeof(EXTENT_ITEM)) {
1143 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1144
1145 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1146 tree = true;
1147 }
1148
1149 if (tree) {
1150 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1151
1152 if (!NT_SUCCESS(Status)) {
1153 ERR("add_metadata_reloc returned %08lx\n", Status);
1154 goto end;
1155 }
1156
1157 loaded++;
1158
1159 if (loaded >= 64) // only do 64 at a time
1160 break;
1161 }
1162 }
1163
1164 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1165
1166 if (b)
1167 tp = next_tp;
1168 } while (b);
1169
1170 if (IsListEmpty(&items)) {
1171 *changed = false;
1172 Status = STATUS_SUCCESS;
1173 goto end;
1174 } else
1175 *changed = true;
1176
1177 Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1178 if (!NT_SUCCESS(Status)) {
1179 ERR("write_metadata_items returned %08lx\n", Status);
1180 goto end;
1181 }
1182
1183 Status = STATUS_SUCCESS;
1184
1185 Vcb->need_write = true;
1186
1187 end:
1188 if (NT_SUCCESS(Status)) {
1189 Status = do_write(Vcb, NULL);
1190 if (!NT_SUCCESS(Status))
1191 ERR("do_write returned %08lx\n", Status);
1192 }
1193
1194 if (NT_SUCCESS(Status))
1195 clear_rollback(&rollback);
1196 else
1197 do_rollback(Vcb, &rollback);
1198
1199 free_trees(Vcb);
1200
1201 ExReleaseResourceLite(&Vcb->tree_lock);
1202
1203 while (!IsListEmpty(&items)) {
1204 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1205
1206 while (!IsListEmpty(&mr->refs)) {
1207 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1208
1209 ExFreePool(ref);
1210 }
1211
1212 if (mr->data)
1213 ExFreePool(mr->data);
1214
1215 ExFreePool(mr);
1216 }
1217
1218 return Status;
1219 }
1220
1221 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1222 data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1223 NTSTATUS Status;
1224 LIST_ENTRY* le;
1225 KEY searchkey;
1226 traverse_ptr tp;
1227 root* r = NULL;
1228 metadata_reloc* mr;
1229 uint64_t last_tree = 0;
1230 data_reloc_ref* ref;
1231
1232 le = Vcb->roots.Flink;
1233 while (le != &Vcb->roots) {
1234 root* r2 = CONTAINING_RECORD(le, root, list_entry);
1235
1236 if (r2->id == edr->root) {
1237 r = r2;
1238 break;
1239 }
1240
1241 le = le->Flink;
1242 }
1243
1244 if (!r) {
1245 ERR("could not find subvol %I64x\n", edr->root);
1246 return STATUS_INTERNAL_ERROR;
1247 }
1248
1249 searchkey.obj_id = edr->objid;
1250 searchkey.obj_type = TYPE_EXTENT_DATA;
1251 searchkey.offset = 0;
1252
1253 Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1254 if (!NT_SUCCESS(Status)) {
1255 ERR("find_item returned %08lx\n", Status);
1256 return Status;
1257 }
1258
1259 if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1260 traverse_ptr tp2;
1261
1262 if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1263 tp = tp2;
1264 else {
1265 ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1266 return STATUS_INTERNAL_ERROR;
1267 }
1268 }
1269
1270 ref = NULL;
1271
1272 while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1273 traverse_ptr tp2;
1274
1275 if (tp.item->size >= sizeof(EXTENT_DATA)) {
1276 EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1277
1278 if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1279 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1280
1281 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1282 if (ref && last_tree == tp.tree->header.address)
1283 ref->edr.count++;
1284 else {
1285 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1286 if (!ref) {
1287 ERR("out of memory\n");
1288 return STATUS_INSUFFICIENT_RESOURCES;
1289 }
1290
1291 ref->type = TYPE_EXTENT_DATA_REF;
1292 RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1293 ref->edr.count = 1;
1294
1295 Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1296 if (!NT_SUCCESS(Status)) {
1297 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1298 ExFreePool(ref);
1299 return Status;
1300 }
1301
1302 last_tree = tp.tree->header.address;
1303 ref->parent = mr;
1304
1305 InsertTailList(&dr->refs, &ref->list_entry);
1306 }
1307 }
1308 }
1309 }
1310
1311 if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1312 tp = tp2;
1313 else
1314 break;
1315 }
1316
1317 return STATUS_SUCCESS;
1318 }
1319
1320 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1321 traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1322 NTSTATUS Status;
1323 data_reloc* dr;
1324 EXTENT_ITEM* ei;
1325 uint16_t len;
1326 uint64_t inline_rc;
1327 uint8_t* ptr;
1328
1329 dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1330 if (!dr) {
1331 ERR("out of memory\n");
1332 return STATUS_INSUFFICIENT_RESOURCES;
1333 }
1334
1335 dr->address = tp->item->key.obj_id;
1336 dr->size = tp->item->key.offset;
1337 dr->ei = (EXTENT_ITEM*)tp->item->data;
1338 InitializeListHead(&dr->refs);
1339
1340 Status = delete_tree_item(Vcb, tp);
1341 if (!NT_SUCCESS(Status)) {
1342 ERR("delete_tree_item returned %08lx\n", Status);
1343 return Status;
1344 }
1345
1346 if (!c)
1347 c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1348
1349 if (c) {
1350 acquire_chunk_lock(c, Vcb);
1351
1352 c->used -= tp->item->key.offset;
1353
1354 space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1355
1356 release_chunk_lock(c, Vcb);
1357 }
1358
1359 ei = (EXTENT_ITEM*)tp->item->data;
1360 inline_rc = 0;
1361
1362 len = tp->item->size - sizeof(EXTENT_ITEM);
1363 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1364
1365 while (len > 0) {
1366 uint8_t secttype = *ptr;
1367 uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1368
1369 len--;
1370
1371 if (sectlen > len) {
1372 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1373 return STATUS_INTERNAL_ERROR;
1374 }
1375
1376 if (sectlen == 0) {
1377 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1378 return STATUS_INTERNAL_ERROR;
1379 }
1380
1381 if (secttype == TYPE_EXTENT_DATA_REF) {
1382 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1383
1384 inline_rc += edr->count;
1385
1386 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1387 if (!NT_SUCCESS(Status)) {
1388 ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1389 return Status;
1390 }
1391 } else if (secttype == TYPE_SHARED_DATA_REF) {
1392 metadata_reloc* mr;
1393 data_reloc_ref* ref;
1394
1395 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1396 if (!ref) {
1397 ERR("out of memory\n");
1398 return STATUS_INSUFFICIENT_RESOURCES;
1399 }
1400
1401 ref->type = TYPE_SHARED_DATA_REF;
1402 RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1403 inline_rc += ref->sdr.count;
1404
1405 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1406 if (!NT_SUCCESS(Status)) {
1407 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1408 ExFreePool(ref);
1409 return Status;
1410 }
1411
1412 ref->parent = mr;
1413
1414 InsertTailList(&dr->refs, &ref->list_entry);
1415 } else {
1416 ERR("unexpected tree type %x\n", secttype);
1417 return STATUS_INTERNAL_ERROR;
1418 }
1419
1420
1421 len -= sectlen;
1422 ptr += sizeof(uint8_t) + sectlen;
1423 }
1424
1425 if (inline_rc < ei->refcount) { // look for non-inline entries
1426 traverse_ptr tp2 = *tp, next_tp;
1427
1428 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1429 tp2 = next_tp;
1430
1431 if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1432 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1433 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1434 if (!NT_SUCCESS(Status)) {
1435 ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1436 return Status;
1437 }
1438
1439 Status = delete_tree_item(Vcb, &tp2);
1440 if (!NT_SUCCESS(Status)) {
1441 ERR("delete_tree_item returned %08lx\n", Status);
1442 return Status;
1443 }
1444 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1445 metadata_reloc* mr;
1446 data_reloc_ref* ref;
1447
1448 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1449 if (!ref) {
1450 ERR("out of memory\n");
1451 return STATUS_INSUFFICIENT_RESOURCES;
1452 }
1453
1454 ref->type = TYPE_SHARED_DATA_REF;
1455 ref->sdr.offset = tp2.item->key.offset;
1456 ref->sdr.count = *((uint32_t*)tp2.item->data);
1457
1458 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1459 if (!NT_SUCCESS(Status)) {
1460 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1461 ExFreePool(ref);
1462 return Status;
1463 }
1464
1465 ref->parent = mr;
1466 InsertTailList(&dr->refs, &ref->list_entry);
1467
1468 Status = delete_tree_item(Vcb, &tp2);
1469 if (!NT_SUCCESS(Status)) {
1470 ERR("delete_tree_item returned %08lx\n", Status);
1471 return Status;
1472 }
1473 }
1474 } else
1475 break;
1476 }
1477 }
1478
1479 InsertTailList(items, &dr->list_entry);
1480
1481 return STATUS_SUCCESS;
1482 }
1483
1484 static void sort_data_reloc_refs(data_reloc* dr) {
1485 LIST_ENTRY newlist, *le;
1486
1487 if (IsListEmpty(&dr->refs))
1488 return;
1489
1490 // insertion sort
1491
1492 InitializeListHead(&newlist);
1493
1494 while (!IsListEmpty(&dr->refs)) {
1495 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1496 bool inserted = false;
1497
1498 if (ref->type == TYPE_EXTENT_DATA_REF)
1499 ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1500 else if (ref->type == TYPE_SHARED_DATA_REF)
1501 ref->hash = ref->parent->new_address;
1502
1503 le = newlist.Flink;
1504 while (le != &newlist) {
1505 data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1506
1507 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1508 InsertHeadList(le->Blink, &ref->list_entry);
1509 inserted = true;
1510 break;
1511 }
1512
1513 le = le->Flink;
1514 }
1515
1516 if (!inserted)
1517 InsertTailList(&newlist, &ref->list_entry);
1518 }
1519
1520 le = newlist.Flink;
1521 while (le != &newlist) {
1522 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1523
1524 if (le->Flink != &newlist) {
1525 data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1526
1527 if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1528 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1529 RemoveEntryList(&ref2->list_entry);
1530 ref->edr.count += ref2->edr.count;
1531 ExFreePool(ref2);
1532 continue;
1533 }
1534 }
1535
1536 le = le->Flink;
1537 }
1538
1539 newlist.Flink->Blink = &dr->refs;
1540 newlist.Blink->Flink = &dr->refs;
1541 dr->refs.Flink = newlist.Flink;
1542 dr->refs.Blink = newlist.Blink;
1543 }
1544
1545 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1546 NTSTATUS Status;
1547 LIST_ENTRY* le;
1548 uint64_t rc = 0;
1549 uint16_t inline_len;
1550 bool all_inline = true;
1551 data_reloc_ref* first_noninline = NULL;
1552 EXTENT_ITEM* ei;
1553 uint8_t* ptr;
1554
1555 inline_len = sizeof(EXTENT_ITEM);
1556
1557 sort_data_reloc_refs(dr);
1558
1559 le = dr->refs.Flink;
1560 while (le != &dr->refs) {
1561 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1562 uint16_t extlen = 0;
1563
1564 if (ref->type == TYPE_EXTENT_DATA_REF) {
1565 extlen += sizeof(EXTENT_DATA_REF);
1566 rc += ref->edr.count;
1567 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1568 extlen += sizeof(SHARED_DATA_REF);
1569 rc++;
1570 }
1571
1572 if (all_inline) {
1573 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1574 all_inline = false;
1575 first_noninline = ref;
1576 } else
1577 inline_len += extlen + 1;
1578 }
1579
1580 le = le->Flink;
1581 }
1582
1583 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1584 if (!ei) {
1585 ERR("out of memory\n");
1586 return STATUS_INSUFFICIENT_RESOURCES;
1587 }
1588
1589 ei->refcount = rc;
1590 ei->generation = dr->ei->generation;
1591 ei->flags = dr->ei->flags;
1592 ptr = (uint8_t*)&ei[1];
1593
1594 le = dr->refs.Flink;
1595 while (le != &dr->refs) {
1596 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1597
1598 if (ref == first_noninline)
1599 break;
1600
1601 *ptr = ref->type;
1602 ptr++;
1603
1604 if (ref->type == TYPE_EXTENT_DATA_REF) {
1605 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1606
1607 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1608
1609 ptr += sizeof(EXTENT_DATA_REF);
1610 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1611 SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1612
1613 sdr->offset = ref->parent->new_address;
1614 sdr->count = ref->sdr.count;
1615
1616 ptr += sizeof(SHARED_DATA_REF);
1617 }
1618
1619 le = le->Flink;
1620 }
1621
1622 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1623 if (!NT_SUCCESS(Status)) {
1624 ERR("insert_tree_item returned %08lx\n", Status);
1625 return Status;
1626 }
1627
1628 if (!all_inline) {
1629 le = &first_noninline->list_entry;
1630
1631 while (le != &dr->refs) {
1632 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1633
1634 if (ref->type == TYPE_EXTENT_DATA_REF) {
1635 EXTENT_DATA_REF* edr;
1636
1637 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1638 if (!edr) {
1639 ERR("out of memory\n");
1640 return STATUS_INSUFFICIENT_RESOURCES;
1641 }
1642
1643 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1644
1645 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1646 if (!NT_SUCCESS(Status)) {
1647 ERR("insert_tree_item returned %08lx\n", Status);
1648 return Status;
1649 }
1650 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1651 uint32_t* sdr;
1652
1653 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG);
1654 if (!sdr) {
1655 ERR("out of memory\n");
1656 return STATUS_INSUFFICIENT_RESOURCES;
1657 }
1658
1659 *sdr = ref->sdr.count;
1660
1661 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1662 if (!NT_SUCCESS(Status)) {
1663 ERR("insert_tree_item returned %08lx\n", Status);
1664 return Status;
1665 }
1666 }
1667
1668 le = le->Flink;
1669 }
1670 }
1671
1672 return STATUS_SUCCESS;
1673 }
1674
1675 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1676 KEY searchkey;
1677 traverse_ptr tp;
1678 NTSTATUS Status;
1679 bool b;
1680 LIST_ENTRY items, metadata_items, rollback, *le;
1681 uint64_t loaded = 0, num_loaded = 0;
1682 chunk* newchunk = NULL;
1683 uint8_t* data = NULL;
1684
1685 TRACE("chunk %I64x\n", c->offset);
1686
1687 InitializeListHead(&rollback);
1688 InitializeListHead(&items);
1689 InitializeListHead(&metadata_items);
1690
1691 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1692
1693 searchkey.obj_id = c->offset;
1694 searchkey.obj_type = TYPE_EXTENT_ITEM;
1695 searchkey.offset = 0xffffffffffffffff;
1696
1697 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1698 if (!NT_SUCCESS(Status)) {
1699 ERR("find_item returned %08lx\n", Status);
1700 goto end;
1701 }
1702
1703 do {
1704 traverse_ptr next_tp;
1705
1706 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1707 break;
1708
1709 if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1710 bool tree = false;
1711
1712 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1713 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1714
1715 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1716 tree = true;
1717 }
1718
1719 if (!tree) {
1720 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1721
1722 if (!NT_SUCCESS(Status)) {
1723 ERR("add_data_reloc returned %08lx\n", Status);
1724 goto end;
1725 }
1726
1727 loaded += tp.item->key.offset;
1728 num_loaded++;
1729
1730 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1731 break;
1732 }
1733 }
1734
1735 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1736
1737 if (b)
1738 tp = next_tp;
1739 } while (b);
1740
1741 if (IsListEmpty(&items)) {
1742 *changed = false;
1743 Status = STATUS_SUCCESS;
1744 goto end;
1745 } else
1746 *changed = true;
1747
1748 data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1749 if (!data) {
1750 ERR("out of memory\n");
1751 Status = STATUS_INSUFFICIENT_RESOURCES;
1752 goto end;
1753 }
1754
1755 le = items.Flink;
1756 while (le != &items) {
1757 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1758 bool done = false;
1759 LIST_ENTRY* le2;
1760 void* csum;
1761 RTL_BITMAP bmp;
1762 ULONG* bmparr;
1763 ULONG bmplen, runlength, index, lastoff;
1764
1765 if (newchunk) {
1766 acquire_chunk_lock(newchunk, Vcb);
1767
1768 if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1769 newchunk->used += dr->size;
1770 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1771 done = true;
1772 }
1773
1774 release_chunk_lock(newchunk, Vcb);
1775 }
1776
1777 if (!done) {
1778 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1779
1780 le2 = Vcb->chunks.Flink;
1781 while (le2 != &Vcb->chunks) {
1782 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1783
1784 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1785 acquire_chunk_lock(c2, Vcb);
1786
1787 if ((c2->chunk_item->size - c2->used) >= dr->size) {
1788 if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1789 c2->used += dr->size;
1790 space_list_subtract(c2, false, dr->new_address, dr->size, &rollback);
1791 release_chunk_lock(c2, Vcb);
1792 newchunk = c2;
1793 done = true;
1794 break;
1795 }
1796 }
1797
1798 release_chunk_lock(c2, Vcb);
1799 }
1800
1801 le2 = le2->Flink;
1802 }
1803
1804 // allocate new chunk if necessary
1805 if (!done) {
1806 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1807
1808 if (!NT_SUCCESS(Status)) {
1809 ERR("alloc_chunk returned %08lx\n", Status);
1810 ExReleaseResourceLite(&Vcb->chunk_lock);
1811 goto end;
1812 }
1813
1814 acquire_chunk_lock(newchunk, Vcb);
1815
1816 newchunk->balance_num = Vcb->balance.balance_num;
1817
1818 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1819 release_chunk_lock(newchunk, Vcb);
1820 ExReleaseResourceLite(&Vcb->chunk_lock);
1821 ERR("could not find address in new chunk\n");
1822 Status = STATUS_DISK_FULL;
1823 goto end;
1824 } else {
1825 newchunk->used += dr->size;
1826 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1827 }
1828
1829 release_chunk_lock(newchunk, Vcb);
1830 }
1831
1832 ExReleaseResourceLite(&Vcb->chunk_lock);
1833 }
1834
1835 dr->newchunk = newchunk;
1836
1837 bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size);
1838
1839 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1840 if (!bmparr) {
1841 ERR("out of memory\n");
1842 Status = STATUS_INSUFFICIENT_RESOURCES;
1843 goto end;
1844 }
1845
1846 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * Vcb->csum_size / Vcb->superblock.sector_size), ALLOC_TAG);
1847 if (!csum) {
1848 ERR("out of memory\n");
1849 ExFreePool(bmparr);
1850 Status = STATUS_INSUFFICIENT_RESOURCES;
1851 goto end;
1852 }
1853
1854 RtlInitializeBitMap(&bmp, bmparr, bmplen);
1855 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1856
1857 searchkey.obj_id = EXTENT_CSUM_ID;
1858 searchkey.obj_type = TYPE_EXTENT_CSUM;
1859 searchkey.offset = dr->address;
1860
1861 Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1862 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1863 ERR("find_item returned %08lx\n", Status);
1864 ExFreePool(csum);
1865 ExFreePool(bmparr);
1866 goto end;
1867 }
1868
1869 if (Status != STATUS_NOT_FOUND) {
1870 do {
1871 traverse_ptr next_tp;
1872
1873 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1874 if (tp.item->key.offset >= dr->address + dr->size)
1875 break;
1876 else if (tp.item->size >= Vcb->csum_size && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / Vcb->csum_size) >= dr->address) {
1877 uint64_t cs = max(dr->address, tp.item->key.offset);
1878 uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / Vcb->csum_size));
1879
1880 RtlCopyMemory((uint8_t*)csum + ((cs - dr->address) * Vcb->csum_size / Vcb->superblock.sector_size),
1881 tp.item->data + ((cs - tp.item->key.offset) * Vcb->csum_size / Vcb->superblock.sector_size),
1882 (ULONG)((ce - cs) * Vcb->csum_size / Vcb->superblock.sector_size));
1883
1884 RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1885
1886 if (ce == dr->address + dr->size)
1887 break;
1888 }
1889 }
1890
1891 if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1892 tp = next_tp;
1893 else
1894 break;
1895 } while (true);
1896 }
1897
1898 lastoff = 0;
1899 runlength = RtlFindFirstRunClear(&bmp, &index);
1900
1901 while (runlength != 0) {
1902 if (index >= bmplen)
1903 break;
1904
1905 if (index + runlength >= bmplen) {
1906 runlength = bmplen - index;
1907
1908 if (runlength == 0)
1909 break;
1910 }
1911
1912 if (index > lastoff) {
1913 ULONG off = lastoff;
1914 ULONG size = index - lastoff;
1915
1916 // handle no csum run
1917 do {
1918 ULONG rl;
1919
1920 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1921 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1922 else
1923 rl = size;
1924
1925 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1926 c, NULL, NULL, 0, false, NormalPagePriority);
1927 if (!NT_SUCCESS(Status)) {
1928 ERR("read_data returned %08lx\n", Status);
1929 ExFreePool(csum);
1930 ExFreePool(bmparr);
1931 goto end;
1932 }
1933
1934 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1935 NULL, newchunk, false, 0, NormalPagePriority);
1936 if (!NT_SUCCESS(Status)) {
1937 ERR("write_data_complete returned %08lx\n", Status);
1938 ExFreePool(csum);
1939 ExFreePool(bmparr);
1940 goto end;
1941 }
1942
1943 size -= rl;
1944 off += rl;
1945 } while (size > 0);
1946 }
1947
1948 add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, (uint8_t*)csum + (index * Vcb->csum_size), NULL);
1949 add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1950
1951 // handle csum run
1952 do {
1953 ULONG rl;
1954
1955 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1956 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1957 else
1958 rl = runlength;
1959
1960 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size,
1961 (uint8_t*)csum + (index * Vcb->csum_size), false, data, c, NULL, NULL, 0, false, NormalPagePriority);
1962 if (!NT_SUCCESS(Status)) {
1963 ERR("read_data returned %08lx\n", Status);
1964 ExFreePool(csum);
1965 ExFreePool(bmparr);
1966 goto end;
1967 }
1968
1969 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1970 NULL, newchunk, false, 0, NormalPagePriority);
1971 if (!NT_SUCCESS(Status)) {
1972 ERR("write_data_complete returned %08lx\n", Status);
1973 ExFreePool(csum);
1974 ExFreePool(bmparr);
1975 goto end;
1976 }
1977
1978 runlength -= rl;
1979 index += rl;
1980 } while (runlength > 0);
1981
1982 lastoff = index;
1983 runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1984 }
1985
1986 ExFreePool(csum);
1987 ExFreePool(bmparr);
1988
1989 // handle final nocsum run
1990 if (lastoff < dr->size / Vcb->superblock.sector_size) {
1991 ULONG off = lastoff;
1992 ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1993
1994 do {
1995 ULONG rl;
1996
1997 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1998 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1999 else
2000 rl = size;
2001
2002 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
2003 c, NULL, NULL, 0, false, NormalPagePriority);
2004 if (!NT_SUCCESS(Status)) {
2005 ERR("read_data returned %08lx\n", Status);
2006 goto end;
2007 }
2008
2009 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
2010 NULL, newchunk, false, 0, NormalPagePriority);
2011 if (!NT_SUCCESS(Status)) {
2012 ERR("write_data_complete returned %08lx\n", Status);
2013 goto end;
2014 }
2015
2016 size -= rl;
2017 off += rl;
2018 } while (size > 0);
2019 }
2020
2021 le = le->Flink;
2022 }
2023
2024 ExFreePool(data);
2025 data = NULL;
2026
2027 Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2028 if (!NT_SUCCESS(Status)) {
2029 ERR("write_metadata_items returned %08lx\n", Status);
2030 goto end;
2031 }
2032
2033 le = items.Flink;
2034 while (le != &items) {
2035 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2036
2037 Status = add_data_reloc_extent_item(Vcb, dr);
2038 if (!NT_SUCCESS(Status)) {
2039 ERR("add_data_reloc_extent_item returned %08lx\n", Status);
2040 goto end;
2041 }
2042
2043 le = le->Flink;
2044 }
2045
2046 le = c->changed_extents.Flink;
2047 while (le != &c->changed_extents) {
2048 LIST_ENTRY *le2, *le3;
2049 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2050
2051 le3 = le->Flink;
2052
2053 le2 = items.Flink;
2054 while (le2 != &items) {
2055 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2056
2057 if (ce->address == dr->address) {
2058 ce->address = dr->new_address;
2059 RemoveEntryList(&ce->list_entry);
2060 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2061 break;
2062 }
2063
2064 le2 = le2->Flink;
2065 }
2066
2067 le = le3;
2068 }
2069
2070 Status = STATUS_SUCCESS;
2071
2072 Vcb->need_write = true;
2073
2074 end:
2075 if (NT_SUCCESS(Status)) {
2076 // update extents in cache inodes before we flush
2077 le = Vcb->chunks.Flink;
2078 while (le != &Vcb->chunks) {
2079 chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2080
2081 if (c2->cache) {
2082 LIST_ENTRY* le2;
2083
2084 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2085
2086 le2 = c2->cache->extents.Flink;
2087 while (le2 != &c2->cache->extents) {
2088 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2089
2090 if (!ext->ignore) {
2091 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2092 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2093
2094 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2095 LIST_ENTRY* le3 = items.Flink;
2096 while (le3 != &items) {
2097 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2098
2099 if (ed2->address == dr->address) {
2100 ed2->address = dr->new_address;
2101 break;
2102 }
2103
2104 le3 = le3->Flink;
2105 }
2106 }
2107 }
2108 }
2109
2110 le2 = le2->Flink;
2111 }
2112
2113 ExReleaseResourceLite(c2->cache->Header.Resource);
2114 }
2115
2116 le = le->Flink;
2117 }
2118
2119 Status = do_write(Vcb, NULL);
2120 if (!NT_SUCCESS(Status))
2121 ERR("do_write returned %08lx\n", Status);
2122 }
2123
2124 if (NT_SUCCESS(Status)) {
2125 clear_rollback(&rollback);
2126
2127 // update open FCBs
2128 // FIXME - speed this up(?)
2129
2130 le = Vcb->all_fcbs.Flink;
2131 while (le != &Vcb->all_fcbs) {
2132 struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2133 LIST_ENTRY* le2;
2134
2135 ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2136
2137 le2 = fcb->extents.Flink;
2138 while (le2 != &fcb->extents) {
2139 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2140
2141 if (!ext->ignore) {
2142 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2143 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2144
2145 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2146 LIST_ENTRY* le3 = items.Flink;
2147 while (le3 != &items) {
2148 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2149
2150 if (ed2->address == dr->address) {
2151 ed2->address = dr->new_address;
2152 break;
2153 }
2154
2155 le3 = le3->Flink;
2156 }
2157 }
2158 }
2159 }
2160
2161 le2 = le2->Flink;
2162 }
2163
2164 ExReleaseResourceLite(fcb->Header.Resource);
2165
2166 le = le->Flink;
2167 }
2168 } else
2169 do_rollback(Vcb, &rollback);
2170
2171 free_trees(Vcb);
2172
2173 ExReleaseResourceLite(&Vcb->tree_lock);
2174
2175 if (data)
2176 ExFreePool(data);
2177
2178 while (!IsListEmpty(&items)) {
2179 data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2180
2181 while (!IsListEmpty(&dr->refs)) {
2182 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2183
2184 ExFreePool(ref);
2185 }
2186
2187 ExFreePool(dr);
2188 }
2189
2190 while (!IsListEmpty(&metadata_items)) {
2191 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2192
2193 while (!IsListEmpty(&mr->refs)) {
2194 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2195
2196 ExFreePool(ref);
2197 }
2198
2199 if (mr->data)
2200 ExFreePool(mr->data);
2201
2202 ExFreePool(mr);
2203 }
2204
2205 return Status;
2206 }
2207
2208 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2209 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2210 return BLOCK_FLAG_RAID0;
2211 else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2212 return BLOCK_FLAG_RAID1;
2213 else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2214 return BLOCK_FLAG_DUPLICATE;
2215 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2216 return BLOCK_FLAG_RAID10;
2217 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2218 return BLOCK_FLAG_RAID5;
2219 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2220 return BLOCK_FLAG_RAID6;
2221 else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3)
2222 return BLOCK_FLAG_RAID1C3;
2223 else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4)
2224 return BLOCK_FLAG_RAID1C4;
2225 else
2226 return BLOCK_FLAG_SINGLE;
2227 }
2228
2229 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) {
2230 btrfs_balance_opts* opts;
2231
2232 opts = &Vcb->balance.opts[sort];
2233
2234 if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2235 return false;
2236
2237 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2238 uint64_t type = get_chunk_dup_type(c);
2239
2240 if (!(type & opts->profiles))
2241 return false;
2242 }
2243
2244 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2245 uint16_t i;
2246 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2247 bool b = false;
2248
2249 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2250 if (cis[i].dev_id == opts->devid) {
2251 b = true;
2252 break;
2253 }
2254 }
2255
2256 if (!b)
2257 return false;
2258 }
2259
2260 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2261 uint16_t i, factor;
2262 uint64_t physsize;
2263 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2264 bool b = false;
2265
2266 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2267 factor = c->chunk_item->num_stripes;
2268 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2269 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2270 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2271 factor = c->chunk_item->num_stripes - 1;
2272 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2273 factor = c->chunk_item->num_stripes - 2;
2274 else // SINGLE, DUPLICATE, RAID1, RAID1C3, RAID1C4
2275 factor = 1;
2276
2277 physsize = c->chunk_item->size / factor;
2278
2279 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2280 if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2281 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2282 b = true;
2283 break;
2284 }
2285 }
2286
2287 if (!b)
2288 return false;
2289 }
2290
2291 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2292 if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2293 return false;
2294 }
2295
2296 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2297 if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2298 return false;
2299 }
2300
2301 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2302 uint64_t usage = c->used * 100 / c->chunk_item->size;
2303
2304 // usage == 0 should mean completely empty, not just that usage rounds to 0%
2305 if (c->used > 0 && usage == 0)
2306 usage = 1;
2307
2308 if (usage < opts->usage_start || usage > opts->usage_end)
2309 return false;
2310 }
2311
2312 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2313 uint64_t type = get_chunk_dup_type(c);
2314
2315 if (type == opts->convert)
2316 return false;
2317 }
2318
2319 return true;
2320 }
2321
2322 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2323 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2324 args->profiles = opts->profiles;
2325 args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2326 }
2327
2328 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2329 if (args->usage_start == 0) {
2330 args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2331 args->usage_start = opts->usage_start;
2332 args->usage_end = opts->usage_end;
2333 } else {
2334 args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2335 args->usage = opts->usage_end;
2336 }
2337 }
2338
2339 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2340 args->devid = opts->devid;
2341 args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2342 }
2343
2344 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2345 args->drange_start = opts->drange_start;
2346 args->drange_end = opts->drange_end;
2347 args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2348 }
2349
2350 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2351 args->vrange_start = opts->vrange_start;
2352 args->vrange_end = opts->vrange_end;
2353 args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2354 }
2355
2356 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2357 args->convert = opts->convert;
2358 args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2359
2360 if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2361 args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2362 }
2363
2364 if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2365 if (args->limit_start == 0) {
2366 args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2367 args->limit_start = (uint32_t)opts->limit_start;
2368 args->limit_end = (uint32_t)opts->limit_end;
2369 } else {
2370 args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2371 args->limit = opts->limit_end;
2372 }
2373 }
2374
2375 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2376 args->stripes_start = opts->stripes_start;
2377 args->stripes_end = opts->stripes_end;
2378 args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2379 }
2380 }
2381
2382 static NTSTATUS add_balance_item(device_extension* Vcb) {
2383 KEY searchkey;
2384 traverse_ptr tp;
2385 NTSTATUS Status;
2386 BALANCE_ITEM* bi;
2387
2388 searchkey.obj_id = BALANCE_ITEM_ID;
2389 searchkey.obj_type = TYPE_TEMP_ITEM;
2390 searchkey.offset = 0;
2391
2392 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2393
2394 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2395 if (!NT_SUCCESS(Status)) {
2396 ERR("find_item returned %08lx\n", Status);
2397 goto end;
2398 }
2399
2400 if (!keycmp(tp.item->key, searchkey)) {
2401 Status = delete_tree_item(Vcb, &tp);
2402 if (!NT_SUCCESS(Status)) {
2403 ERR("delete_tree_item returned %08lx\n", Status);
2404 goto end;
2405 }
2406 }
2407
2408 bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2409 if (!bi) {
2410 ERR("out of memory\n");
2411 Status = STATUS_INSUFFICIENT_RESOURCES;
2412 goto end;
2413 }
2414
2415 RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2416
2417 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2418 bi->flags |= BALANCE_FLAGS_DATA;
2419 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2420 }
2421
2422 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2423 bi->flags |= BALANCE_FLAGS_METADATA;
2424 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2425 }
2426
2427 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2428 bi->flags |= BALANCE_FLAGS_SYSTEM;
2429 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2430 }
2431
2432 Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2433 if (!NT_SUCCESS(Status)) {
2434 ERR("insert_tree_item returned %08lx\n", Status);
2435 ExFreePool(bi);
2436 goto end;
2437 }
2438
2439 Status = STATUS_SUCCESS;
2440
2441 end:
2442 if (NT_SUCCESS(Status)) {
2443 Status = do_write(Vcb, NULL);
2444 if (!NT_SUCCESS(Status))
2445 ERR("do_write returned %08lx\n", Status);
2446 }
2447
2448 free_trees(Vcb);
2449
2450 ExReleaseResourceLite(&Vcb->tree_lock);
2451
2452 return Status;
2453 }
2454
2455 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2456 KEY searchkey;
2457 traverse_ptr tp;
2458 NTSTATUS Status;
2459
2460 searchkey.obj_id = BALANCE_ITEM_ID;
2461 searchkey.obj_type = TYPE_TEMP_ITEM;
2462 searchkey.offset = 0;
2463
2464 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2465
2466 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2467 if (!NT_SUCCESS(Status)) {
2468 ERR("find_item returned %08lx\n", Status);
2469 goto end;
2470 }
2471
2472 if (!keycmp(tp.item->key, searchkey)) {
2473 Status = delete_tree_item(Vcb, &tp);
2474 if (!NT_SUCCESS(Status)) {
2475 ERR("delete_tree_item returned %08lx\n", Status);
2476 goto end;
2477 }
2478
2479 Status = do_write(Vcb, NULL);
2480 if (!NT_SUCCESS(Status)) {
2481 ERR("do_write returned %08lx\n", Status);
2482 goto end;
2483 }
2484
2485 free_trees(Vcb);
2486 }
2487
2488 Status = STATUS_SUCCESS;
2489
2490 end:
2491 ExReleaseResourceLite(&Vcb->tree_lock);
2492
2493 return Status;
2494 }
2495
2496 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2497 opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2498
2499 if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2500 opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2501 opts->profiles = args->profiles;
2502 }
2503
2504 if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2505 opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2506
2507 opts->usage_start = 0;
2508 opts->usage_end = (uint8_t)args->usage;
2509 } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2510 opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2511
2512 opts->usage_start = (uint8_t)args->usage_start;
2513 opts->usage_end = (uint8_t)args->usage_end;
2514 }
2515
2516 if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2517 opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2518 opts->devid = args->devid;
2519 }
2520
2521 if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2522 opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2523 opts->drange_start = args->drange_start;
2524 opts->drange_end = args->drange_end;
2525 }
2526
2527 if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2528 opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2529 opts->vrange_start = args->vrange_start;
2530 opts->vrange_end = args->vrange_end;
2531 }
2532
2533 if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2534 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2535
2536 opts->limit_start = 0;
2537 opts->limit_end = args->limit;
2538 } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2539 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2540
2541 opts->limit_start = args->limit_start;
2542 opts->limit_end = args->limit_end;
2543 }
2544
2545 if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2546 opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2547
2548 opts->stripes_start = (uint16_t)args->stripes_start;
2549 opts->stripes_end = (uint16_t)args->stripes_end;
2550 }
2551
2552 if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2553 opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2554 opts->convert = args->convert;
2555
2556 if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2557 opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2558 }
2559 }
2560
2561 static NTSTATUS remove_superblocks(device* dev) {
2562 NTSTATUS Status;
2563 superblock* sb;
2564 int i = 0;
2565
2566 sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2567 if (!sb) {
2568 ERR("out of memory\n");
2569 return STATUS_INSUFFICIENT_RESOURCES;
2570 }
2571
2572 RtlZeroMemory(sb, sizeof(superblock));
2573
2574 while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2575 Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2576
2577 if (!NT_SUCCESS(Status)) {
2578 ExFreePool(sb);
2579 return Status;
2580 }
2581
2582 i++;
2583 }
2584
2585 ExFreePool(sb);
2586
2587 return STATUS_SUCCESS;
2588 }
2589
2590 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2591 KEY searchkey;
2592 traverse_ptr tp;
2593 NTSTATUS Status;
2594 LIST_ENTRY* le;
2595 volume_device_extension* vde;
2596
2597 if (Vcb->need_write) {
2598 Status = do_write(Vcb, NULL);
2599
2600 if (!NT_SUCCESS(Status))
2601 ERR("do_write returned %08lx\n", Status);
2602 } else
2603 Status = STATUS_SUCCESS;
2604
2605 free_trees(Vcb);
2606
2607 if (!NT_SUCCESS(Status))
2608 return Status;
2609
2610 // remove entry in chunk tree
2611
2612 searchkey.obj_id = 1;
2613 searchkey.obj_type = TYPE_DEV_ITEM;
2614 searchkey.offset = dev->devitem.dev_id;
2615
2616 Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2617 if (!NT_SUCCESS(Status)) {
2618 ERR("find_item returned %08lx\n", Status);
2619 return Status;
2620 }
2621
2622 if (!keycmp(searchkey, tp.item->key)) {
2623 Status = delete_tree_item(Vcb, &tp);
2624
2625 if (!NT_SUCCESS(Status)) {
2626 ERR("delete_tree_item returned %08lx\n", Status);
2627 return Status;
2628 }
2629 }
2630
2631 // remove stats entry in device tree
2632
2633 searchkey.obj_id = 0;
2634 searchkey.obj_type = TYPE_DEV_STATS;
2635 searchkey.offset = dev->devitem.dev_id;
2636
2637 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2638 if (!NT_SUCCESS(Status)) {
2639 ERR("find_item returned %08lx\n", Status);
2640 return Status;
2641 }
2642
2643 if (!keycmp(searchkey, tp.item->key)) {
2644 Status = delete_tree_item(Vcb, &tp);
2645
2646 if (!NT_SUCCESS(Status)) {
2647 ERR("delete_tree_item returned %08lx\n", Status);
2648 return Status;
2649 }
2650 }
2651
2652 // update superblock
2653
2654 Vcb->superblock.num_devices--;
2655 Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2656 Vcb->devices_loaded--;
2657
2658 RemoveEntryList(&dev->list_entry);
2659
2660 // flush
2661
2662 Status = do_write(Vcb, NULL);
2663 if (!NT_SUCCESS(Status))
2664 ERR("do_write returned %08lx\n", Status);
2665
2666 free_trees(Vcb);
2667
2668 if (!NT_SUCCESS(Status))
2669 return Status;
2670
2671 if (!dev->readonly && dev->devobj) {
2672 Status = remove_superblocks(dev);
2673 if (!NT_SUCCESS(Status))
2674 WARN("remove_superblocks returned %08lx\n", Status);
2675 }
2676
2677 // remove entry in volume list
2678
2679 vde = Vcb->vde;
2680
2681 if (dev->devobj) {
2682 pdo_device_extension* pdode = vde->pdode;
2683
2684 ExAcquireResourceExclusiveLite(&pdode->child_lock, true);
2685
2686 le = pdode->children.Flink;
2687 while (le != &pdode->children) {
2688 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2689
2690 if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2691 PFILE_OBJECT FileObject;
2692 PDEVICE_OBJECT mountmgr;
2693 UNICODE_STRING mmdevpath;
2694
2695 pdode->children_loaded--;
2696
2697 if (vc->had_drive_letter) { // re-add entry to mountmgr
2698 RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2699 Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2700 if (!NT_SUCCESS(Status))
2701 ERR("IoGetDeviceObjectPointer returned %08lx\n", Status);
2702 else {
2703 MOUNTDEV_NAME mdn;
2704
2705 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2706 if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2707 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2708 else {
2709 MOUNTDEV_NAME* mdn2;
2710 ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2711
2712 mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2713 if (!mdn2)
2714 ERR("out of memory\n");
2715 else {
2716 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2717 if (!NT_SUCCESS(Status))
2718 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2719 else {
2720 UNICODE_STRING name;
2721
2722 name.Buffer = mdn2->Name;
2723 name.Length = name.MaximumLength = mdn2->NameLength;
2724
2725 Status = mountmgr_add_drive_letter(mountmgr, &name);
2726 if (!NT_SUCCESS(Status))
2727 WARN("mountmgr_add_drive_letter returned %08lx\n", Status);
2728 }
2729
2730 ExFreePool(mdn2);
2731 }
2732 }
2733
2734 ObDereferenceObject(FileObject);
2735 }
2736 }
2737
2738 ExFreePool(vc->pnp_name.Buffer);
2739 RemoveEntryList(&vc->list_entry);
2740 ExFreePool(vc);
2741
2742 ObDereferenceObject(vc->fileobj);
2743
2744 break;
2745 }
2746
2747 le = le->Flink;
2748 }
2749
2750 if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2751 vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2752
2753 le = pdode->children.Flink;
2754 while (le != &pdode->children) {
2755 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2756
2757 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2758 vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2759 break;
2760 }
2761
2762 le = le->Flink;
2763 }
2764 }
2765
2766 pdode->num_children = Vcb->superblock.num_devices;
2767
2768 ExReleaseResourceLite(&pdode->child_lock);
2769
2770 // free dev
2771
2772 if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2773 trim_whole_device(dev);
2774 }
2775
2776 while (!IsListEmpty(&dev->space)) {
2777 LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2778 space* s = CONTAINING_RECORD(le2, space, list_entry);
2779
2780 ExFreePool(s);
2781 }
2782
2783 ExFreePool(dev);
2784
2785 if (Vcb->trim) {
2786 Vcb->trim = false;
2787
2788 le = Vcb->devices.Flink;
2789 while (le != &Vcb->devices) {
2790 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2791
2792 if (dev2->trim) {
2793 Vcb->trim = true;
2794 break;
2795 }
2796
2797 le = le->Flink;
2798 }
2799 }
2800
2801 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2802
2803 return STATUS_SUCCESS;
2804 }
2805
2806 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2807 DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2808 DEVICE_DATA_SET_RANGE* ranges;
2809 ULONG datalen, i;
2810 KEY searchkey;
2811 traverse_ptr tp;
2812 NTSTATUS Status;
2813 bool b;
2814 uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2815 LIST_ENTRY* le;
2816
2817 dev->num_trim_entries = 0;
2818
2819 searchkey.obj_id = dev->devitem.dev_id;
2820 searchkey.obj_type = TYPE_DEV_EXTENT;
2821 searchkey.offset = 0;
2822
2823 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2824 if (!NT_SUCCESS(Status)) {
2825 ERR("find_item returned %08lx\n", Status);
2826 return;
2827 }
2828
2829 do {
2830 traverse_ptr next_tp;
2831
2832 if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2833 if (tp.item->size >= sizeof(DEV_EXTENT)) {
2834 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2835
2836 if (tp.item->key.offset > lastoff)
2837 add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2838
2839 lastoff = tp.item->key.offset + de->length;
2840 } else {
2841 ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2842 return;
2843 }
2844 }
2845
2846 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2847
2848 if (b) {
2849 tp = next_tp;
2850 if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2851 break;
2852 }
2853 } while (b);
2854
2855 if (lastoff < dev->devitem.num_bytes)
2856 add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2857
2858 if (dev->num_trim_entries == 0)
2859 return;
2860
2861 datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2862
2863 dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2864 if (!dmdsa) {
2865 ERR("out of memory\n");
2866 goto end;
2867 }
2868
2869 dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2870 dmdsa->Action = DeviceDsmAction_Trim;
2871 dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2872 dmdsa->ParameterBlockOffset = 0;
2873 dmdsa->ParameterBlockLength = 0;
2874 dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t));
2875 dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2876
2877 ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2878
2879 i = 0;
2880 le = dev->trim_list.Flink;
2881 while (le != &dev->trim_list) {
2882 space* s = CONTAINING_RECORD(le, space, list_entry);
2883
2884 ranges[i].StartingOffset = s->address;
2885 ranges[i].LengthInBytes = s->size;
2886 i++;
2887
2888 le = le->Flink;
2889 }
2890
2891 Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL);
2892 if (!NT_SUCCESS(Status))
2893 WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08lx\n", Status);
2894
2895 ExFreePool(dmdsa);
2896
2897 end:
2898 while (!IsListEmpty(&dev->trim_list)) {
2899 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2900 ExFreePool(s);
2901 }
2902
2903 dev->num_trim_entries = 0;
2904 }
2905
2906 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) {
2907 NTSTATUS Status;
2908 bool changed;
2909 LIST_ENTRY* le;
2910 chunk* rc;
2911
2912 // FIXME - allow with metadata chunks?
2913
2914 while (true) {
2915 rc = NULL;
2916
2917 ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2918
2919 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2920
2921 // choose the least-used chunk we haven't looked at yet
2922 le = Vcb->chunks.Flink;
2923 while (le != &Vcb->chunks) {
2924 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2925
2926 // FIXME - skip full-size chunks over e.g. 90% full?
2927 if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2928 rc = c;
2929
2930 le = le->Flink;
2931 }
2932
2933 ExReleaseResourceLite(&Vcb->chunk_lock);
2934
2935 if (!rc) {
2936 ExReleaseResourceLite(&Vcb->tree_lock);
2937 break;
2938 }
2939
2940 if (rc->list_entry_balance.Flink) {
2941 RemoveEntryList(&rc->list_entry_balance);
2942 Vcb->balance.chunks_left--;
2943 }
2944
2945 rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2946 rc->reloc = true;
2947
2948 ExReleaseResourceLite(&Vcb->tree_lock);
2949
2950 do {
2951 changed = false;
2952
2953 Status = balance_data_chunk(Vcb, rc, &changed);
2954 if (!NT_SUCCESS(Status)) {
2955 ERR("balance_data_chunk returned %08lx\n", Status);
2956 Vcb->balance.status = Status;
2957 rc->list_entry_balance.Flink = NULL;
2958 rc->reloc = false;
2959 return Status;
2960 }
2961
2962 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2963
2964 if (Vcb->readonly)
2965 Vcb->balance.stopping = true;
2966
2967 if (Vcb->balance.stopping)
2968 return STATUS_SUCCESS;
2969 } while (changed);
2970
2971 rc->list_entry_balance.Flink = NULL;
2972
2973 rc->changed = true;
2974 rc->space_changed = true;
2975 rc->balance_num = Vcb->balance.balance_num;
2976
2977 Status = do_write(Vcb, NULL);
2978 if (!NT_SUCCESS(Status)) {
2979 ERR("do_write returned %08lx\n", Status);
2980 return Status;
2981 }
2982
2983 free_trees(Vcb);
2984 }
2985
2986 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2987
2988 Status = alloc_chunk(Vcb, flags, &rc, true);
2989
2990 ExReleaseResourceLite(&Vcb->chunk_lock);
2991
2992 if (NT_SUCCESS(Status)) {
2993 *newchunk = rc;
2994 return Status;
2995 } else {
2996 ERR("alloc_chunk returned %08lx\n", Status);
2997 return Status;
2998 }
2999 }
3000
3001 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
3002 LIST_ENTRY* le;
3003
3004 while (!IsListEmpty(&dev->space)) {
3005 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
3006
3007 ExFreePool(s);
3008 }
3009
3010 // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3011
3012 space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3013
3014 le = Vcb->chunks.Flink;
3015 while (le != &Vcb->chunks) {
3016 uint16_t n;
3017 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3018 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3019
3020 for (n = 0; n < c->chunk_item->num_stripes; n++) {
3021 uint64_t stripe_size = 0;
3022
3023 if (cis[n].dev_id == dev->devitem.dev_id) {
3024 if (stripe_size == 0) {
3025 uint16_t factor;
3026
3027 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3028 factor = c->chunk_item->num_stripes;
3029 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3030 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3031 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3032 factor = c->chunk_item->num_stripes - 1;
3033 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3034 factor = c->chunk_item->num_stripes - 2;
3035 else // SINGLE, DUP, RAID1, RAID1C3, RAID1C4
3036 factor = 1;
3037
3038 stripe_size = c->chunk_item->size / factor;
3039 }
3040
3041 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3042 }
3043 }
3044
3045 le = le->Flink;
3046 }
3047
3048 return STATUS_SUCCESS;
3049 }
3050
3051 _Function_class_(KSTART_ROUTINE)
3052 void __stdcall balance_thread(void* context) {
3053 device_extension* Vcb = (device_extension*)context;
3054 LIST_ENTRY chunks;
3055 LIST_ENTRY* le;
3056 uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3057 uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3058 NTSTATUS Status;
3059
3060 Vcb->balance.balance_num++;
3061
3062 Vcb->balance.stopping = false;
3063 KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3064
3065 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3066 old_data_flags = Vcb->data_flags;
3067 Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3068
3069 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3070 }
3071
3072 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3073 old_metadata_flags = Vcb->metadata_flags;
3074 Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3075 }
3076
3077 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3078 old_system_flags = Vcb->system_flags;
3079 Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3080 }
3081
3082 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3083 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3084 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3085 else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3086 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3087 }
3088
3089 num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3090 Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3091
3092 InitializeListHead(&chunks);
3093
3094 // FIXME - what are we supposed to do with limit_start?
3095
3096 if (!Vcb->readonly) {
3097 if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3098 Status = add_balance_item(Vcb);
3099 if (!NT_SUCCESS(Status)) {
3100 ERR("add_balance_item returned %08lx\n", Status);
3101 Vcb->balance.status = Status;
3102 goto end;
3103 }
3104 } else {
3105 if (Vcb->need_write) {
3106 Status = do_write(Vcb, NULL);
3107
3108 free_trees(Vcb);
3109
3110 if (!NT_SUCCESS(Status)) {
3111 ERR("do_write returned %08lx\n", Status);
3112 Vcb->balance.status = Status;
3113 goto end;
3114 }
3115 }
3116 }
3117 }
3118
3119 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3120
3121 if (Vcb->balance.stopping)
3122 goto end;
3123
3124 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3125
3126 le = Vcb->chunks.Flink;
3127 while (le != &Vcb->chunks) {
3128 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3129 uint8_t sort;
3130
3131 acquire_chunk_lock(c, Vcb);
3132
3133 if (c->chunk_item->type & BLOCK_FLAG_DATA)
3134 sort = BALANCE_OPTS_DATA;
3135 else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3136 sort = BALANCE_OPTS_METADATA;
3137 else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3138 sort = BALANCE_OPTS_SYSTEM;
3139 else {
3140 ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3141 release_chunk_lock(c, Vcb);
3142 break;
3143 }
3144
3145 if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3146 should_balance_chunk(Vcb, sort, c)) {
3147 InsertTailList(&chunks, &c->list_entry_balance);
3148
3149 num_chunks[sort]++;
3150 Vcb->balance.total_chunks++;
3151 Vcb->balance.chunks_left++;
3152 } else if (sort == BALANCE_OPTS_METADATA)
3153 okay_metadata_chunks++;
3154 else if (sort == BALANCE_OPTS_DATA)
3155 okay_data_chunks++;
3156 else if (sort == BALANCE_OPTS_SYSTEM)
3157 okay_system_chunks++;
3158
3159 if (!c->cache_loaded) {
3160 Status = load_cache_chunk(Vcb, c, NULL);
3161
3162 if (!NT_SUCCESS(Status)) {
3163 ERR("load_cache_chunk returned %08lx\n", Status);
3164 Vcb->balance.status = Status;
3165 release_chunk_lock(c, Vcb);
3166 ExReleaseResourceLite(&Vcb->chunk_lock);
3167 goto end;
3168 }
3169 }
3170
3171 release_chunk_lock(c, Vcb);
3172
3173 le = le->Flink;
3174 }
3175
3176 ExReleaseResourceLite(&Vcb->chunk_lock);
3177
3178 // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3179 if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3180 bool consolidated = false;
3181 chunk* c;
3182
3183 if (okay_metadata_chunks == 0) {
3184 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3185
3186 Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3187 if (NT_SUCCESS(Status))
3188 c->balance_num = Vcb->balance.balance_num;
3189 else if (Status != STATUS_DISK_FULL || consolidated) {
3190 ERR("alloc_chunk returned %08lx\n", Status);
3191 ExReleaseResourceLite(&Vcb->chunk_lock);
3192 Vcb->balance.status = Status;
3193 goto end;
3194 }
3195
3196 ExReleaseResourceLite(&Vcb->chunk_lock);
3197
3198 if (Status == STATUS_DISK_FULL) {
3199 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3200 if (!NT_SUCCESS(Status)) {
3201 ERR("try_consolidation returned %08lx\n", Status);
3202 Vcb->balance.status = Status;
3203 goto end;
3204 } else
3205 c->balance_num = Vcb->balance.balance_num;
3206
3207 consolidated = true;
3208
3209 if (Vcb->balance.stopping)
3210 goto end;
3211 }
3212 }
3213
3214 if (okay_data_chunks == 0) {
3215 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3216
3217 Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3218 if (NT_SUCCESS(Status))
3219 c->balance_num = Vcb->balance.balance_num;
3220 else if (Status != STATUS_DISK_FULL || consolidated) {
3221 ERR("alloc_chunk returned %08lx\n", Status);
3222 ExReleaseResourceLite(&Vcb->chunk_lock);
3223 Vcb->balance.status = Status;
3224 goto end;
3225 }
3226
3227 ExReleaseResourceLite(&Vcb->chunk_lock);
3228
3229 if (Status == STATUS_DISK_FULL) {
3230 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3231 if (!NT_SUCCESS(Status)) {
3232 ERR("try_consolidation returned %08lx\n", Status);
3233 Vcb->balance.status = Status;
3234 goto end;
3235 } else
3236 c->balance_num = Vcb->balance.balance_num;
3237
3238 consolidated = true;
3239
3240 if (Vcb->balance.stopping)
3241 goto end;
3242 }
3243 }
3244
3245 if (okay_system_chunks == 0) {
3246 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3247
3248 Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3249 if (NT_SUCCESS(Status))
3250 c->balance_num = Vcb->balance.balance_num;
3251 else if (Status != STATUS_DISK_FULL || consolidated) {
3252 ERR("alloc_chunk returned %08lx\n", Status);
3253 ExReleaseResourceLite(&Vcb->chunk_lock);
3254 Vcb->balance.status = Status;
3255 goto end;
3256 }
3257
3258 ExReleaseResourceLite(&Vcb->chunk_lock);
3259
3260 if (Status == STATUS_DISK_FULL) {
3261 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3262 if (!NT_SUCCESS(Status)) {
3263 ERR("try_consolidation returned %08lx\n", Status);
3264 Vcb->balance.status = Status;
3265 goto end;
3266 } else
3267 c->balance_num = Vcb->balance.balance_num;
3268
3269 consolidated = true;
3270
3271 if (Vcb->balance.stopping)
3272 goto end;
3273 }
3274 }
3275 }
3276
3277 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3278
3279 le = chunks.Flink;
3280 while (le != &chunks) {
3281 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3282
3283 c->reloc = true;
3284
3285 le = le->Flink;
3286 }
3287
3288 ExReleaseResourceLite(&Vcb->chunk_lock);
3289
3290 // do data chunks before metadata
3291 le = chunks.Flink;
3292 while (le != &chunks) {
3293 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3294 LIST_ENTRY* le2 = le->Flink;
3295
3296 if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3297 bool changed;
3298
3299 do {
3300 changed = false;
3301
3302 Status = balance_data_chunk(Vcb, c, &changed);
3303 if (!NT_SUCCESS(Status)) {
3304 ERR("balance_data_chunk returned %08lx\n", Status);
3305 Vcb->balance.status = Status;
3306 goto end;
3307 }
3308
3309 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3310
3311 if (Vcb->readonly)
3312 Vcb->balance.stopping = true;
3313
3314 if (Vcb->balance.stopping)
3315 break;
3316 } while (changed);
3317
3318 c->changed = true;
3319 c->space_changed = true;
3320 }
3321
3322 if (Vcb->balance.stopping)
3323 goto end;
3324
3325 if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3326 (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3327 RemoveEntryList(&c->list_entry_balance);
3328 c->list_entry_balance.Flink = NULL;
3329
3330 Vcb->balance.chunks_left--;
3331 }
3332
3333 le = le2;
3334 }
3335
3336 // do metadata chunks
3337 while (!IsListEmpty(&chunks)) {
3338 chunk* c;
3339 bool changed;
3340
3341 le = RemoveHeadList(&chunks);
3342 c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3343
3344 if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3345 do {
3346 Status = balance_metadata_chunk(Vcb, c, &changed);
3347 if (!NT_SUCCESS(Status)) {
3348 ERR("balance_metadata_chunk returned %08lx\n", Status);
3349 Vcb->balance.status = Status;
3350 goto end;
3351 }
3352
3353 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3354
3355 if (Vcb->readonly)
3356 Vcb->balance.stopping = true;
3357
3358 if (Vcb->balance.stopping)
3359 break;
3360 } while (changed);
3361
3362 c->changed = true;
3363 c->space_changed = true;
3364 }
3365
3366 if (Vcb->balance.stopping)
3367 break;
3368
3369 c->list_entry_balance.Flink = NULL;
3370
3371 Vcb->balance.chunks_left--;
3372 }
3373
3374 end:
3375 if (!Vcb->readonly) {
3376 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3377 le = chunks.Flink;
3378 while (le != &chunks) {
3379 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3380 c->reloc = false;
3381
3382 le = le->Flink;
3383 c->list_entry_balance.Flink = NULL;
3384 }
3385
3386 if (old_data_flags != 0)
3387 Vcb->data_flags = old_data_flags;
3388
3389 if (old_metadata_flags != 0)
3390 Vcb->metadata_flags = old_metadata_flags;
3391
3392 if (old_system_flags != 0)
3393 Vcb->system_flags = old_system_flags;
3394 }
3395
3396 if (Vcb->balance.removing) {
3397 device* dev = NULL;
3398
3399 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3400
3401 le = Vcb->devices.Flink;
3402 while (le != &Vcb->devices) {
3403 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3404
3405 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3406 dev = dev2;
3407 break;
3408 }
3409
3410 le = le->Flink;
3411 }
3412
3413 if (dev) {
3414 if (Vcb->balance.chunks_left == 0) {
3415 Status = finish_removing_device(Vcb, dev);
3416
3417 if (!NT_SUCCESS(Status)) {
3418 ERR("finish_removing_device returned %08lx\n", Status);
3419 dev->reloc = false;
3420 }
3421 } else
3422 dev->reloc = false;
3423 }
3424
3425 ExReleaseResourceLite(&Vcb->tree_lock);
3426 } else if (Vcb->balance.shrinking) {
3427 device* dev = NULL;
3428
3429 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3430
3431 le = Vcb->devices.Flink;
3432 while (le != &Vcb->devices) {
3433 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3434
3435 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3436 dev = dev2;
3437 break;
3438 }
3439
3440 le = le->Flink;
3441 }
3442
3443 if (!dev) {
3444 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3445 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3446 }
3447
3448 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3449 if (dev) {
3450 Status = regenerate_space_list(Vcb, dev);
3451 if (!NT_SUCCESS(Status))
3452 WARN("regenerate_space_list returned %08lx\n", Status);
3453 }
3454 } else {
3455 uint64_t old_size;
3456
3457 old_size = dev->devitem.num_bytes;
3458 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3459
3460 Status = update_dev_item(Vcb, dev, NULL);
3461 if (!NT_SUCCESS(Status)) {
3462 ERR("update_dev_item returned %08lx\n", Status);
3463 dev->devitem.num_bytes = old_size;
3464 Vcb->balance.status = Status;
3465
3466 Status = regenerate_space_list(Vcb, dev);
3467 if (!NT_SUCCESS(Status))
3468 WARN("regenerate_space_list returned %08lx\n", Status);
3469 } else {
3470 Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3471
3472 Status = do_write(Vcb, NULL);
3473 if (!NT_SUCCESS(Status))
3474 ERR("do_write returned %08lx\n", Status);
3475
3476 free_trees(Vcb);
3477 }
3478 }
3479
3480 ExReleaseResourceLite(&Vcb->tree_lock);
3481
3482 if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3483 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3484 } else {
3485 Status = remove_balance_item(Vcb);
3486 if (!NT_SUCCESS(Status)) {
3487 ERR("remove_balance_item returned %08lx\n", Status);
3488 goto end;
3489 }
3490 }
3491
3492 if (Vcb->trim && !Vcb->options.no_trim) {
3493 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3494
3495 le = Vcb->devices.Flink;
3496 while (le != &Vcb->devices) {
3497 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3498
3499 if (dev2->devobj && !dev2->readonly && dev2->trim)
3500 trim_unalloc_space(Vcb, dev2);
3501
3502 le = le->Flink;
3503 }
3504
3505 ExReleaseResourceLite(&Vcb->tree_lock);
3506 }
3507 }
3508
3509 ZwClose(Vcb->balance.thread);
3510 Vcb->balance.thread = NULL;
3511
3512 KeSetEvent(&Vcb->balance.finished, 0, false);
3513 }
3514
3515 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3516 NTSTATUS Status;
3517 btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3518 OBJECT_ATTRIBUTES oa;
3519 uint8_t i;
3520
3521 if (length < sizeof(btrfs_start_balance) || !data)
3522 return STATUS_INVALID_PARAMETER;
3523
3524 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3525 return STATUS_PRIVILEGE_NOT_HELD;
3526
3527 if (Vcb->locked) {
3528 WARN("cannot start balance while locked\n");
3529 return STATUS_DEVICE_NOT_READY;
3530 }
3531
3532 if (Vcb->scrub.thread) {
3533 WARN("cannot start balance while scrub running\n");
3534 return STATUS_DEVICE_NOT_READY;
3535 }
3536
3537 if (Vcb->balance.thread) {
3538 WARN("balance already running\n");
3539 return STATUS_DEVICE_NOT_READY;
3540 }
3541
3542 if (Vcb->readonly)
3543 return STATUS_MEDIA_WRITE_PROTECTED;
3544
3545 if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3546 !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3547 !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3548 return STATUS_SUCCESS;
3549
3550 for (i = 0; i < 3; i++) {
3551 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3552 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3553 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3554 BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE | BLOCK_FLAG_RAID1C3 |
3555 BLOCK_FLAG_RAID1C4;
3556
3557 if (bsb->opts[i].profiles == 0)
3558 return STATUS_INVALID_PARAMETER;
3559 }
3560
3561 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3562 if (bsb->opts[i].devid == 0)
3563 return STATUS_INVALID_PARAMETER;
3564 }
3565
3566 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3567 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3568 return STATUS_INVALID_PARAMETER;
3569 }
3570
3571 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3572 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3573 return STATUS_INVALID_PARAMETER;
3574 }
3575
3576 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3577 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3578 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3579
3580 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3581 return STATUS_INVALID_PARAMETER;
3582 }
3583
3584 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3585 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3586 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3587
3588 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3589 return STATUS_INVALID_PARAMETER;
3590 }
3591
3592 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3593 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3594 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3595
3596 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3597 return STATUS_INVALID_PARAMETER;
3598 }
3599
3600 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3601 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3602 bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3603 bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3604 bsb->opts[i].convert != BLOCK_FLAG_SINGLE && bsb->opts[i].convert != BLOCK_FLAG_RAID1C3 &&
3605 bsb->opts[i].convert != BLOCK_FLAG_RAID1C4)
3606 return STATUS_INVALID_PARAMETER;
3607 }
3608 }
3609 }
3610
3611 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3612 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3613 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3614
3615 Vcb->balance.paused = false;
3616 Vcb->balance.removing = false;
3617 Vcb->balance.shrinking = false;
3618 Vcb->balance.status = STATUS_SUCCESS;
3619 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3620
3621 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3622
3623 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3624 if (!NT_SUCCESS(Status)) {
3625 ERR("PsCreateSystemThread returned %08lx\n", Status);
3626 return Status;
3627 }
3628
3629 return STATUS_SUCCESS;
3630 }
3631
3632 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3633 KEY searchkey;
3634 traverse_ptr tp;
3635 NTSTATUS Status;
3636 BALANCE_ITEM* bi;
3637 OBJECT_ATTRIBUTES oa;
3638 int i;
3639
3640 searchkey.obj_id = BALANCE_ITEM_ID;
3641 searchkey.obj_type = TYPE_TEMP_ITEM;
3642 searchkey.offset = 0;
3643
3644 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3645 if (!NT_SUCCESS(Status)) {
3646 ERR("find_item returned %08lx\n", Status);
3647 return Status;
3648 }
3649
3650 if (keycmp(tp.item->key, searchkey)) {
3651 TRACE("no balance item found\n");
3652 return STATUS_NOT_FOUND;
3653 }
3654
3655 if (tp.item->size < sizeof(BALANCE_ITEM)) {
3656 WARN("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3657 tp.item->size, sizeof(BALANCE_ITEM));
3658 return STATUS_INTERNAL_ERROR;
3659 }
3660
3661 bi = (BALANCE_ITEM*)tp.item->data;
3662
3663 if (bi->flags & BALANCE_FLAGS_DATA)
3664 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3665
3666 if (bi->flags & BALANCE_FLAGS_METADATA)
3667 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3668
3669 if (bi->flags & BALANCE_FLAGS_SYSTEM)
3670 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3671
3672 // do the heuristics that Linux driver does
3673
3674 for (i = 0; i < 3; i++) {
3675 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3676 // if converting, don't redo chunks already done
3677
3678 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3679 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3680
3681 // don't balance chunks more than 90% filled - presumably these
3682 // have already been done
3683
3684 if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3685 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3686 ) {
3687 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3688 Vcb->balance.opts[i].usage_start = 0;
3689 Vcb->balance.opts[i].usage_end = 90;
3690 }
3691 }
3692 }
3693
3694 if (Vcb->readonly || Vcb->options.skip_balance)
3695 Vcb->balance.paused = true;
3696 else
3697 Vcb->balance.paused = false;
3698
3699 Vcb->balance.removing = false;
3700 Vcb->balance.shrinking = false;
3701 Vcb->balance.status = STATUS_SUCCESS;
3702 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3703
3704 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3705
3706 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3707 if (!NT_SUCCESS(Status)) {
3708 ERR("PsCreateSystemThread returned %08lx\n", Status);
3709 return Status;
3710 }
3711
3712 return STATUS_SUCCESS;
3713 }
3714
3715 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3716 btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3717
3718 if (length < sizeof(btrfs_query_balance) || !data)
3719 return STATUS_INVALID_PARAMETER;
3720
3721 if (!Vcb->balance.thread) {
3722 bqb->status = BTRFS_BALANCE_STOPPED;
3723
3724 if (!NT_SUCCESS(Vcb->balance.status)) {
3725 bqb->status |= BTRFS_BALANCE_ERROR;
3726 bqb->error = Vcb->balance.status;
3727 }
3728
3729 return STATUS_SUCCESS;
3730 }
3731
3732 bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3733
3734 if (Vcb->balance.removing)
3735 bqb->status |= BTRFS_BALANCE_REMOVAL;
3736
3737 if (Vcb->balance.shrinking)
3738 bqb->status |= BTRFS_BALANCE_SHRINKING;
3739
3740 if (!NT_SUCCESS(Vcb->balance.status))
3741 bqb->status |= BTRFS_BALANCE_ERROR;
3742
3743 bqb->chunks_left = Vcb->balance.chunks_left;
3744 bqb->total_chunks = Vcb->balance.total_chunks;
3745 bqb->error = Vcb->balance.status;
3746 RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3747 RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3748 RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3749
3750 return STATUS_SUCCESS;
3751 }
3752
3753 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3754 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3755 return STATUS_PRIVILEGE_NOT_HELD;
3756
3757 if (!Vcb->balance.thread)
3758 return STATUS_DEVICE_NOT_READY;
3759
3760 if (Vcb->balance.paused)
3761 return STATUS_DEVICE_NOT_READY;
3762
3763 Vcb->balance.paused = true;
3764 KeClearEvent(&Vcb->balance.event);
3765
3766 return STATUS_SUCCESS;
3767 }
3768
3769 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3770 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3771 return STATUS_PRIVILEGE_NOT_HELD;
3772
3773 if (!Vcb->balance.thread)
3774 return STATUS_DEVICE_NOT_READY;
3775
3776 if (!Vcb->balance.paused)
3777 return STATUS_DEVICE_NOT_READY;
3778
3779 if (Vcb->readonly)
3780 return STATUS_MEDIA_WRITE_PROTECTED;
3781
3782 Vcb->balance.paused = false;
3783 KeSetEvent(&Vcb->balance.event, 0, false);
3784
3785 return STATUS_SUCCESS;
3786 }
3787
3788 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3789 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3790 return STATUS_PRIVILEGE_NOT_HELD;
3791
3792 if (!Vcb->balance.thread)
3793 return STATUS_DEVICE_NOT_READY;
3794
3795 Vcb->balance.paused = false;
3796 Vcb->balance.stopping = true;
3797 Vcb->balance.status = STATUS_SUCCESS;
3798 KeSetEvent(&Vcb->balance.event, 0, false);
3799
3800 return STATUS_SUCCESS;
3801 }
3802
3803 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3804 uint64_t devid;
3805 LIST_ENTRY* le;
3806 device* dev = NULL;
3807 NTSTATUS Status;
3808 int i;
3809 uint64_t num_rw_devices;
3810 OBJECT_ATTRIBUTES oa;
3811
3812 TRACE("(%p, %p, %lx)\n", Vcb, data, length);
3813
3814 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3815 return STATUS_PRIVILEGE_NOT_HELD;
3816
3817 if (length < sizeof(uint64_t))
3818 return STATUS_INVALID_PARAMETER;
3819
3820 devid = *(uint64_t*)data;
3821
3822 ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3823
3824 if (Vcb->readonly) {
3825 ExReleaseResourceLite(&Vcb->tree_lock);
3826 return STATUS_MEDIA_WRITE_PROTECTED;
3827 }
3828
3829 num_rw_devices = 0;
3830
3831 le = Vcb->devices.Flink;
3832 while (le != &Vcb->devices) {
3833 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3834
3835 if (dev2->devitem.dev_id == devid)
3836 dev = dev2;
3837
3838 if (!dev2->readonly)
3839 num_rw_devices++;
3840
3841 le = le->Flink;
3842 }
3843
3844 if (!dev) {
3845 ExReleaseResourceLite(&Vcb->tree_lock);
3846 WARN("device %I64x not found\n", devid);
3847 return STATUS_NOT_FOUND;
3848 }
3849
3850 if (!dev->readonly) {
3851 if (num_rw_devices == 1) {
3852 ExReleaseResourceLite(&Vcb->tree_lock);
3853 WARN("not removing last non-readonly device\n");
3854 return STATUS_INVALID_PARAMETER;
3855 }
3856
3857 if (num_rw_devices == 4 &&
3858 ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3859 (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6) ||
3860 (Vcb->data_flags & BLOCK_FLAG_RAID1C4 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C4 || Vcb->system_flags & BLOCK_FLAG_RAID1C4)
3861 )
3862 ) {
3863 ExReleaseResourceLite(&Vcb->tree_lock);
3864 ERR("would not be enough devices to satisfy RAID requirement (RAID6/10/1C4)\n");
3865 return STATUS_CANNOT_DELETE;
3866 }
3867
3868 if (num_rw_devices == 3 &&
3869 ((Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5) ||
3870 (Vcb->data_flags & BLOCK_FLAG_RAID1C3 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C3 || Vcb->system_flags & BLOCK_FLAG_RAID1C3))
3871 ) {
3872 ExReleaseResourceLite(&Vcb->tree_lock);
3873 ERR("would not be enough devices to satisfy RAID requirement (RAID5/1C3)\n");
3874 return STATUS_CANNOT_DELETE;
3875 }
3876
3877 if (num_rw_devices == 2 &&
3878 ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3879 (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3880 ) {
3881 ExReleaseResourceLite(&Vcb->tree_lock);
3882 ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3883 return STATUS_CANNOT_DELETE;
3884 }
3885 }
3886
3887 ExReleaseResourceLite(&Vcb->tree_lock);
3888
3889 if (Vcb->balance.thread) {
3890 WARN("balance already running\n");
3891 return STATUS_DEVICE_NOT_READY;
3892 }
3893
3894 dev->reloc = true;
3895
3896 RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3897
3898 for (i = 0; i < 3; i++) {
3899 Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3900 Vcb->balance.opts[i].devid = devid;
3901 }
3902
3903 Vcb->balance.paused = false;
3904 Vcb->balance.removing = true;
3905 Vcb->balance.shrinking = false;
3906 Vcb->balance.status = STATUS_SUCCESS;
3907 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3908
3909 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3910
3911 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3912 if (!NT_SUCCESS(Status)) {
3913 ERR("PsCreateSystemThread returned %08lx\n", Status);
3914 dev->reloc = false;
3915 return Status;
3916 }
3917
3918 return STATUS_SUCCESS;
3919 }