1a2c6a2dc6a9eb81a03c6c3181733306f513c68a
[reactos.git] / drivers / filesystems / btrfs / balance.c
1 /* Copyright (c) Mark Harmstone 2016-17
2 *
3 * This file is part of WinBtrfs.
4 *
5 * WinBtrfs is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public Licence as published by
7 * the Free Software Foundation, either version 3 of the Licence, or
8 * (at your option) any later version.
9 *
10 * WinBtrfs is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public Licence for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public Licence
16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21
22 typedef struct {
23 uint64_t address;
24 uint64_t new_address;
25 tree_header* data;
26 EXTENT_ITEM* ei;
27 tree* t;
28 bool system;
29 LIST_ENTRY refs;
30 LIST_ENTRY list_entry;
31 } metadata_reloc;
32
33 typedef struct {
34 uint8_t type;
35 uint64_t hash;
36
37 union {
38 TREE_BLOCK_REF tbr;
39 SHARED_BLOCK_REF sbr;
40 };
41
42 metadata_reloc* parent;
43 bool top;
44 LIST_ENTRY list_entry;
45 } metadata_reloc_ref;
46
47 typedef struct {
48 uint64_t address;
49 uint64_t size;
50 uint64_t new_address;
51 chunk* newchunk;
52 EXTENT_ITEM* ei;
53 LIST_ENTRY refs;
54 LIST_ENTRY list_entry;
55 } data_reloc;
56
57 typedef struct {
58 uint8_t type;
59 uint64_t hash;
60
61 union {
62 EXTENT_DATA_REF edr;
63 SHARED_DATA_REF sdr;
64 };
65
66 metadata_reloc* parent;
67 LIST_ENTRY list_entry;
68 } data_reloc_ref;
69
70 #ifndef _MSC_VER // not in mingw yet
71 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
72 #endif
73
74 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
75
76 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
77 bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
78 NTSTATUS Status;
79 metadata_reloc* mr;
80 EXTENT_ITEM* ei;
81 uint16_t len;
82 uint64_t inline_rc;
83 uint8_t* ptr;
84
85 mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
86 if (!mr) {
87 ERR("out of memory\n");
88 return STATUS_INSUFFICIENT_RESOURCES;
89 }
90
91 mr->address = tp->item->key.obj_id;
92 mr->data = NULL;
93 mr->ei = (EXTENT_ITEM*)tp->item->data;
94 mr->system = false;
95 InitializeListHead(&mr->refs);
96
97 Status = delete_tree_item(Vcb, tp);
98 if (!NT_SUCCESS(Status)) {
99 ERR("delete_tree_item returned %08x\n", Status);
100 ExFreePool(mr);
101 return Status;
102 }
103
104 if (!c)
105 c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
106
107 if (c) {
108 acquire_chunk_lock(c, Vcb);
109
110 c->used -= Vcb->superblock.node_size;
111
112 space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
113
114 release_chunk_lock(c, Vcb);
115 }
116
117 ei = (EXTENT_ITEM*)tp->item->data;
118 inline_rc = 0;
119
120 len = tp->item->size - sizeof(EXTENT_ITEM);
121 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
122 if (!skinny) {
123 len -= sizeof(EXTENT_ITEM2);
124 ptr += sizeof(EXTENT_ITEM2);
125 }
126
127 while (len > 0) {
128 uint8_t secttype = *ptr;
129 uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
130 metadata_reloc_ref* ref;
131
132 len--;
133
134 if (sectlen > len) {
135 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
136 return STATUS_INTERNAL_ERROR;
137 }
138
139 if (sectlen == 0) {
140 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
141 return STATUS_INTERNAL_ERROR;
142 }
143
144 ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
145 if (!ref) {
146 ERR("out of memory\n");
147 return STATUS_INSUFFICIENT_RESOURCES;
148 }
149
150 if (secttype == TYPE_TREE_BLOCK_REF) {
151 ref->type = TYPE_TREE_BLOCK_REF;
152 RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
153 inline_rc++;
154 } else if (secttype == TYPE_SHARED_BLOCK_REF) {
155 ref->type = TYPE_SHARED_BLOCK_REF;
156 RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
157 inline_rc++;
158 } else {
159 ERR("unexpected tree type %x\n", secttype);
160 ExFreePool(ref);
161 return STATUS_INTERNAL_ERROR;
162 }
163
164 ref->parent = NULL;
165 ref->top = false;
166 InsertTailList(&mr->refs, &ref->list_entry);
167
168 len -= sectlen;
169 ptr += sizeof(uint8_t) + sectlen;
170 }
171
172 if (inline_rc < ei->refcount) { // look for non-inline entries
173 traverse_ptr tp2 = *tp, next_tp;
174
175 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
176 tp2 = next_tp;
177
178 if (tp2.item->key.obj_id == tp->item->key.obj_id) {
179 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
180 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
181 if (!ref) {
182 ERR("out of memory\n");
183 return STATUS_INSUFFICIENT_RESOURCES;
184 }
185
186 ref->type = TYPE_TREE_BLOCK_REF;
187 ref->tbr.offset = tp2.item->key.offset;
188 ref->parent = NULL;
189 ref->top = false;
190 InsertTailList(&mr->refs, &ref->list_entry);
191
192 Status = delete_tree_item(Vcb, &tp2);
193 if (!NT_SUCCESS(Status)) {
194 ERR("delete_tree_item returned %08x\n", Status);
195 return Status;
196 }
197 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
198 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
199 if (!ref) {
200 ERR("out of memory\n");
201 return STATUS_INSUFFICIENT_RESOURCES;
202 }
203
204 ref->type = TYPE_SHARED_BLOCK_REF;
205 ref->sbr.offset = tp2.item->key.offset;
206 ref->parent = NULL;
207 ref->top = false;
208 InsertTailList(&mr->refs, &ref->list_entry);
209
210 Status = delete_tree_item(Vcb, &tp2);
211 if (!NT_SUCCESS(Status)) {
212 ERR("delete_tree_item returned %08x\n", Status);
213 return Status;
214 }
215 }
216 } else
217 break;
218 }
219 }
220
221 InsertTailList(items, &mr->list_entry);
222
223 if (mr2)
224 *mr2 = mr;
225
226 return STATUS_SUCCESS;
227 }
228
229 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
230 uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
231 LIST_ENTRY* le;
232 KEY searchkey;
233 traverse_ptr tp;
234 bool skinny = false;
235 NTSTATUS Status;
236
237 le = items->Flink;
238 while (le != items) {
239 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
240
241 if (mr->address == address) {
242 *mr2 = mr;
243 return STATUS_SUCCESS;
244 }
245
246 le = le->Flink;
247 }
248
249 searchkey.obj_id = address;
250 searchkey.obj_type = TYPE_METADATA_ITEM;
251 searchkey.offset = 0xffffffffffffffff;
252
253 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
254 if (!NT_SUCCESS(Status)) {
255 ERR("find_item returned %08x\n", Status);
256 return Status;
257 }
258
259 if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
260 skinny = true;
261 else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
262 tp.item->size >= sizeof(EXTENT_ITEM)) {
263 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
264
265 if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
266 ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
267 return STATUS_INTERNAL_ERROR;
268 }
269 } else {
270 ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
271 return STATUS_INTERNAL_ERROR;
272 }
273
274 Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
275 if (!NT_SUCCESS(Status)) {
276 ERR("add_metadata_reloc returned %08x\n", Status);
277 return Status;
278 }
279
280 return STATUS_SUCCESS;
281 }
282
283 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
284 LIST_ENTRY newlist, *le;
285
286 if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
287 return;
288
289 // insertion sort
290
291 InitializeListHead(&newlist);
292
293 while (!IsListEmpty(&mr->refs)) {
294 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
295 bool inserted = false;
296
297 if (ref->type == TYPE_TREE_BLOCK_REF)
298 ref->hash = ref->tbr.offset;
299 else if (ref->type == TYPE_SHARED_BLOCK_REF)
300 ref->hash = ref->parent->new_address;
301
302 le = newlist.Flink;
303 while (le != &newlist) {
304 metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
305
306 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
307 InsertHeadList(le->Blink, &ref->list_entry);
308 inserted = true;
309 break;
310 }
311
312 le = le->Flink;
313 }
314
315 if (!inserted)
316 InsertTailList(&newlist, &ref->list_entry);
317 }
318
319 newlist.Flink->Blink = &mr->refs;
320 newlist.Blink->Flink = &mr->refs;
321 mr->refs.Flink = newlist.Flink;
322 mr->refs.Blink = newlist.Blink;
323 }
324
325 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
326 NTSTATUS Status;
327 LIST_ENTRY* le;
328 uint64_t rc = 0;
329 uint16_t inline_len;
330 bool all_inline = true;
331 metadata_reloc_ref* first_noninline = NULL;
332 EXTENT_ITEM* ei;
333 uint8_t* ptr;
334
335 inline_len = sizeof(EXTENT_ITEM);
336 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
337 inline_len += sizeof(EXTENT_ITEM2);
338
339 sort_metadata_reloc_refs(mr);
340
341 le = mr->refs.Flink;
342 while (le != &mr->refs) {
343 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
344 uint16_t extlen = 0;
345
346 rc++;
347
348 if (ref->type == TYPE_TREE_BLOCK_REF)
349 extlen += sizeof(TREE_BLOCK_REF);
350 else if (ref->type == TYPE_SHARED_BLOCK_REF)
351 extlen += sizeof(SHARED_BLOCK_REF);
352
353 if (all_inline) {
354 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
355 all_inline = false;
356 first_noninline = ref;
357 } else
358 inline_len += extlen + 1;
359 }
360
361 le = le->Flink;
362 }
363
364 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
365 if (!ei) {
366 ERR("out of memory\n");
367 return STATUS_INSUFFICIENT_RESOURCES;
368 }
369
370 ei->refcount = rc;
371 ei->generation = mr->ei->generation;
372 ei->flags = mr->ei->flags;
373 ptr = (uint8_t*)&ei[1];
374
375 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
376 EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
377
378 ei2->firstitem = *(KEY*)&mr->data[1];
379 ei2->level = mr->data->level;
380
381 ptr += sizeof(EXTENT_ITEM2);
382 }
383
384 le = mr->refs.Flink;
385 while (le != &mr->refs) {
386 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
387
388 if (ref == first_noninline)
389 break;
390
391 *ptr = ref->type;
392 ptr++;
393
394 if (ref->type == TYPE_TREE_BLOCK_REF) {
395 TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
396
397 tbr->offset = ref->tbr.offset;
398
399 ptr += sizeof(TREE_BLOCK_REF);
400 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
401 SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
402
403 sbr->offset = ref->parent->new_address;
404
405 ptr += sizeof(SHARED_BLOCK_REF);
406 }
407
408 le = le->Flink;
409 }
410
411 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
412 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
413 else
414 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
415
416 if (!NT_SUCCESS(Status)) {
417 ERR("insert_tree_item returned %08x\n", Status);
418 ExFreePool(ei);
419 return Status;
420 }
421
422 if (!all_inline) {
423 le = &first_noninline->list_entry;
424
425 while (le != &mr->refs) {
426 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
427
428 if (ref->type == TYPE_TREE_BLOCK_REF) {
429 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
430 if (!NT_SUCCESS(Status)) {
431 ERR("insert_tree_item returned %08x\n", Status);
432 return Status;
433 }
434 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
435 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
436 if (!NT_SUCCESS(Status)) {
437 ERR("insert_tree_item returned %08x\n", Status);
438 return Status;
439 }
440 }
441
442 le = le->Flink;
443 }
444 }
445
446 if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
447 if (mr->data->level > 0) {
448 uint16_t i;
449 internal_node* in = (internal_node*)&mr->data[1];
450
451 for (i = 0; i < mr->data->num_items; i++) {
452 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
453
454 if (sbrrc > 0) {
455 SHARED_BLOCK_REF sbr;
456
457 sbr.offset = mr->new_address;
458
459 Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
460 if (!NT_SUCCESS(Status)) {
461 ERR("increase_extent_refcount returned %08x\n", Status);
462 return Status;
463 }
464
465 sbr.offset = mr->address;
466
467 Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
468 sbr.offset, false, NULL);
469 if (!NT_SUCCESS(Status)) {
470 ERR("decrease_extent_refcount returned %08x\n", Status);
471 return Status;
472 }
473 }
474 }
475 } else {
476 uint16_t i;
477 leaf_node* ln = (leaf_node*)&mr->data[1];
478
479 for (i = 0; i < mr->data->num_items; i++) {
480 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
481 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
482
483 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
484 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
485
486 if (ed2->size > 0) { // not sparse
487 uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
488
489 if (sdrrc > 0) {
490 SHARED_DATA_REF sdr;
491 chunk* c;
492
493 sdr.offset = mr->new_address;
494 sdr.count = sdrrc;
495
496 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
497 if (!NT_SUCCESS(Status)) {
498 ERR("increase_extent_refcount returned %08x\n", Status);
499 return Status;
500 }
501
502 sdr.offset = mr->address;
503
504 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
505 sdr.offset, false, NULL);
506 if (!NT_SUCCESS(Status)) {
507 ERR("decrease_extent_refcount returned %08x\n", Status);
508 return Status;
509 }
510
511 c = get_chunk_from_address(Vcb, ed2->address);
512
513 if (c) {
514 // check changed_extents
515
516 ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
517
518 le = c->changed_extents.Flink;
519
520 while (le != &c->changed_extents) {
521 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
522
523 if (ce->address == ed2->address) {
524 LIST_ENTRY* le2;
525
526 le2 = ce->refs.Flink;
527 while (le2 != &ce->refs) {
528 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
529
530 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
531 cer->sdr.offset = mr->new_address;
532 break;
533 }
534
535 le2 = le2->Flink;
536 }
537
538 le2 = ce->old_refs.Flink;
539 while (le2 != &ce->old_refs) {
540 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
541
542 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
543 cer->sdr.offset = mr->new_address;
544 break;
545 }
546
547 le2 = le2->Flink;
548 }
549
550 break;
551 }
552
553 le = le->Flink;
554 }
555
556 ExReleaseResourceLite(&c->changed_extents_lock);
557 }
558 }
559 }
560 }
561 }
562 }
563 }
564 }
565
566 return STATUS_SUCCESS;
567 }
568
569 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
570 LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
571 LIST_ENTRY tree_writes, *le;
572 NTSTATUS Status;
573 traverse_ptr tp;
574 uint8_t level, max_level = 0;
575 chunk* newchunk = NULL;
576
577 InitializeListHead(&tree_writes);
578
579 le = items->Flink;
580 while (le != items) {
581 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
582 LIST_ENTRY* le2;
583 chunk* pc;
584
585 mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
586 if (!mr->data) {
587 ERR("out of memory\n");
588 return STATUS_INSUFFICIENT_RESOURCES;
589 }
590
591 Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
592 c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
593 if (!NT_SUCCESS(Status)) {
594 ERR("read_data returned %08x\n", Status);
595 return Status;
596 }
597
598 if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
599 mr->system = true;
600
601 if (data_items && mr->data->level == 0) {
602 le2 = data_items->Flink;
603 while (le2 != data_items) {
604 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
605 leaf_node* ln = (leaf_node*)&mr->data[1];
606 uint16_t i;
607
608 for (i = 0; i < mr->data->num_items; i++) {
609 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
610 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
611
612 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
613 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
614
615 if (ed2->address == dr->address)
616 ed2->address = dr->new_address;
617 }
618 }
619 }
620
621 le2 = le2->Flink;
622 }
623 }
624
625 if (mr->data->level > max_level)
626 max_level = mr->data->level;
627
628 le2 = mr->refs.Flink;
629 while (le2 != &mr->refs) {
630 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
631
632 if (ref->type == TYPE_TREE_BLOCK_REF) {
633 KEY* firstitem;
634 root* r = NULL;
635 LIST_ENTRY* le3;
636 tree* t;
637
638 firstitem = (KEY*)&mr->data[1];
639
640 le3 = Vcb->roots.Flink;
641 while (le3 != &Vcb->roots) {
642 root* r2 = CONTAINING_RECORD(le3, root, list_entry);
643
644 if (r2->id == ref->tbr.offset) {
645 r = r2;
646 break;
647 }
648
649 le3 = le3->Flink;
650 }
651
652 if (!r) {
653 ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
654 return STATUS_INTERNAL_ERROR;
655 }
656
657 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
658 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
659 ERR("find_item_to_level returned %08x\n", Status);
660 return Status;
661 }
662
663 t = tp.tree;
664 while (t && t->header.level < mr->data->level + 1) {
665 t = t->parent;
666 }
667
668 if (!t)
669 ref->top = true;
670 else {
671 metadata_reloc* mr2;
672
673 Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
674 if (!NT_SUCCESS(Status)) {
675 ERR("add_metadata_reloc_parent returned %08x\n", Status);
676 return Status;
677 }
678
679 ref->parent = mr2;
680 }
681 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
682 metadata_reloc* mr2;
683
684 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
685 if (!NT_SUCCESS(Status)) {
686 ERR("add_metadata_reloc_parent returned %08x\n", Status);
687 return Status;
688 }
689
690 ref->parent = mr2;
691 }
692
693 le2 = le2->Flink;
694 }
695
696 le = le->Flink;
697 }
698
699 le = items->Flink;
700 while (le != items) {
701 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
702 LIST_ENTRY* le2;
703 uint32_t hash;
704
705 mr->t = NULL;
706
707 hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
708
709 le2 = Vcb->trees_ptrs[hash >> 24];
710
711 if (le2) {
712 while (le2 != &Vcb->trees_hash) {
713 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
714
715 if (t->header.address == mr->address) {
716 mr->t = t;
717 break;
718 } else if (t->hash > hash)
719 break;
720
721 le2 = le2->Flink;
722 }
723 }
724
725 le = le->Flink;
726 }
727
728 for (level = 0; level <= max_level; level++) {
729 le = items->Flink;
730 while (le != items) {
731 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
732
733 if (mr->data->level == level) {
734 bool done = false;
735 LIST_ENTRY* le2;
736 tree_write* tw;
737 uint64_t flags;
738 tree* t3;
739
740 if (mr->system)
741 flags = Vcb->system_flags;
742 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
743 flags = Vcb->data_flags;
744 else
745 flags = Vcb->metadata_flags;
746
747 if (newchunk) {
748 acquire_chunk_lock(newchunk, Vcb);
749
750 if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
751 newchunk->used += Vcb->superblock.node_size;
752 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
753 done = true;
754 }
755
756 release_chunk_lock(newchunk, Vcb);
757 }
758
759 if (!done) {
760 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
761
762 le2 = Vcb->chunks.Flink;
763 while (le2 != &Vcb->chunks) {
764 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
765
766 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
767 acquire_chunk_lock(c2, Vcb);
768
769 if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
770 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
771 c2->used += Vcb->superblock.node_size;
772 space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback);
773 release_chunk_lock(c2, Vcb);
774 newchunk = c2;
775 done = true;
776 break;
777 }
778 }
779
780 release_chunk_lock(c2, Vcb);
781 }
782
783 le2 = le2->Flink;
784 }
785
786 // allocate new chunk if necessary
787 if (!done) {
788 Status = alloc_chunk(Vcb, flags, &newchunk, false);
789
790 if (!NT_SUCCESS(Status)) {
791 ERR("alloc_chunk returned %08x\n", Status);
792 ExReleaseResourceLite(&Vcb->chunk_lock);
793 goto end;
794 }
795
796 acquire_chunk_lock(newchunk, Vcb);
797
798 newchunk->balance_num = Vcb->balance.balance_num;
799
800 if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
801 release_chunk_lock(newchunk, Vcb);
802 ExReleaseResourceLite(&Vcb->chunk_lock);
803 ERR("could not find address in new chunk\n");
804 Status = STATUS_DISK_FULL;
805 goto end;
806 } else {
807 newchunk->used += Vcb->superblock.node_size;
808 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
809 }
810
811 release_chunk_lock(newchunk, Vcb);
812 }
813
814 ExReleaseResourceLite(&Vcb->chunk_lock);
815 }
816
817 // update parents
818 le2 = mr->refs.Flink;
819 while (le2 != &mr->refs) {
820 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
821
822 if (ref->parent) {
823 uint16_t i;
824 internal_node* in = (internal_node*)&ref->parent->data[1];
825
826 for (i = 0; i < ref->parent->data->num_items; i++) {
827 if (in[i].address == mr->address) {
828 in[i].address = mr->new_address;
829 break;
830 }
831 }
832
833 if (ref->parent->t) {
834 LIST_ENTRY* le3;
835
836 le3 = ref->parent->t->itemlist.Flink;
837 while (le3 != &ref->parent->t->itemlist) {
838 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
839
840 if (!td->inserted && td->treeholder.address == mr->address)
841 td->treeholder.address = mr->new_address;
842
843 le3 = le3->Flink;
844 }
845 }
846 } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
847 LIST_ENTRY* le3;
848 root* r = NULL;
849
850 // alter ROOT_ITEM
851
852 le3 = Vcb->roots.Flink;
853 while (le3 != &Vcb->roots) {
854 root* r2 = CONTAINING_RECORD(le3, root, list_entry);
855
856 if (r2->id == ref->tbr.offset) {
857 r = r2;
858 break;
859 }
860
861 le3 = le3->Flink;
862 }
863
864 if (r) {
865 r->treeholder.address = mr->new_address;
866
867 if (r == Vcb->root_root)
868 Vcb->superblock.root_tree_addr = mr->new_address;
869 else if (r == Vcb->chunk_root)
870 Vcb->superblock.chunk_tree_addr = mr->new_address;
871 else if (r->root_item.block_number == mr->address) {
872 KEY searchkey;
873 ROOT_ITEM* ri;
874
875 r->root_item.block_number = mr->new_address;
876
877 searchkey.obj_id = r->id;
878 searchkey.obj_type = TYPE_ROOT_ITEM;
879 searchkey.offset = 0xffffffffffffffff;
880
881 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
882 if (!NT_SUCCESS(Status)) {
883 ERR("find_item returned %08x\n", Status);
884 goto end;
885 }
886
887 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
888 ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
889 Status = STATUS_INTERNAL_ERROR;
890 goto end;
891 }
892
893 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
894 if (!ri) {
895 ERR("out of memory\n");
896 Status = STATUS_INSUFFICIENT_RESOURCES;
897 goto end;
898 }
899
900 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
901
902 Status = delete_tree_item(Vcb, &tp);
903 if (!NT_SUCCESS(Status)) {
904 ERR("delete_tree_item returned %08x\n", Status);
905 goto end;
906 }
907
908 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
909 if (!NT_SUCCESS(Status)) {
910 ERR("insert_tree_item returned %08x\n", Status);
911 goto end;
912 }
913 }
914 }
915 }
916
917 le2 = le2->Flink;
918 }
919
920 mr->data->address = mr->new_address;
921
922 t3 = mr->t;
923
924 while (t3) {
925 uint8_t h;
926 bool inserted;
927 tree* t4 = NULL;
928
929 // check if tree loaded more than once
930 if (t3->list_entry.Flink != &Vcb->trees_hash) {
931 tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
932
933 if (nt->header.address == t3->header.address)
934 t4 = nt;
935 }
936
937 t3->header.address = mr->new_address;
938
939 h = t3->hash >> 24;
940
941 if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
942 if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
943 Vcb->trees_ptrs[h] = NULL;
944 else {
945 tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
946
947 if (t2->hash >> 24 == h)
948 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
949 else
950 Vcb->trees_ptrs[h] = NULL;
951 }
952 }
953
954 RemoveEntryList(&t3->list_entry_hash);
955
956 t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
957 h = t3->hash >> 24;
958
959 if (!Vcb->trees_ptrs[h]) {
960 uint8_t h2 = h;
961
962 le2 = Vcb->trees_hash.Flink;
963
964 if (h2 > 0) {
965 h2--;
966 do {
967 if (Vcb->trees_ptrs[h2]) {
968 le2 = Vcb->trees_ptrs[h2];
969 break;
970 }
971
972 h2--;
973 } while (h2 > 0);
974 }
975 } else
976 le2 = Vcb->trees_ptrs[h];
977
978 inserted = false;
979 while (le2 != &Vcb->trees_hash) {
980 tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
981
982 if (t2->hash >= t3->hash) {
983 InsertHeadList(le2->Blink, &t3->list_entry_hash);
984 inserted = true;
985 break;
986 }
987
988 le2 = le2->Flink;
989 }
990
991 if (!inserted)
992 InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
993
994 if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
995 Vcb->trees_ptrs[h] = &t3->list_entry_hash;
996
997 if (data_items && level == 0) {
998 le2 = data_items->Flink;
999
1000 while (le2 != data_items) {
1001 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1002 LIST_ENTRY* le3 = t3->itemlist.Flink;
1003
1004 while (le3 != &t3->itemlist) {
1005 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1006
1007 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1008 EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1009
1010 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1011 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1012
1013 if (ed2->address == dr->address)
1014 ed2->address = dr->new_address;
1015 }
1016 }
1017
1018 le3 = le3->Flink;
1019 }
1020
1021 le2 = le2->Flink;
1022 }
1023 }
1024
1025 t3 = t4;
1026 }
1027
1028 *((uint32_t*)mr->data) = ~calc_crc32c(0xffffffff, (uint8_t*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1029
1030 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1031 if (!tw) {
1032 ERR("out of memory\n");
1033 Status = STATUS_INSUFFICIENT_RESOURCES;
1034 goto end;
1035 }
1036
1037 tw->address = mr->new_address;
1038 tw->length = Vcb->superblock.node_size;
1039 tw->data = (uint8_t*)mr->data;
1040 tw->allocated = false;
1041
1042 if (IsListEmpty(&tree_writes))
1043 InsertTailList(&tree_writes, &tw->list_entry);
1044 else {
1045 bool inserted = false;
1046
1047 le2 = tree_writes.Flink;
1048 while (le2 != &tree_writes) {
1049 tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1050
1051 if (tw2->address > tw->address) {
1052 InsertHeadList(le2->Blink, &tw->list_entry);
1053 inserted = true;
1054 break;
1055 }
1056
1057 le2 = le2->Flink;
1058 }
1059
1060 if (!inserted)
1061 InsertTailList(&tree_writes, &tw->list_entry);
1062 }
1063 }
1064
1065 le = le->Flink;
1066 }
1067 }
1068
1069 Status = do_tree_writes(Vcb, &tree_writes, true);
1070 if (!NT_SUCCESS(Status)) {
1071 ERR("do_tree_writes returned %08x\n", Status);
1072 goto end;
1073 }
1074
1075 le = items->Flink;
1076 while (le != items) {
1077 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1078
1079 Status = add_metadata_reloc_extent_item(Vcb, mr);
1080 if (!NT_SUCCESS(Status)) {
1081 ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1082 goto end;
1083 }
1084
1085 le = le->Flink;
1086 }
1087
1088 Status = STATUS_SUCCESS;
1089
1090 end:
1091 while (!IsListEmpty(&tree_writes)) {
1092 tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1093
1094 if (tw->allocated)
1095 ExFreePool(tw->data);
1096
1097 ExFreePool(tw);
1098 }
1099
1100 return Status;
1101 }
1102
1103 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1104 KEY searchkey;
1105 traverse_ptr tp;
1106 NTSTATUS Status;
1107 bool b;
1108 LIST_ENTRY items, rollback;
1109 uint32_t loaded = 0;
1110
1111 TRACE("chunk %I64x\n", c->offset);
1112
1113 InitializeListHead(&rollback);
1114 InitializeListHead(&items);
1115
1116 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1117
1118 searchkey.obj_id = c->offset;
1119 searchkey.obj_type = TYPE_METADATA_ITEM;
1120 searchkey.offset = 0xffffffffffffffff;
1121
1122 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1123 if (!NT_SUCCESS(Status)) {
1124 ERR("find_item returned %08x\n", Status);
1125 goto end;
1126 }
1127
1128 do {
1129 traverse_ptr next_tp;
1130
1131 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1132 break;
1133
1134 if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1135 bool tree = false, skinny = false;
1136
1137 if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1138 tree = true;
1139 skinny = true;
1140 } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1141 tp.item->size >= sizeof(EXTENT_ITEM)) {
1142 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1143
1144 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1145 tree = true;
1146 }
1147
1148 if (tree) {
1149 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1150
1151 if (!NT_SUCCESS(Status)) {
1152 ERR("add_metadata_reloc returned %08x\n", Status);
1153 goto end;
1154 }
1155
1156 loaded++;
1157
1158 if (loaded >= 64) // only do 64 at a time
1159 break;
1160 }
1161 }
1162
1163 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1164
1165 if (b)
1166 tp = next_tp;
1167 } while (b);
1168
1169 if (IsListEmpty(&items)) {
1170 *changed = false;
1171 Status = STATUS_SUCCESS;
1172 goto end;
1173 } else
1174 *changed = true;
1175
1176 Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1177 if (!NT_SUCCESS(Status)) {
1178 ERR("write_metadata_items returned %08x\n", Status);
1179 goto end;
1180 }
1181
1182 Status = STATUS_SUCCESS;
1183
1184 Vcb->need_write = true;
1185
1186 end:
1187 if (NT_SUCCESS(Status)) {
1188 Status = do_write(Vcb, NULL);
1189 if (!NT_SUCCESS(Status))
1190 ERR("do_write returned %08x\n", Status);
1191 }
1192
1193 if (NT_SUCCESS(Status))
1194 clear_rollback(&rollback);
1195 else
1196 do_rollback(Vcb, &rollback);
1197
1198 free_trees(Vcb);
1199
1200 ExReleaseResourceLite(&Vcb->tree_lock);
1201
1202 while (!IsListEmpty(&items)) {
1203 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1204
1205 while (!IsListEmpty(&mr->refs)) {
1206 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1207
1208 ExFreePool(ref);
1209 }
1210
1211 if (mr->data)
1212 ExFreePool(mr->data);
1213
1214 ExFreePool(mr);
1215 }
1216
1217 return Status;
1218 }
1219
1220 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1221 data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1222 NTSTATUS Status;
1223 LIST_ENTRY* le;
1224 KEY searchkey;
1225 traverse_ptr tp;
1226 root* r = NULL;
1227 metadata_reloc* mr;
1228 uint64_t last_tree = 0;
1229 data_reloc_ref* ref;
1230
1231 le = Vcb->roots.Flink;
1232 while (le != &Vcb->roots) {
1233 root* r2 = CONTAINING_RECORD(le, root, list_entry);
1234
1235 if (r2->id == edr->root) {
1236 r = r2;
1237 break;
1238 }
1239
1240 le = le->Flink;
1241 }
1242
1243 if (!r) {
1244 ERR("could not find subvol %I64x\n", edr->count);
1245 return STATUS_INTERNAL_ERROR;
1246 }
1247
1248 searchkey.obj_id = edr->objid;
1249 searchkey.obj_type = TYPE_EXTENT_DATA;
1250 searchkey.offset = 0;
1251
1252 Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1253 if (!NT_SUCCESS(Status)) {
1254 ERR("find_item returned %08x\n", Status);
1255 return Status;
1256 }
1257
1258 if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1259 traverse_ptr tp2;
1260
1261 if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1262 tp = tp2;
1263 else {
1264 ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1265 return STATUS_INTERNAL_ERROR;
1266 }
1267 }
1268
1269 ref = NULL;
1270
1271 while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1272 traverse_ptr tp2;
1273
1274 if (tp.item->size >= sizeof(EXTENT_DATA)) {
1275 EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1276
1277 if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1278 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1279
1280 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1281 if (ref && last_tree == tp.tree->header.address)
1282 ref->edr.count++;
1283 else {
1284 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1285 if (!ref) {
1286 ERR("out of memory\n");
1287 return STATUS_INSUFFICIENT_RESOURCES;
1288 }
1289
1290 ref->type = TYPE_EXTENT_DATA_REF;
1291 RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1292 ref->edr.count = 1;
1293
1294 Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1295 if (!NT_SUCCESS(Status)) {
1296 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1297 ExFreePool(ref);
1298 return Status;
1299 }
1300
1301 last_tree = tp.tree->header.address;
1302 ref->parent = mr;
1303
1304 InsertTailList(&dr->refs, &ref->list_entry);
1305 }
1306 }
1307 }
1308 }
1309
1310 if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1311 tp = tp2;
1312 else
1313 break;
1314 }
1315
1316 return STATUS_SUCCESS;
1317 }
1318
1319 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1320 traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1321 NTSTATUS Status;
1322 data_reloc* dr;
1323 EXTENT_ITEM* ei;
1324 uint16_t len;
1325 uint64_t inline_rc;
1326 uint8_t* ptr;
1327
1328 dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1329 if (!dr) {
1330 ERR("out of memory\n");
1331 return STATUS_INSUFFICIENT_RESOURCES;
1332 }
1333
1334 dr->address = tp->item->key.obj_id;
1335 dr->size = tp->item->key.offset;
1336 dr->ei = (EXTENT_ITEM*)tp->item->data;
1337 InitializeListHead(&dr->refs);
1338
1339 Status = delete_tree_item(Vcb, tp);
1340 if (!NT_SUCCESS(Status)) {
1341 ERR("delete_tree_item returned %08x\n", Status);
1342 return Status;
1343 }
1344
1345 if (!c)
1346 c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1347
1348 if (c) {
1349 acquire_chunk_lock(c, Vcb);
1350
1351 c->used -= tp->item->key.offset;
1352
1353 space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1354
1355 release_chunk_lock(c, Vcb);
1356 }
1357
1358 ei = (EXTENT_ITEM*)tp->item->data;
1359 inline_rc = 0;
1360
1361 len = tp->item->size - sizeof(EXTENT_ITEM);
1362 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1363
1364 while (len > 0) {
1365 uint8_t secttype = *ptr;
1366 uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1367
1368 len--;
1369
1370 if (sectlen > len) {
1371 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1372 return STATUS_INTERNAL_ERROR;
1373 }
1374
1375 if (sectlen == 0) {
1376 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1377 return STATUS_INTERNAL_ERROR;
1378 }
1379
1380 if (secttype == TYPE_EXTENT_DATA_REF) {
1381 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1382
1383 inline_rc += edr->count;
1384
1385 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1386 if (!NT_SUCCESS(Status)) {
1387 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1388 return Status;
1389 }
1390 } else if (secttype == TYPE_SHARED_DATA_REF) {
1391 metadata_reloc* mr;
1392 data_reloc_ref* ref;
1393
1394 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1395 if (!ref) {
1396 ERR("out of memory\n");
1397 return STATUS_INSUFFICIENT_RESOURCES;
1398 }
1399
1400 ref->type = TYPE_SHARED_DATA_REF;
1401 RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1402 inline_rc += ref->sdr.count;
1403
1404 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1405 if (!NT_SUCCESS(Status)) {
1406 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1407 ExFreePool(ref);
1408 return Status;
1409 }
1410
1411 ref->parent = mr;
1412
1413 InsertTailList(&dr->refs, &ref->list_entry);
1414 } else {
1415 ERR("unexpected tree type %x\n", secttype);
1416 return STATUS_INTERNAL_ERROR;
1417 }
1418
1419
1420 len -= sectlen;
1421 ptr += sizeof(uint8_t) + sectlen;
1422 }
1423
1424 if (inline_rc < ei->refcount) { // look for non-inline entries
1425 traverse_ptr tp2 = *tp, next_tp;
1426
1427 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1428 tp2 = next_tp;
1429
1430 if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1431 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1432 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1433 if (!NT_SUCCESS(Status)) {
1434 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1435 return Status;
1436 }
1437
1438 Status = delete_tree_item(Vcb, &tp2);
1439 if (!NT_SUCCESS(Status)) {
1440 ERR("delete_tree_item returned %08x\n", Status);
1441 return Status;
1442 }
1443 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1444 metadata_reloc* mr;
1445 data_reloc_ref* ref;
1446
1447 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1448 if (!ref) {
1449 ERR("out of memory\n");
1450 return STATUS_INSUFFICIENT_RESOURCES;
1451 }
1452
1453 ref->type = TYPE_SHARED_DATA_REF;
1454 ref->sdr.offset = tp2.item->key.offset;
1455 ref->sdr.count = *((uint32_t*)tp2.item->data);
1456
1457 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1458 if (!NT_SUCCESS(Status)) {
1459 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1460 ExFreePool(ref);
1461 return Status;
1462 }
1463
1464 ref->parent = mr;
1465 InsertTailList(&dr->refs, &ref->list_entry);
1466
1467 Status = delete_tree_item(Vcb, &tp2);
1468 if (!NT_SUCCESS(Status)) {
1469 ERR("delete_tree_item returned %08x\n", Status);
1470 return Status;
1471 }
1472 }
1473 } else
1474 break;
1475 }
1476 }
1477
1478 InsertTailList(items, &dr->list_entry);
1479
1480 return STATUS_SUCCESS;
1481 }
1482
1483 static void sort_data_reloc_refs(data_reloc* dr) {
1484 LIST_ENTRY newlist, *le;
1485
1486 if (IsListEmpty(&dr->refs))
1487 return;
1488
1489 // insertion sort
1490
1491 InitializeListHead(&newlist);
1492
1493 while (!IsListEmpty(&dr->refs)) {
1494 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1495 bool inserted = false;
1496
1497 if (ref->type == TYPE_EXTENT_DATA_REF)
1498 ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1499 else if (ref->type == TYPE_SHARED_DATA_REF)
1500 ref->hash = ref->parent->new_address;
1501
1502 le = newlist.Flink;
1503 while (le != &newlist) {
1504 data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1505
1506 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1507 InsertHeadList(le->Blink, &ref->list_entry);
1508 inserted = true;
1509 break;
1510 }
1511
1512 le = le->Flink;
1513 }
1514
1515 if (!inserted)
1516 InsertTailList(&newlist, &ref->list_entry);
1517 }
1518
1519 le = newlist.Flink;
1520 while (le != &newlist) {
1521 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1522
1523 if (le->Flink != &newlist) {
1524 data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1525
1526 if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1527 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1528 RemoveEntryList(&ref2->list_entry);
1529 ref->edr.count += ref2->edr.count;
1530 ExFreePool(ref2);
1531 continue;
1532 }
1533 }
1534
1535 le = le->Flink;
1536 }
1537
1538 newlist.Flink->Blink = &dr->refs;
1539 newlist.Blink->Flink = &dr->refs;
1540 dr->refs.Flink = newlist.Flink;
1541 dr->refs.Blink = newlist.Blink;
1542 }
1543
1544 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1545 NTSTATUS Status;
1546 LIST_ENTRY* le;
1547 uint64_t rc = 0;
1548 uint16_t inline_len;
1549 bool all_inline = true;
1550 data_reloc_ref* first_noninline = NULL;
1551 EXTENT_ITEM* ei;
1552 uint8_t* ptr;
1553
1554 inline_len = sizeof(EXTENT_ITEM);
1555
1556 sort_data_reloc_refs(dr);
1557
1558 le = dr->refs.Flink;
1559 while (le != &dr->refs) {
1560 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1561 uint16_t extlen = 0;
1562
1563 if (ref->type == TYPE_EXTENT_DATA_REF) {
1564 extlen += sizeof(EXTENT_DATA_REF);
1565 rc += ref->edr.count;
1566 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1567 extlen += sizeof(SHARED_DATA_REF);
1568 rc++;
1569 }
1570
1571 if (all_inline) {
1572 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1573 all_inline = false;
1574 first_noninline = ref;
1575 } else
1576 inline_len += extlen + 1;
1577 }
1578
1579 le = le->Flink;
1580 }
1581
1582 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1583 if (!ei) {
1584 ERR("out of memory\n");
1585 return STATUS_INSUFFICIENT_RESOURCES;
1586 }
1587
1588 ei->refcount = rc;
1589 ei->generation = dr->ei->generation;
1590 ei->flags = dr->ei->flags;
1591 ptr = (uint8_t*)&ei[1];
1592
1593 le = dr->refs.Flink;
1594 while (le != &dr->refs) {
1595 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1596
1597 if (ref == first_noninline)
1598 break;
1599
1600 *ptr = ref->type;
1601 ptr++;
1602
1603 if (ref->type == TYPE_EXTENT_DATA_REF) {
1604 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1605
1606 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1607
1608 ptr += sizeof(EXTENT_DATA_REF);
1609 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1610 SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1611
1612 sdr->offset = ref->parent->new_address;
1613 sdr->count = ref->sdr.count;
1614
1615 ptr += sizeof(SHARED_DATA_REF);
1616 }
1617
1618 le = le->Flink;
1619 }
1620
1621 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1622 if (!NT_SUCCESS(Status)) {
1623 ERR("insert_tree_item returned %08x\n", Status);
1624 return Status;
1625 }
1626
1627 if (!all_inline) {
1628 le = &first_noninline->list_entry;
1629
1630 while (le != &dr->refs) {
1631 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1632
1633 if (ref->type == TYPE_EXTENT_DATA_REF) {
1634 EXTENT_DATA_REF* edr;
1635
1636 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1637 if (!edr) {
1638 ERR("out of memory\n");
1639 return STATUS_INSUFFICIENT_RESOURCES;
1640 }
1641
1642 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1643
1644 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1645 if (!NT_SUCCESS(Status)) {
1646 ERR("insert_tree_item returned %08x\n", Status);
1647 return Status;
1648 }
1649 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1650 uint32_t* sdr;
1651
1652 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG);
1653 if (!sdr) {
1654 ERR("out of memory\n");
1655 return STATUS_INSUFFICIENT_RESOURCES;
1656 }
1657
1658 *sdr = ref->sdr.count;
1659
1660 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1661 if (!NT_SUCCESS(Status)) {
1662 ERR("insert_tree_item returned %08x\n", Status);
1663 return Status;
1664 }
1665 }
1666
1667 le = le->Flink;
1668 }
1669 }
1670
1671 return STATUS_SUCCESS;
1672 }
1673
1674 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1675 KEY searchkey;
1676 traverse_ptr tp;
1677 NTSTATUS Status;
1678 bool b;
1679 LIST_ENTRY items, metadata_items, rollback, *le;
1680 uint64_t loaded = 0, num_loaded = 0;
1681 chunk* newchunk = NULL;
1682 uint8_t* data = NULL;
1683
1684 TRACE("chunk %I64x\n", c->offset);
1685
1686 InitializeListHead(&rollback);
1687 InitializeListHead(&items);
1688 InitializeListHead(&metadata_items);
1689
1690 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1691
1692 searchkey.obj_id = c->offset;
1693 searchkey.obj_type = TYPE_EXTENT_ITEM;
1694 searchkey.offset = 0xffffffffffffffff;
1695
1696 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1697 if (!NT_SUCCESS(Status)) {
1698 ERR("find_item returned %08x\n", Status);
1699 goto end;
1700 }
1701
1702 do {
1703 traverse_ptr next_tp;
1704
1705 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1706 break;
1707
1708 if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1709 bool tree = false;
1710
1711 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1712 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1713
1714 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1715 tree = true;
1716 }
1717
1718 if (!tree) {
1719 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1720
1721 if (!NT_SUCCESS(Status)) {
1722 ERR("add_data_reloc returned %08x\n", Status);
1723 goto end;
1724 }
1725
1726 loaded += tp.item->key.offset;
1727 num_loaded++;
1728
1729 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1730 break;
1731 }
1732 }
1733
1734 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1735
1736 if (b)
1737 tp = next_tp;
1738 } while (b);
1739
1740 if (IsListEmpty(&items)) {
1741 *changed = false;
1742 Status = STATUS_SUCCESS;
1743 goto end;
1744 } else
1745 *changed = true;
1746
1747 data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1748 if (!data) {
1749 ERR("out of memory\n");
1750 Status = STATUS_INSUFFICIENT_RESOURCES;
1751 goto end;
1752 }
1753
1754 le = items.Flink;
1755 while (le != &items) {
1756 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1757 bool done = false;
1758 LIST_ENTRY* le2;
1759 uint32_t* csum;
1760 RTL_BITMAP bmp;
1761 ULONG* bmparr;
1762 ULONG bmplen, runlength, index, lastoff;
1763
1764 if (newchunk) {
1765 acquire_chunk_lock(newchunk, Vcb);
1766
1767 if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1768 newchunk->used += dr->size;
1769 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1770 done = true;
1771 }
1772
1773 release_chunk_lock(newchunk, Vcb);
1774 }
1775
1776 if (!done) {
1777 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1778
1779 le2 = Vcb->chunks.Flink;
1780 while (le2 != &Vcb->chunks) {
1781 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1782
1783 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1784 acquire_chunk_lock(c2, Vcb);
1785
1786 if ((c2->chunk_item->size - c2->used) >= dr->size) {
1787 if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1788 c2->used += dr->size;
1789 space_list_subtract(c2, false, dr->new_address, dr->size, &rollback);
1790 release_chunk_lock(c2, Vcb);
1791 newchunk = c2;
1792 done = true;
1793 break;
1794 }
1795 }
1796
1797 release_chunk_lock(c2, Vcb);
1798 }
1799
1800 le2 = le2->Flink;
1801 }
1802
1803 // allocate new chunk if necessary
1804 if (!done) {
1805 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1806
1807 if (!NT_SUCCESS(Status)) {
1808 ERR("alloc_chunk returned %08x\n", Status);
1809 ExReleaseResourceLite(&Vcb->chunk_lock);
1810 goto end;
1811 }
1812
1813 acquire_chunk_lock(newchunk, Vcb);
1814
1815 newchunk->balance_num = Vcb->balance.balance_num;
1816
1817 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1818 release_chunk_lock(newchunk, Vcb);
1819 ExReleaseResourceLite(&Vcb->chunk_lock);
1820 ERR("could not find address in new chunk\n");
1821 Status = STATUS_DISK_FULL;
1822 goto end;
1823 } else {
1824 newchunk->used += dr->size;
1825 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1826 }
1827
1828 release_chunk_lock(newchunk, Vcb);
1829 }
1830
1831 ExReleaseResourceLite(&Vcb->chunk_lock);
1832 }
1833
1834 dr->newchunk = newchunk;
1835
1836 bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size);
1837
1838 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1839 if (!bmparr) {
1840 ERR("out of memory\n");
1841 Status = STATUS_INSUFFICIENT_RESOURCES;
1842 goto end;
1843 }
1844
1845 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(uint32_t) / Vcb->superblock.sector_size), ALLOC_TAG);
1846 if (!csum) {
1847 ERR("out of memory\n");
1848 ExFreePool(bmparr);
1849 Status = STATUS_INSUFFICIENT_RESOURCES;
1850 goto end;
1851 }
1852
1853 RtlInitializeBitMap(&bmp, bmparr, bmplen);
1854 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1855
1856 searchkey.obj_id = EXTENT_CSUM_ID;
1857 searchkey.obj_type = TYPE_EXTENT_CSUM;
1858 searchkey.offset = dr->address;
1859
1860 Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1861 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1862 ERR("find_item returned %08x\n", Status);
1863 ExFreePool(csum);
1864 ExFreePool(bmparr);
1865 goto end;
1866 }
1867
1868 if (Status != STATUS_NOT_FOUND) {
1869 do {
1870 traverse_ptr next_tp;
1871
1872 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1873 if (tp.item->key.offset >= dr->address + dr->size)
1874 break;
1875 else if (tp.item->size >= sizeof(uint32_t) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= dr->address) {
1876 uint64_t cs = max(dr->address, tp.item->key.offset);
1877 uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)));
1878
1879 RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1880 tp.item->data + ((cs - tp.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size),
1881 (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size));
1882
1883 RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1884
1885 if (ce == dr->address + dr->size)
1886 break;
1887 }
1888 }
1889
1890 if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1891 tp = next_tp;
1892 else
1893 break;
1894 } while (true);
1895 }
1896
1897 lastoff = 0;
1898 runlength = RtlFindFirstRunClear(&bmp, &index);
1899
1900 while (runlength != 0) {
1901 if (index >= bmplen)
1902 break;
1903
1904 if (index + runlength >= bmplen) {
1905 runlength = bmplen - index;
1906
1907 if (runlength == 0)
1908 break;
1909 }
1910
1911 if (index > lastoff) {
1912 ULONG off = lastoff;
1913 ULONG size = index - lastoff;
1914
1915 // handle no csum run
1916 do {
1917 ULONG rl;
1918
1919 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1920 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1921 else
1922 rl = size;
1923
1924 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1925 c, NULL, NULL, 0, false, NormalPagePriority);
1926 if (!NT_SUCCESS(Status)) {
1927 ERR("read_data returned %08x\n", Status);
1928 ExFreePool(csum);
1929 ExFreePool(bmparr);
1930 goto end;
1931 }
1932
1933 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1934 NULL, newchunk, false, 0, NormalPagePriority);
1935 if (!NT_SUCCESS(Status)) {
1936 ERR("write_data_complete returned %08x\n", Status);
1937 ExFreePool(csum);
1938 ExFreePool(bmparr);
1939 goto end;
1940 }
1941
1942 size -= rl;
1943 off += rl;
1944 } while (size > 0);
1945 }
1946
1947 add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1948 add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1949
1950 // handle csum run
1951 do {
1952 ULONG rl;
1953
1954 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1955 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1956 else
1957 rl = runlength;
1958
1959 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], false, data,
1960 c, NULL, NULL, 0, false, NormalPagePriority);
1961 if (!NT_SUCCESS(Status)) {
1962 ERR("read_data returned %08x\n", Status);
1963 ExFreePool(csum);
1964 ExFreePool(bmparr);
1965 goto end;
1966 }
1967
1968 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1969 NULL, newchunk, false, 0, NormalPagePriority);
1970 if (!NT_SUCCESS(Status)) {
1971 ERR("write_data_complete returned %08x\n", Status);
1972 ExFreePool(csum);
1973 ExFreePool(bmparr);
1974 goto end;
1975 }
1976
1977 runlength -= rl;
1978 index += rl;
1979 } while (runlength > 0);
1980
1981 lastoff = index;
1982 runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1983 }
1984
1985 ExFreePool(csum);
1986 ExFreePool(bmparr);
1987
1988 // handle final nocsum run
1989 if (lastoff < dr->size / Vcb->superblock.sector_size) {
1990 ULONG off = lastoff;
1991 ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1992
1993 do {
1994 ULONG rl;
1995
1996 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1997 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1998 else
1999 rl = size;
2000
2001 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
2002 c, NULL, NULL, 0, false, NormalPagePriority);
2003 if (!NT_SUCCESS(Status)) {
2004 ERR("read_data returned %08x\n", Status);
2005 goto end;
2006 }
2007
2008 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
2009 NULL, newchunk, false, 0, NormalPagePriority);
2010 if (!NT_SUCCESS(Status)) {
2011 ERR("write_data_complete returned %08x\n", Status);
2012 goto end;
2013 }
2014
2015 size -= rl;
2016 off += rl;
2017 } while (size > 0);
2018 }
2019
2020 le = le->Flink;
2021 }
2022
2023 ExFreePool(data);
2024 data = NULL;
2025
2026 Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2027 if (!NT_SUCCESS(Status)) {
2028 ERR("write_metadata_items returned %08x\n", Status);
2029 goto end;
2030 }
2031
2032 le = items.Flink;
2033 while (le != &items) {
2034 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2035
2036 Status = add_data_reloc_extent_item(Vcb, dr);
2037 if (!NT_SUCCESS(Status)) {
2038 ERR("add_data_reloc_extent_item returned %08x\n", Status);
2039 goto end;
2040 }
2041
2042 le = le->Flink;
2043 }
2044
2045 le = c->changed_extents.Flink;
2046 while (le != &c->changed_extents) {
2047 LIST_ENTRY *le2, *le3;
2048 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2049
2050 le3 = le->Flink;
2051
2052 le2 = items.Flink;
2053 while (le2 != &items) {
2054 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2055
2056 if (ce->address == dr->address) {
2057 ce->address = dr->new_address;
2058 RemoveEntryList(&ce->list_entry);
2059 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2060 break;
2061 }
2062
2063 le2 = le2->Flink;
2064 }
2065
2066 le = le3;
2067 }
2068
2069 Status = STATUS_SUCCESS;
2070
2071 Vcb->need_write = true;
2072
2073 end:
2074 if (NT_SUCCESS(Status)) {
2075 // update extents in cache inodes before we flush
2076 le = Vcb->chunks.Flink;
2077 while (le != &Vcb->chunks) {
2078 chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2079
2080 if (c2->cache) {
2081 LIST_ENTRY* le2;
2082
2083 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2084
2085 le2 = c2->cache->extents.Flink;
2086 while (le2 != &c2->cache->extents) {
2087 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2088
2089 if (!ext->ignore) {
2090 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2091 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2092
2093 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2094 LIST_ENTRY* le3 = items.Flink;
2095 while (le3 != &items) {
2096 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2097
2098 if (ed2->address == dr->address) {
2099 ed2->address = dr->new_address;
2100 break;
2101 }
2102
2103 le3 = le3->Flink;
2104 }
2105 }
2106 }
2107 }
2108
2109 le2 = le2->Flink;
2110 }
2111
2112 ExReleaseResourceLite(c2->cache->Header.Resource);
2113 }
2114
2115 le = le->Flink;
2116 }
2117
2118 Status = do_write(Vcb, NULL);
2119 if (!NT_SUCCESS(Status))
2120 ERR("do_write returned %08x\n", Status);
2121 }
2122
2123 if (NT_SUCCESS(Status)) {
2124 clear_rollback(&rollback);
2125
2126 // update open FCBs
2127 // FIXME - speed this up(?)
2128
2129 le = Vcb->all_fcbs.Flink;
2130 while (le != &Vcb->all_fcbs) {
2131 struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2132 LIST_ENTRY* le2;
2133
2134 ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2135
2136 le2 = fcb->extents.Flink;
2137 while (le2 != &fcb->extents) {
2138 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2139
2140 if (!ext->ignore) {
2141 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2142 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2143
2144 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2145 LIST_ENTRY* le3 = items.Flink;
2146 while (le3 != &items) {
2147 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2148
2149 if (ed2->address == dr->address) {
2150 ed2->address = dr->new_address;
2151 break;
2152 }
2153
2154 le3 = le3->Flink;
2155 }
2156 }
2157 }
2158 }
2159
2160 le2 = le2->Flink;
2161 }
2162
2163 ExReleaseResourceLite(fcb->Header.Resource);
2164
2165 le = le->Flink;
2166 }
2167 } else
2168 do_rollback(Vcb, &rollback);
2169
2170 free_trees(Vcb);
2171
2172 ExReleaseResourceLite(&Vcb->tree_lock);
2173
2174 if (data)
2175 ExFreePool(data);
2176
2177 while (!IsListEmpty(&items)) {
2178 data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2179
2180 while (!IsListEmpty(&dr->refs)) {
2181 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2182
2183 ExFreePool(ref);
2184 }
2185
2186 ExFreePool(dr);
2187 }
2188
2189 while (!IsListEmpty(&metadata_items)) {
2190 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2191
2192 while (!IsListEmpty(&mr->refs)) {
2193 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2194
2195 ExFreePool(ref);
2196 }
2197
2198 if (mr->data)
2199 ExFreePool(mr->data);
2200
2201 ExFreePool(mr);
2202 }
2203
2204 return Status;
2205 }
2206
2207 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2208 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2209 return BLOCK_FLAG_RAID0;
2210 else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2211 return BLOCK_FLAG_RAID1;
2212 else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2213 return BLOCK_FLAG_DUPLICATE;
2214 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2215 return BLOCK_FLAG_RAID10;
2216 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2217 return BLOCK_FLAG_RAID5;
2218 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2219 return BLOCK_FLAG_RAID6;
2220 else
2221 return BLOCK_FLAG_SINGLE;
2222 }
2223
2224 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) {
2225 btrfs_balance_opts* opts;
2226
2227 opts = &Vcb->balance.opts[sort];
2228
2229 if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2230 return false;
2231
2232 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2233 uint64_t type = get_chunk_dup_type(c);
2234
2235 if (!(type & opts->profiles))
2236 return false;
2237 }
2238
2239 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2240 uint16_t i;
2241 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2242 bool b = false;
2243
2244 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2245 if (cis[i].dev_id == opts->devid) {
2246 b = true;
2247 break;
2248 }
2249 }
2250
2251 if (!b)
2252 return false;
2253 }
2254
2255 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2256 uint16_t i, factor;
2257 uint64_t physsize;
2258 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2259 bool b = false;
2260
2261 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2262 factor = c->chunk_item->num_stripes;
2263 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2264 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2265 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2266 factor = c->chunk_item->num_stripes - 1;
2267 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2268 factor = c->chunk_item->num_stripes - 2;
2269 else // SINGLE, DUPLICATE, RAID1
2270 factor = 1;
2271
2272 physsize = c->chunk_item->size / factor;
2273
2274 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2275 if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2276 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2277 b = true;
2278 break;
2279 }
2280 }
2281
2282 if (!b)
2283 return false;
2284 }
2285
2286 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2287 if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2288 return false;
2289 }
2290
2291 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2292 if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2293 return false;
2294 }
2295
2296 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2297 uint64_t usage = c->used * 100 / c->chunk_item->size;
2298
2299 // usage == 0 should mean completely empty, not just that usage rounds to 0%
2300 if (c->used > 0 && usage == 0)
2301 usage = 1;
2302
2303 if (usage < opts->usage_start || usage > opts->usage_end)
2304 return false;
2305 }
2306
2307 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2308 uint64_t type = get_chunk_dup_type(c);
2309
2310 if (type == opts->convert)
2311 return false;
2312 }
2313
2314 return true;
2315 }
2316
2317 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2318 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2319 args->profiles = opts->profiles;
2320 args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2321 }
2322
2323 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2324 if (args->usage_start == 0) {
2325 args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2326 args->usage_start = opts->usage_start;
2327 args->usage_end = opts->usage_end;
2328 } else {
2329 args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2330 args->usage = opts->usage_end;
2331 }
2332 }
2333
2334 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2335 args->devid = opts->devid;
2336 args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2337 }
2338
2339 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2340 args->drange_start = opts->drange_start;
2341 args->drange_end = opts->drange_end;
2342 args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2343 }
2344
2345 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2346 args->vrange_start = opts->vrange_start;
2347 args->vrange_end = opts->vrange_end;
2348 args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2349 }
2350
2351 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2352 args->convert = opts->convert;
2353 args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2354
2355 if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2356 args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2357 }
2358
2359 if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2360 if (args->limit_start == 0) {
2361 args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2362 args->limit_start = (uint32_t)opts->limit_start;
2363 args->limit_end = (uint32_t)opts->limit_end;
2364 } else {
2365 args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2366 args->limit = opts->limit_end;
2367 }
2368 }
2369
2370 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2371 args->stripes_start = opts->stripes_start;
2372 args->stripes_end = opts->stripes_end;
2373 args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2374 }
2375 }
2376
2377 static NTSTATUS add_balance_item(device_extension* Vcb) {
2378 KEY searchkey;
2379 traverse_ptr tp;
2380 NTSTATUS Status;
2381 BALANCE_ITEM* bi;
2382
2383 searchkey.obj_id = BALANCE_ITEM_ID;
2384 searchkey.obj_type = TYPE_TEMP_ITEM;
2385 searchkey.offset = 0;
2386
2387 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2388
2389 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2390 if (!NT_SUCCESS(Status)) {
2391 ERR("find_item returned %08x\n", Status);
2392 goto end;
2393 }
2394
2395 if (!keycmp(tp.item->key, searchkey)) {
2396 Status = delete_tree_item(Vcb, &tp);
2397 if (!NT_SUCCESS(Status)) {
2398 ERR("delete_tree_item returned %08x\n", Status);
2399 goto end;
2400 }
2401 }
2402
2403 bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2404 if (!bi) {
2405 ERR("out of memory\n");
2406 Status = STATUS_INSUFFICIENT_RESOURCES;
2407 goto end;
2408 }
2409
2410 RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2411
2412 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2413 bi->flags |= BALANCE_FLAGS_DATA;
2414 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2415 }
2416
2417 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2418 bi->flags |= BALANCE_FLAGS_METADATA;
2419 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2420 }
2421
2422 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2423 bi->flags |= BALANCE_FLAGS_SYSTEM;
2424 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2425 }
2426
2427 Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2428 if (!NT_SUCCESS(Status)) {
2429 ERR("insert_tree_item returned %08x\n", Status);
2430 ExFreePool(bi);
2431 goto end;
2432 }
2433
2434 Status = STATUS_SUCCESS;
2435
2436 end:
2437 if (NT_SUCCESS(Status)) {
2438 Status = do_write(Vcb, NULL);
2439 if (!NT_SUCCESS(Status))
2440 ERR("do_write returned %08x\n", Status);
2441 }
2442
2443 free_trees(Vcb);
2444
2445 ExReleaseResourceLite(&Vcb->tree_lock);
2446
2447 return Status;
2448 }
2449
2450 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2451 KEY searchkey;
2452 traverse_ptr tp;
2453 NTSTATUS Status;
2454
2455 searchkey.obj_id = BALANCE_ITEM_ID;
2456 searchkey.obj_type = TYPE_TEMP_ITEM;
2457 searchkey.offset = 0;
2458
2459 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2460
2461 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2462 if (!NT_SUCCESS(Status)) {
2463 ERR("find_item returned %08x\n", Status);
2464 goto end;
2465 }
2466
2467 if (!keycmp(tp.item->key, searchkey)) {
2468 Status = delete_tree_item(Vcb, &tp);
2469 if (!NT_SUCCESS(Status)) {
2470 ERR("delete_tree_item returned %08x\n", Status);
2471 goto end;
2472 }
2473
2474 Status = do_write(Vcb, NULL);
2475 if (!NT_SUCCESS(Status)) {
2476 ERR("do_write returned %08x\n", Status);
2477 goto end;
2478 }
2479
2480 free_trees(Vcb);
2481 }
2482
2483 Status = STATUS_SUCCESS;
2484
2485 end:
2486 ExReleaseResourceLite(&Vcb->tree_lock);
2487
2488 return Status;
2489 }
2490
2491 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2492 opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2493
2494 if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2495 opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2496 opts->profiles = args->profiles;
2497 }
2498
2499 if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2500 opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2501
2502 opts->usage_start = 0;
2503 opts->usage_end = (uint8_t)args->usage;
2504 } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2505 opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2506
2507 opts->usage_start = (uint8_t)args->usage_start;
2508 opts->usage_end = (uint8_t)args->usage_end;
2509 }
2510
2511 if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2512 opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2513 opts->devid = args->devid;
2514 }
2515
2516 if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2517 opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2518 opts->drange_start = args->drange_start;
2519 opts->drange_end = args->drange_end;
2520 }
2521
2522 if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2523 opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2524 opts->vrange_start = args->vrange_start;
2525 opts->vrange_end = args->vrange_end;
2526 }
2527
2528 if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2529 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2530
2531 opts->limit_start = 0;
2532 opts->limit_end = args->limit;
2533 } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2534 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2535
2536 opts->limit_start = args->limit_start;
2537 opts->limit_end = args->limit_end;
2538 }
2539
2540 if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2541 opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2542
2543 opts->stripes_start = (uint16_t)args->stripes_start;
2544 opts->stripes_end = (uint16_t)args->stripes_end;
2545 }
2546
2547 if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2548 opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2549 opts->convert = args->convert;
2550
2551 if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2552 opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2553 }
2554 }
2555
2556 static NTSTATUS remove_superblocks(device* dev) {
2557 NTSTATUS Status;
2558 superblock* sb;
2559 int i = 0;
2560
2561 sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2562 if (!sb) {
2563 ERR("out of memory\n");
2564 return STATUS_INSUFFICIENT_RESOURCES;
2565 }
2566
2567 RtlZeroMemory(sb, sizeof(superblock));
2568
2569 while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2570 Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2571
2572 if (!NT_SUCCESS(Status)) {
2573 ExFreePool(sb);
2574 return Status;
2575 }
2576
2577 i++;
2578 }
2579
2580 ExFreePool(sb);
2581
2582 return STATUS_SUCCESS;
2583 }
2584
2585 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2586 KEY searchkey;
2587 traverse_ptr tp;
2588 NTSTATUS Status;
2589 LIST_ENTRY* le;
2590 volume_device_extension* vde;
2591
2592 if (Vcb->need_write) {
2593 Status = do_write(Vcb, NULL);
2594
2595 if (!NT_SUCCESS(Status))
2596 ERR("do_write returned %08x\n", Status);
2597 } else
2598 Status = STATUS_SUCCESS;
2599
2600 free_trees(Vcb);
2601
2602 if (!NT_SUCCESS(Status))
2603 return Status;
2604
2605 // remove entry in chunk tree
2606
2607 searchkey.obj_id = 1;
2608 searchkey.obj_type = TYPE_DEV_ITEM;
2609 searchkey.offset = dev->devitem.dev_id;
2610
2611 Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2612 if (!NT_SUCCESS(Status)) {
2613 ERR("find_item returned %08x\n", Status);
2614 return Status;
2615 }
2616
2617 if (!keycmp(searchkey, tp.item->key)) {
2618 Status = delete_tree_item(Vcb, &tp);
2619
2620 if (!NT_SUCCESS(Status)) {
2621 ERR("delete_tree_item returned %08x\n", Status);
2622 return Status;
2623 }
2624 }
2625
2626 // remove stats entry in device tree
2627
2628 searchkey.obj_id = 0;
2629 searchkey.obj_type = TYPE_DEV_STATS;
2630 searchkey.offset = dev->devitem.dev_id;
2631
2632 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2633 if (!NT_SUCCESS(Status)) {
2634 ERR("find_item returned %08x\n", Status);
2635 return Status;
2636 }
2637
2638 if (!keycmp(searchkey, tp.item->key)) {
2639 Status = delete_tree_item(Vcb, &tp);
2640
2641 if (!NT_SUCCESS(Status)) {
2642 ERR("delete_tree_item returned %08x\n", Status);
2643 return Status;
2644 }
2645 }
2646
2647 // update superblock
2648
2649 Vcb->superblock.num_devices--;
2650 Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2651 Vcb->devices_loaded--;
2652
2653 RemoveEntryList(&dev->list_entry);
2654
2655 // flush
2656
2657 Status = do_write(Vcb, NULL);
2658 if (!NT_SUCCESS(Status))
2659 ERR("do_write returned %08x\n", Status);
2660
2661 free_trees(Vcb);
2662
2663 if (!NT_SUCCESS(Status))
2664 return Status;
2665
2666 if (!dev->readonly && dev->devobj) {
2667 Status = remove_superblocks(dev);
2668 if (!NT_SUCCESS(Status))
2669 WARN("remove_superblocks returned %08x\n", Status);
2670 }
2671
2672 // remove entry in volume list
2673
2674 vde = Vcb->vde;
2675
2676 if (dev->devobj) {
2677 pdo_device_extension* pdode = vde->pdode;
2678
2679 ExAcquireResourceExclusiveLite(&pdode->child_lock, true);
2680
2681 le = pdode->children.Flink;
2682 while (le != &pdode->children) {
2683 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2684
2685 if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2686 PFILE_OBJECT FileObject;
2687 PDEVICE_OBJECT mountmgr;
2688 UNICODE_STRING mmdevpath;
2689
2690 pdode->children_loaded--;
2691
2692 if (vc->had_drive_letter) { // re-add entry to mountmgr
2693 RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2694 Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2695 if (!NT_SUCCESS(Status))
2696 ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2697 else {
2698 MOUNTDEV_NAME mdn;
2699
2700 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2701 if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2702 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2703 else {
2704 MOUNTDEV_NAME* mdn2;
2705 ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2706
2707 mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2708 if (!mdn2)
2709 ERR("out of memory\n");
2710 else {
2711 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2712 if (!NT_SUCCESS(Status))
2713 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2714 else {
2715 UNICODE_STRING name;
2716
2717 name.Buffer = mdn2->Name;
2718 name.Length = name.MaximumLength = mdn2->NameLength;
2719
2720 Status = mountmgr_add_drive_letter(mountmgr, &name);
2721 if (!NT_SUCCESS(Status))
2722 WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2723 }
2724
2725 ExFreePool(mdn2);
2726 }
2727 }
2728
2729 ObDereferenceObject(FileObject);
2730 }
2731 }
2732
2733 ExFreePool(vc->pnp_name.Buffer);
2734 RemoveEntryList(&vc->list_entry);
2735 ExFreePool(vc);
2736
2737 ObDereferenceObject(vc->fileobj);
2738
2739 break;
2740 }
2741
2742 le = le->Flink;
2743 }
2744
2745 if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2746 vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2747
2748 le = pdode->children.Flink;
2749 while (le != &pdode->children) {
2750 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2751
2752 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2753 vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2754 break;
2755 }
2756
2757 le = le->Flink;
2758 }
2759 }
2760
2761 pdode->num_children = Vcb->superblock.num_devices;
2762
2763 ExReleaseResourceLite(&pdode->child_lock);
2764
2765 // free dev
2766
2767 if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2768 trim_whole_device(dev);
2769 }
2770
2771 while (!IsListEmpty(&dev->space)) {
2772 LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2773 space* s = CONTAINING_RECORD(le2, space, list_entry);
2774
2775 ExFreePool(s);
2776 }
2777
2778 ExFreePool(dev);
2779
2780 if (Vcb->trim) {
2781 Vcb->trim = false;
2782
2783 le = Vcb->devices.Flink;
2784 while (le != &Vcb->devices) {
2785 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2786
2787 if (dev2->trim) {
2788 Vcb->trim = true;
2789 break;
2790 }
2791
2792 le = le->Flink;
2793 }
2794 }
2795
2796 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2797
2798 return STATUS_SUCCESS;
2799 }
2800
2801 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2802 DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2803 DEVICE_DATA_SET_RANGE* ranges;
2804 ULONG datalen, i;
2805 KEY searchkey;
2806 traverse_ptr tp;
2807 NTSTATUS Status;
2808 bool b;
2809 uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2810 LIST_ENTRY* le;
2811
2812 dev->num_trim_entries = 0;
2813
2814 searchkey.obj_id = dev->devitem.dev_id;
2815 searchkey.obj_type = TYPE_DEV_EXTENT;
2816 searchkey.offset = 0;
2817
2818 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2819 if (!NT_SUCCESS(Status)) {
2820 ERR("find_item returned %08x\n", Status);
2821 return;
2822 }
2823
2824 do {
2825 traverse_ptr next_tp;
2826
2827 if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2828 if (tp.item->size >= sizeof(DEV_EXTENT)) {
2829 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2830
2831 if (tp.item->key.offset > lastoff)
2832 add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2833
2834 lastoff = tp.item->key.offset + de->length;
2835 } else {
2836 ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2837 return;
2838 }
2839 }
2840
2841 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2842
2843 if (b) {
2844 tp = next_tp;
2845 if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2846 break;
2847 }
2848 } while (b);
2849
2850 if (lastoff < dev->devitem.num_bytes)
2851 add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2852
2853 if (dev->num_trim_entries == 0)
2854 return;
2855
2856 datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2857
2858 dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2859 if (!dmdsa) {
2860 ERR("out of memory\n");
2861 goto end;
2862 }
2863
2864 dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2865 dmdsa->Action = DeviceDsmAction_Trim;
2866 dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2867 dmdsa->ParameterBlockOffset = 0;
2868 dmdsa->ParameterBlockLength = 0;
2869 dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t));
2870 dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2871
2872 ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2873
2874 i = 0;
2875 le = dev->trim_list.Flink;
2876 while (le != &dev->trim_list) {
2877 space* s = CONTAINING_RECORD(le, space, list_entry);
2878
2879 ranges[i].StartingOffset = s->address;
2880 ranges[i].LengthInBytes = s->size;
2881 i++;
2882
2883 le = le->Flink;
2884 }
2885
2886 Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL);
2887 if (!NT_SUCCESS(Status))
2888 WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2889
2890 ExFreePool(dmdsa);
2891
2892 end:
2893 while (!IsListEmpty(&dev->trim_list)) {
2894 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2895 ExFreePool(s);
2896 }
2897
2898 dev->num_trim_entries = 0;
2899 }
2900
2901 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) {
2902 NTSTATUS Status;
2903 bool changed;
2904 LIST_ENTRY* le;
2905 chunk* rc;
2906
2907 // FIXME - allow with metadata chunks?
2908
2909 while (true) {
2910 rc = NULL;
2911
2912 ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2913
2914 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2915
2916 // choose the least-used chunk we haven't looked at yet
2917 le = Vcb->chunks.Flink;
2918 while (le != &Vcb->chunks) {
2919 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2920
2921 // FIXME - skip full-size chunks over e.g. 90% full?
2922 if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2923 rc = c;
2924
2925 le = le->Flink;
2926 }
2927
2928 ExReleaseResourceLite(&Vcb->chunk_lock);
2929
2930 if (!rc) {
2931 ExReleaseResourceLite(&Vcb->tree_lock);
2932 break;
2933 }
2934
2935 if (rc->list_entry_balance.Flink) {
2936 RemoveEntryList(&rc->list_entry_balance);
2937 Vcb->balance.chunks_left--;
2938 }
2939
2940 rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2941 rc->reloc = true;
2942
2943 ExReleaseResourceLite(&Vcb->tree_lock);
2944
2945 do {
2946 changed = false;
2947
2948 Status = balance_data_chunk(Vcb, rc, &changed);
2949 if (!NT_SUCCESS(Status)) {
2950 ERR("balance_data_chunk returned %08x\n", Status);
2951 Vcb->balance.status = Status;
2952 rc->list_entry_balance.Flink = NULL;
2953 rc->reloc = false;
2954 return Status;
2955 }
2956
2957 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2958
2959 if (Vcb->readonly)
2960 Vcb->balance.stopping = true;
2961
2962 if (Vcb->balance.stopping)
2963 return STATUS_SUCCESS;
2964 } while (changed);
2965
2966 rc->list_entry_balance.Flink = NULL;
2967
2968 rc->changed = true;
2969 rc->space_changed = true;
2970 rc->balance_num = Vcb->balance.balance_num;
2971
2972 Status = do_write(Vcb, NULL);
2973 if (!NT_SUCCESS(Status)) {
2974 ERR("do_write returned %08x\n", Status);
2975 return Status;
2976 }
2977
2978 free_trees(Vcb);
2979 }
2980
2981 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2982
2983 Status = alloc_chunk(Vcb, flags, &rc, true);
2984
2985 ExReleaseResourceLite(&Vcb->chunk_lock);
2986
2987 if (NT_SUCCESS(Status)) {
2988 *newchunk = rc;
2989 return Status;
2990 } else {
2991 ERR("alloc_chunk returned %08x\n", Status);
2992 return Status;
2993 }
2994 }
2995
2996 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
2997 LIST_ENTRY* le;
2998
2999 while (!IsListEmpty(&dev->space)) {
3000 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
3001
3002 ExFreePool(s);
3003 }
3004
3005 // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3006
3007 space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3008
3009 le = Vcb->chunks.Flink;
3010 while (le != &Vcb->chunks) {
3011 uint16_t n;
3012 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3013 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3014
3015 for (n = 0; n < c->chunk_item->num_stripes; n++) {
3016 uint64_t stripe_size = 0;
3017
3018 if (cis[n].dev_id == dev->devitem.dev_id) {
3019 if (stripe_size == 0) {
3020 uint16_t factor;
3021
3022 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3023 factor = c->chunk_item->num_stripes;
3024 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3025 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3026 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3027 factor = c->chunk_item->num_stripes - 1;
3028 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3029 factor = c->chunk_item->num_stripes - 2;
3030 else // SINGLE, DUP, RAID1
3031 factor = 1;
3032
3033 stripe_size = c->chunk_item->size / factor;
3034 }
3035
3036 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3037 }
3038 }
3039
3040 le = le->Flink;
3041 }
3042
3043 return STATUS_SUCCESS;
3044 }
3045
3046 _Function_class_(KSTART_ROUTINE)
3047 void __stdcall balance_thread(void* context) {
3048 device_extension* Vcb = (device_extension*)context;
3049 LIST_ENTRY chunks;
3050 LIST_ENTRY* le;
3051 uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3052 uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3053 NTSTATUS Status;
3054
3055 Vcb->balance.balance_num++;
3056
3057 Vcb->balance.stopping = false;
3058 KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3059
3060 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3061 old_data_flags = Vcb->data_flags;
3062 Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3063
3064 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3065 }
3066
3067 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3068 old_metadata_flags = Vcb->metadata_flags;
3069 Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3070 }
3071
3072 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3073 old_system_flags = Vcb->system_flags;
3074 Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3075 }
3076
3077 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3078 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3079 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3080 else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3081 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3082 }
3083
3084 num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3085 Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3086
3087 InitializeListHead(&chunks);
3088
3089 // FIXME - what are we supposed to do with limit_start?
3090
3091 if (!Vcb->readonly) {
3092 if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3093 Status = add_balance_item(Vcb);
3094 if (!NT_SUCCESS(Status)) {
3095 ERR("add_balance_item returned %08x\n", Status);
3096 Vcb->balance.status = Status;
3097 goto end;
3098 }
3099 } else {
3100 if (Vcb->need_write) {
3101 Status = do_write(Vcb, NULL);
3102
3103 free_trees(Vcb);
3104
3105 if (!NT_SUCCESS(Status)) {
3106 ERR("do_write returned %08x\n", Status);
3107 Vcb->balance.status = Status;
3108 goto end;
3109 }
3110 }
3111 }
3112 }
3113
3114 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3115
3116 if (Vcb->balance.stopping)
3117 goto end;
3118
3119 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3120
3121 le = Vcb->chunks.Flink;
3122 while (le != &Vcb->chunks) {
3123 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3124 uint8_t sort;
3125
3126 acquire_chunk_lock(c, Vcb);
3127
3128 if (c->chunk_item->type & BLOCK_FLAG_DATA)
3129 sort = BALANCE_OPTS_DATA;
3130 else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3131 sort = BALANCE_OPTS_METADATA;
3132 else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3133 sort = BALANCE_OPTS_SYSTEM;
3134 else {
3135 ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3136 release_chunk_lock(c, Vcb);
3137 break;
3138 }
3139
3140 if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3141 should_balance_chunk(Vcb, sort, c)) {
3142 InsertTailList(&chunks, &c->list_entry_balance);
3143
3144 num_chunks[sort]++;
3145 Vcb->balance.total_chunks++;
3146 Vcb->balance.chunks_left++;
3147 } else if (sort == BALANCE_OPTS_METADATA)
3148 okay_metadata_chunks++;
3149 else if (sort == BALANCE_OPTS_DATA)
3150 okay_data_chunks++;
3151 else if (sort == BALANCE_OPTS_SYSTEM)
3152 okay_system_chunks++;
3153
3154 if (!c->cache_loaded) {
3155 Status = load_cache_chunk(Vcb, c, NULL);
3156
3157 if (!NT_SUCCESS(Status)) {
3158 ERR("load_cache_chunk returned %08x\n", Status);
3159 Vcb->balance.status = Status;
3160 release_chunk_lock(c, Vcb);
3161 ExReleaseResourceLite(&Vcb->chunk_lock);
3162 goto end;
3163 }
3164 }
3165
3166 release_chunk_lock(c, Vcb);
3167
3168 le = le->Flink;
3169 }
3170
3171 ExReleaseResourceLite(&Vcb->chunk_lock);
3172
3173 // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3174 if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3175 bool consolidated = false;
3176 chunk* c;
3177
3178 if (okay_metadata_chunks == 0) {
3179 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3180
3181 Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3182 if (NT_SUCCESS(Status))
3183 c->balance_num = Vcb->balance.balance_num;
3184 else if (Status != STATUS_DISK_FULL || consolidated) {
3185 ERR("alloc_chunk returned %08x\n", Status);
3186 ExReleaseResourceLite(&Vcb->chunk_lock);
3187 Vcb->balance.status = Status;
3188 goto end;
3189 }
3190
3191 ExReleaseResourceLite(&Vcb->chunk_lock);
3192
3193 if (Status == STATUS_DISK_FULL) {
3194 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3195 if (!NT_SUCCESS(Status)) {
3196 ERR("try_consolidation returned %08x\n", Status);
3197 Vcb->balance.status = Status;
3198 goto end;
3199 } else
3200 c->balance_num = Vcb->balance.balance_num;
3201
3202 consolidated = true;
3203
3204 if (Vcb->balance.stopping)
3205 goto end;
3206 }
3207 }
3208
3209 if (okay_data_chunks == 0) {
3210 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3211
3212 Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3213 if (NT_SUCCESS(Status))
3214 c->balance_num = Vcb->balance.balance_num;
3215 else if (Status != STATUS_DISK_FULL || consolidated) {
3216 ERR("alloc_chunk returned %08x\n", Status);
3217 ExReleaseResourceLite(&Vcb->chunk_lock);
3218 Vcb->balance.status = Status;
3219 goto end;
3220 }
3221
3222 ExReleaseResourceLite(&Vcb->chunk_lock);
3223
3224 if (Status == STATUS_DISK_FULL) {
3225 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3226 if (!NT_SUCCESS(Status)) {
3227 ERR("try_consolidation returned %08x\n", Status);
3228 Vcb->balance.status = Status;
3229 goto end;
3230 } else
3231 c->balance_num = Vcb->balance.balance_num;
3232
3233 consolidated = true;
3234
3235 if (Vcb->balance.stopping)
3236 goto end;
3237 }
3238 }
3239
3240 if (okay_system_chunks == 0) {
3241 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3242
3243 Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3244 if (NT_SUCCESS(Status))
3245 c->balance_num = Vcb->balance.balance_num;
3246 else if (Status != STATUS_DISK_FULL || consolidated) {
3247 ERR("alloc_chunk returned %08x\n", Status);
3248 ExReleaseResourceLite(&Vcb->chunk_lock);
3249 Vcb->balance.status = Status;
3250 goto end;
3251 }
3252
3253 ExReleaseResourceLite(&Vcb->chunk_lock);
3254
3255 if (Status == STATUS_DISK_FULL) {
3256 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3257 if (!NT_SUCCESS(Status)) {
3258 ERR("try_consolidation returned %08x\n", Status);
3259 Vcb->balance.status = Status;
3260 goto end;
3261 } else
3262 c->balance_num = Vcb->balance.balance_num;
3263
3264 consolidated = true;
3265
3266 if (Vcb->balance.stopping)
3267 goto end;
3268 }
3269 }
3270 }
3271
3272 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3273
3274 le = chunks.Flink;
3275 while (le != &chunks) {
3276 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3277
3278 c->reloc = true;
3279
3280 le = le->Flink;
3281 }
3282
3283 ExReleaseResourceLite(&Vcb->chunk_lock);
3284
3285 // do data chunks before metadata
3286 le = chunks.Flink;
3287 while (le != &chunks) {
3288 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3289 LIST_ENTRY* le2 = le->Flink;
3290
3291 if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3292 bool changed;
3293
3294 do {
3295 changed = false;
3296
3297 Status = balance_data_chunk(Vcb, c, &changed);
3298 if (!NT_SUCCESS(Status)) {
3299 ERR("balance_data_chunk returned %08x\n", Status);
3300 Vcb->balance.status = Status;
3301 goto end;
3302 }
3303
3304 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3305
3306 if (Vcb->readonly)
3307 Vcb->balance.stopping = true;
3308
3309 if (Vcb->balance.stopping)
3310 break;
3311 } while (changed);
3312
3313 c->changed = true;
3314 c->space_changed = true;
3315 }
3316
3317 if (Vcb->balance.stopping)
3318 goto end;
3319
3320 if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3321 (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3322 RemoveEntryList(&c->list_entry_balance);
3323 c->list_entry_balance.Flink = NULL;
3324
3325 Vcb->balance.chunks_left--;
3326 }
3327
3328 le = le2;
3329 }
3330
3331 // do metadata chunks
3332 while (!IsListEmpty(&chunks)) {
3333 chunk* c;
3334 bool changed;
3335
3336 le = RemoveHeadList(&chunks);
3337 c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3338
3339 if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3340 do {
3341 Status = balance_metadata_chunk(Vcb, c, &changed);
3342 if (!NT_SUCCESS(Status)) {
3343 ERR("balance_metadata_chunk returned %08x\n", Status);
3344 Vcb->balance.status = Status;
3345 goto end;
3346 }
3347
3348 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3349
3350 if (Vcb->readonly)
3351 Vcb->balance.stopping = true;
3352
3353 if (Vcb->balance.stopping)
3354 break;
3355 } while (changed);
3356
3357 c->changed = true;
3358 c->space_changed = true;
3359 }
3360
3361 if (Vcb->balance.stopping)
3362 break;
3363
3364 c->list_entry_balance.Flink = NULL;
3365
3366 Vcb->balance.chunks_left--;
3367 }
3368
3369 end:
3370 if (!Vcb->readonly) {
3371 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3372 le = chunks.Flink;
3373 while (le != &chunks) {
3374 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3375 c->reloc = false;
3376
3377 le = le->Flink;
3378 c->list_entry_balance.Flink = NULL;
3379 }
3380
3381 if (old_data_flags != 0)
3382 Vcb->data_flags = old_data_flags;
3383
3384 if (old_metadata_flags != 0)
3385 Vcb->metadata_flags = old_metadata_flags;
3386
3387 if (old_system_flags != 0)
3388 Vcb->system_flags = old_system_flags;
3389 }
3390
3391 if (Vcb->balance.removing) {
3392 device* dev = NULL;
3393
3394 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3395
3396 le = Vcb->devices.Flink;
3397 while (le != &Vcb->devices) {
3398 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3399
3400 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3401 dev = dev2;
3402 break;
3403 }
3404
3405 le = le->Flink;
3406 }
3407
3408 if (dev) {
3409 if (Vcb->balance.chunks_left == 0) {
3410 Status = finish_removing_device(Vcb, dev);
3411
3412 if (!NT_SUCCESS(Status)) {
3413 ERR("finish_removing_device returned %08x\n", Status);
3414 dev->reloc = false;
3415 }
3416 } else
3417 dev->reloc = false;
3418 }
3419
3420 ExReleaseResourceLite(&Vcb->tree_lock);
3421 } else if (Vcb->balance.shrinking) {
3422 device* dev = NULL;
3423
3424 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3425
3426 le = Vcb->devices.Flink;
3427 while (le != &Vcb->devices) {
3428 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3429
3430 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3431 dev = dev2;
3432 break;
3433 }
3434
3435 le = le->Flink;
3436 }
3437
3438 if (!dev) {
3439 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3440 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3441 }
3442
3443 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3444 if (dev) {
3445 Status = regenerate_space_list(Vcb, dev);
3446 if (!NT_SUCCESS(Status))
3447 WARN("regenerate_space_list returned %08x\n", Status);
3448 }
3449 } else {
3450 uint64_t old_size;
3451
3452 old_size = dev->devitem.num_bytes;
3453 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3454
3455 Status = update_dev_item(Vcb, dev, NULL);
3456 if (!NT_SUCCESS(Status)) {
3457 ERR("update_dev_item returned %08x\n", Status);
3458 dev->devitem.num_bytes = old_size;
3459 Vcb->balance.status = Status;
3460
3461 Status = regenerate_space_list(Vcb, dev);
3462 if (!NT_SUCCESS(Status))
3463 WARN("regenerate_space_list returned %08x\n", Status);
3464 } else {
3465 Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3466
3467 Status = do_write(Vcb, NULL);
3468 if (!NT_SUCCESS(Status))
3469 ERR("do_write returned %08x\n", Status);
3470
3471 free_trees(Vcb);
3472 }
3473 }
3474
3475 ExReleaseResourceLite(&Vcb->tree_lock);
3476
3477 if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3478 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3479 } else {
3480 Status = remove_balance_item(Vcb);
3481 if (!NT_SUCCESS(Status)) {
3482 ERR("remove_balance_item returned %08x\n", Status);
3483 goto end;
3484 }
3485 }
3486
3487 if (Vcb->trim && !Vcb->options.no_trim) {
3488 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3489
3490 le = Vcb->devices.Flink;
3491 while (le != &Vcb->devices) {
3492 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3493
3494 if (dev2->devobj && !dev2->readonly && dev2->trim)
3495 trim_unalloc_space(Vcb, dev2);
3496
3497 le = le->Flink;
3498 }
3499
3500 ExReleaseResourceLite(&Vcb->tree_lock);
3501 }
3502 }
3503
3504 ZwClose(Vcb->balance.thread);
3505 Vcb->balance.thread = NULL;
3506
3507 KeSetEvent(&Vcb->balance.finished, 0, false);
3508 }
3509
3510 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3511 NTSTATUS Status;
3512 btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3513 OBJECT_ATTRIBUTES oa;
3514 uint8_t i;
3515
3516 if (length < sizeof(btrfs_start_balance) || !data)
3517 return STATUS_INVALID_PARAMETER;
3518
3519 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3520 return STATUS_PRIVILEGE_NOT_HELD;
3521
3522 if (Vcb->locked) {
3523 WARN("cannot start balance while locked\n");
3524 return STATUS_DEVICE_NOT_READY;
3525 }
3526
3527 if (Vcb->scrub.thread) {
3528 WARN("cannot start balance while scrub running\n");
3529 return STATUS_DEVICE_NOT_READY;
3530 }
3531
3532 if (Vcb->balance.thread) {
3533 WARN("balance already running\n");
3534 return STATUS_DEVICE_NOT_READY;
3535 }
3536
3537 if (Vcb->readonly)
3538 return STATUS_MEDIA_WRITE_PROTECTED;
3539
3540 if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3541 !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3542 !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3543 return STATUS_SUCCESS;
3544
3545 for (i = 0; i < 3; i++) {
3546 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3547 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3548 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3549 BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE;
3550
3551 if (bsb->opts[i].profiles == 0)
3552 return STATUS_INVALID_PARAMETER;
3553 }
3554
3555 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3556 if (bsb->opts[i].devid == 0)
3557 return STATUS_INVALID_PARAMETER;
3558 }
3559
3560 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3561 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3562 return STATUS_INVALID_PARAMETER;
3563 }
3564
3565 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3566 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3567 return STATUS_INVALID_PARAMETER;
3568 }
3569
3570 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3571 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3572 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3573
3574 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3575 return STATUS_INVALID_PARAMETER;
3576 }
3577
3578 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3579 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3580 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3581
3582 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3583 return STATUS_INVALID_PARAMETER;
3584 }
3585
3586 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3587 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3588 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3589
3590 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3591 return STATUS_INVALID_PARAMETER;
3592 }
3593
3594 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3595 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3596 bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3597 bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3598 bsb->opts[i].convert != BLOCK_FLAG_SINGLE)
3599 return STATUS_INVALID_PARAMETER;
3600 }
3601 }
3602 }
3603
3604 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3605 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3606 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3607
3608 Vcb->balance.paused = false;
3609 Vcb->balance.removing = false;
3610 Vcb->balance.shrinking = false;
3611 Vcb->balance.status = STATUS_SUCCESS;
3612 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3613
3614 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3615
3616 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3617 if (!NT_SUCCESS(Status)) {
3618 ERR("PsCreateSystemThread returned %08x\n", Status);
3619 return Status;
3620 }
3621
3622 return STATUS_SUCCESS;
3623 }
3624
3625 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3626 KEY searchkey;
3627 traverse_ptr tp;
3628 NTSTATUS Status;
3629 BALANCE_ITEM* bi;
3630 OBJECT_ATTRIBUTES oa;
3631 int i;
3632
3633 searchkey.obj_id = BALANCE_ITEM_ID;
3634 searchkey.obj_type = TYPE_TEMP_ITEM;
3635 searchkey.offset = 0;
3636
3637 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3638 if (!NT_SUCCESS(Status)) {
3639 ERR("find_item returned %08x\n", Status);
3640 return Status;
3641 }
3642
3643 if (keycmp(tp.item->key, searchkey)) {
3644 TRACE("no balance item found\n");
3645 return STATUS_NOT_FOUND;
3646 }
3647
3648 if (tp.item->size < sizeof(BALANCE_ITEM)) {
3649 WARN("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3650 tp.item->size, sizeof(BALANCE_ITEM));
3651 return STATUS_INTERNAL_ERROR;
3652 }
3653
3654 bi = (BALANCE_ITEM*)tp.item->data;
3655
3656 if (bi->flags & BALANCE_FLAGS_DATA)
3657 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3658
3659 if (bi->flags & BALANCE_FLAGS_METADATA)
3660 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3661
3662 if (bi->flags & BALANCE_FLAGS_SYSTEM)
3663 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3664
3665 // do the heuristics that Linux driver does
3666
3667 for (i = 0; i < 3; i++) {
3668 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3669 // if converting, don't redo chunks already done
3670
3671 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3672 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3673
3674 // don't balance chunks more than 90% filled - presumably these
3675 // have already been done
3676
3677 if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3678 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3679 ) {
3680 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3681 Vcb->balance.opts[i].usage_start = 0;
3682 Vcb->balance.opts[i].usage_end = 90;
3683 }
3684 }
3685 }
3686
3687 if (Vcb->readonly || Vcb->options.skip_balance)
3688 Vcb->balance.paused = true;
3689 else
3690 Vcb->balance.paused = false;
3691
3692 Vcb->balance.removing = false;
3693 Vcb->balance.shrinking = false;
3694 Vcb->balance.status = STATUS_SUCCESS;
3695 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3696
3697 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3698
3699 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3700 if (!NT_SUCCESS(Status)) {
3701 ERR("PsCreateSystemThread returned %08x\n", Status);
3702 return Status;
3703 }
3704
3705 return STATUS_SUCCESS;
3706 }
3707
3708 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3709 btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3710
3711 if (length < sizeof(btrfs_query_balance) || !data)
3712 return STATUS_INVALID_PARAMETER;
3713
3714 if (!Vcb->balance.thread) {
3715 bqb->status = BTRFS_BALANCE_STOPPED;
3716
3717 if (!NT_SUCCESS(Vcb->balance.status)) {
3718 bqb->status |= BTRFS_BALANCE_ERROR;
3719 bqb->error = Vcb->balance.status;
3720 }
3721
3722 return STATUS_SUCCESS;
3723 }
3724
3725 bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3726
3727 if (Vcb->balance.removing)
3728 bqb->status |= BTRFS_BALANCE_REMOVAL;
3729
3730 if (Vcb->balance.shrinking)
3731 bqb->status |= BTRFS_BALANCE_SHRINKING;
3732
3733 if (!NT_SUCCESS(Vcb->balance.status))
3734 bqb->status |= BTRFS_BALANCE_ERROR;
3735
3736 bqb->chunks_left = Vcb->balance.chunks_left;
3737 bqb->total_chunks = Vcb->balance.total_chunks;
3738 bqb->error = Vcb->balance.status;
3739 RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3740 RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3741 RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3742
3743 return STATUS_SUCCESS;
3744 }
3745
3746 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3747 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3748 return STATUS_PRIVILEGE_NOT_HELD;
3749
3750 if (!Vcb->balance.thread)
3751 return STATUS_DEVICE_NOT_READY;
3752
3753 if (Vcb->balance.paused)
3754 return STATUS_DEVICE_NOT_READY;
3755
3756 Vcb->balance.paused = true;
3757 KeClearEvent(&Vcb->balance.event);
3758
3759 return STATUS_SUCCESS;
3760 }
3761
3762 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3763 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3764 return STATUS_PRIVILEGE_NOT_HELD;
3765
3766 if (!Vcb->balance.thread)
3767 return STATUS_DEVICE_NOT_READY;
3768
3769 if (!Vcb->balance.paused)
3770 return STATUS_DEVICE_NOT_READY;
3771
3772 if (Vcb->readonly)
3773 return STATUS_MEDIA_WRITE_PROTECTED;
3774
3775 Vcb->balance.paused = false;
3776 KeSetEvent(&Vcb->balance.event, 0, false);
3777
3778 return STATUS_SUCCESS;
3779 }
3780
3781 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3782 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3783 return STATUS_PRIVILEGE_NOT_HELD;
3784
3785 if (!Vcb->balance.thread)
3786 return STATUS_DEVICE_NOT_READY;
3787
3788 Vcb->balance.paused = false;
3789 Vcb->balance.stopping = true;
3790 Vcb->balance.status = STATUS_SUCCESS;
3791 KeSetEvent(&Vcb->balance.event, 0, false);
3792
3793 return STATUS_SUCCESS;
3794 }
3795
3796 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3797 uint64_t devid;
3798 LIST_ENTRY* le;
3799 device* dev = NULL;
3800 NTSTATUS Status;
3801 int i;
3802 uint64_t num_rw_devices;
3803 OBJECT_ATTRIBUTES oa;
3804
3805 TRACE("(%p, %p, %x)\n", Vcb, data, length);
3806
3807 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3808 return STATUS_PRIVILEGE_NOT_HELD;
3809
3810 if (length < sizeof(uint64_t))
3811 return STATUS_INVALID_PARAMETER;
3812
3813 devid = *(uint64_t*)data;
3814
3815 ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3816
3817 if (Vcb->readonly) {
3818 ExReleaseResourceLite(&Vcb->tree_lock);
3819 return STATUS_MEDIA_WRITE_PROTECTED;
3820 }
3821
3822 num_rw_devices = 0;
3823
3824 le = Vcb->devices.Flink;
3825 while (le != &Vcb->devices) {
3826 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3827
3828 if (dev2->devitem.dev_id == devid)
3829 dev = dev2;
3830
3831 if (!dev2->readonly)
3832 num_rw_devices++;
3833
3834 le = le->Flink;
3835 }
3836
3837 if (!dev) {
3838 ExReleaseResourceLite(&Vcb->tree_lock);
3839 WARN("device %I64x not found\n", devid);
3840 return STATUS_NOT_FOUND;
3841 }
3842
3843 if (!dev->readonly) {
3844 if (num_rw_devices == 1) {
3845 ExReleaseResourceLite(&Vcb->tree_lock);
3846 WARN("not removing last non-readonly device\n");
3847 return STATUS_INVALID_PARAMETER;
3848 }
3849
3850 if (num_rw_devices == 4 &&
3851 ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3852 (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6))
3853 ) {
3854 ExReleaseResourceLite(&Vcb->tree_lock);
3855 ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n");
3856 return STATUS_CANNOT_DELETE;
3857 }
3858
3859 if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) {
3860 ExReleaseResourceLite(&Vcb->tree_lock);
3861 ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n");
3862 return STATUS_CANNOT_DELETE;
3863 }
3864
3865 if (num_rw_devices == 2 &&
3866 ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3867 (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3868 ) {
3869 ExReleaseResourceLite(&Vcb->tree_lock);
3870 ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3871 return STATUS_CANNOT_DELETE;
3872 }
3873 }
3874
3875 ExReleaseResourceLite(&Vcb->tree_lock);
3876
3877 if (Vcb->balance.thread) {
3878 WARN("balance already running\n");
3879 return STATUS_DEVICE_NOT_READY;
3880 }
3881
3882 dev->reloc = true;
3883
3884 RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3885
3886 for (i = 0; i < 3; i++) {
3887 Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3888 Vcb->balance.opts[i].devid = devid;
3889 }
3890
3891 Vcb->balance.paused = false;
3892 Vcb->balance.removing = true;
3893 Vcb->balance.shrinking = false;
3894 Vcb->balance.status = STATUS_SUCCESS;
3895 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3896
3897 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3898
3899 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3900 if (!NT_SUCCESS(Status)) {
3901 ERR("PsCreateSystemThread returned %08x\n", Status);
3902 dev->reloc = false;
3903 return Status;
3904 }
3905
3906 return STATUS_SUCCESS;
3907 }