[BTRFS]
[reactos.git] / reactos / drivers / filesystems / btrfs / balance.c
1 /* Copyright (c) Mark Harmstone 2016-17
2 *
3 * This file is part of WinBtrfs.
4 *
5 * WinBtrfs is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public Licence as published by
7 * the Free Software Foundation, either version 3 of the Licence, or
8 * (at your option) any later version.
9 *
10 * WinBtrfs is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public Licence for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public Licence
16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21
22 typedef struct {
23 UINT64 address;
24 UINT64 new_address;
25 tree_header* data;
26 EXTENT_ITEM* ei;
27 tree* t;
28 BOOL system;
29 LIST_ENTRY refs;
30 LIST_ENTRY list_entry;
31 } metadata_reloc;
32
33 typedef struct {
34 UINT8 type;
35 UINT64 hash;
36
37 union {
38 TREE_BLOCK_REF tbr;
39 SHARED_BLOCK_REF sbr;
40 };
41
42 metadata_reloc* parent;
43 BOOL top;
44 LIST_ENTRY list_entry;
45 } metadata_reloc_ref;
46
47 typedef struct {
48 UINT64 address;
49 UINT64 size;
50 UINT64 new_address;
51 chunk* newchunk;
52 EXTENT_ITEM* ei;
53 LIST_ENTRY refs;
54 LIST_ENTRY list_entry;
55 } data_reloc;
56
57 typedef struct {
58 UINT8 type;
59 UINT64 hash;
60
61 union {
62 EXTENT_DATA_REF edr;
63 SHARED_DATA_REF sdr;
64 };
65
66 metadata_reloc* parent;
67 LIST_ENTRY list_entry;
68 } data_reloc_ref;
69
70 #ifndef _MSC_VER // not in mingw yet
71 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
72 #endif
73
74 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
75
76 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
77 BOOL skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
78 NTSTATUS Status;
79 metadata_reloc* mr;
80 EXTENT_ITEM* ei;
81 UINT16 len;
82 UINT64 inline_rc;
83 UINT8* ptr;
84
85 mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
86 if (!mr) {
87 ERR("out of memory\n");
88 return STATUS_INSUFFICIENT_RESOURCES;
89 }
90
91 mr->address = tp->item->key.obj_id;
92 mr->data = NULL;
93 mr->ei = (EXTENT_ITEM*)tp->item->data;
94 mr->system = FALSE;
95 InitializeListHead(&mr->refs);
96
97 Status = delete_tree_item(Vcb, tp);
98 if (!NT_SUCCESS(Status)) {
99 ERR("delete_tree_item returned %08x\n", Status);
100 ExFreePool(mr);
101 return Status;
102 }
103
104 if (!c)
105 c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
106
107 if (c) {
108 ExAcquireResourceExclusiveLite(&c->lock, TRUE);
109
110 c->used -= Vcb->superblock.node_size;
111
112 space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
113
114 ExReleaseResourceLite(&c->lock);
115 }
116
117 ei = (EXTENT_ITEM*)tp->item->data;
118 inline_rc = 0;
119
120 len = tp->item->size - sizeof(EXTENT_ITEM);
121 ptr = (UINT8*)tp->item->data + sizeof(EXTENT_ITEM);
122 if (!skinny) {
123 len -= sizeof(EXTENT_ITEM2);
124 ptr += sizeof(EXTENT_ITEM2);
125 }
126
127 while (len > 0) {
128 UINT8 secttype = *ptr;
129 UINT16 sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
130 metadata_reloc_ref* ref;
131
132 len--;
133
134 if (sectlen > len) {
135 ERR("(%llx,%x,%llx): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
136 return STATUS_INTERNAL_ERROR;
137 }
138
139 if (sectlen == 0) {
140 ERR("(%llx,%x,%llx): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
141 return STATUS_INTERNAL_ERROR;
142 }
143
144 ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
145 if (!ref) {
146 ERR("out of memory\n");
147 return STATUS_INSUFFICIENT_RESOURCES;
148 }
149
150 if (secttype == TYPE_TREE_BLOCK_REF) {
151 ref->type = TYPE_TREE_BLOCK_REF;
152 RtlCopyMemory(&ref->tbr, ptr + sizeof(UINT8), sizeof(TREE_BLOCK_REF));
153 inline_rc++;
154 } else if (secttype == TYPE_SHARED_BLOCK_REF) {
155 ref->type = TYPE_SHARED_BLOCK_REF;
156 RtlCopyMemory(&ref->sbr, ptr + sizeof(UINT8), sizeof(SHARED_BLOCK_REF));
157 inline_rc++;
158 } else {
159 ERR("unexpected tree type %x\n", secttype);
160 ExFreePool(ref);
161 return STATUS_INTERNAL_ERROR;
162 }
163
164 ref->parent = NULL;
165 ref->top = FALSE;
166 InsertTailList(&mr->refs, &ref->list_entry);
167
168 len -= sectlen;
169 ptr += sizeof(UINT8) + sectlen;
170 }
171
172 if (inline_rc < ei->refcount) { // look for non-inline entries
173 traverse_ptr tp2 = *tp, next_tp;
174
175 while (find_next_item(Vcb, &tp2, &next_tp, FALSE, NULL)) {
176 tp2 = next_tp;
177
178 if (tp2.item->key.obj_id == tp->item->key.obj_id) {
179 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
180 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
181 if (!ref) {
182 ERR("out of memory\n");
183 return STATUS_INSUFFICIENT_RESOURCES;
184 }
185
186 ref->type = TYPE_TREE_BLOCK_REF;
187 ref->tbr.offset = tp2.item->key.offset;
188 ref->parent = NULL;
189 ref->top = FALSE;
190 InsertTailList(&mr->refs, &ref->list_entry);
191
192 Status = delete_tree_item(Vcb, &tp2);
193 if (!NT_SUCCESS(Status)) {
194 ERR("delete_tree_item returned %08x\n", Status);
195 return Status;
196 }
197 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
198 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
199 if (!ref) {
200 ERR("out of memory\n");
201 return STATUS_INSUFFICIENT_RESOURCES;
202 }
203
204 ref->type = TYPE_SHARED_BLOCK_REF;
205 ref->sbr.offset = tp2.item->key.offset;
206 ref->parent = NULL;
207 ref->top = FALSE;
208 InsertTailList(&mr->refs, &ref->list_entry);
209
210 Status = delete_tree_item(Vcb, &tp2);
211 if (!NT_SUCCESS(Status)) {
212 ERR("delete_tree_item returned %08x\n", Status);
213 return Status;
214 }
215 }
216 } else
217 break;
218 }
219 }
220
221 InsertTailList(items, &mr->list_entry);
222
223 if (mr2)
224 *mr2 = mr;
225
226 return STATUS_SUCCESS;
227 }
228
229 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
230 UINT64 address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
231 LIST_ENTRY* le;
232 KEY searchkey;
233 traverse_ptr tp;
234 BOOL skinny = FALSE;
235 NTSTATUS Status;
236
237 le = items->Flink;
238 while (le != items) {
239 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
240
241 if (mr->address == address) {
242 *mr2 = mr;
243 return STATUS_SUCCESS;
244 }
245
246 le = le->Flink;
247 }
248
249 searchkey.obj_id = address;
250 searchkey.obj_type = TYPE_METADATA_ITEM;
251 searchkey.offset = 0xffffffffffffffff;
252
253 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
254 if (!NT_SUCCESS(Status)) {
255 ERR("find_item returned %08x\n", Status);
256 return Status;
257 }
258
259 if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
260 skinny = TRUE;
261 else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
262 tp.item->size >= sizeof(EXTENT_ITEM)) {
263 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
264
265 if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
266 ERR("EXTENT_ITEM for %llx found, but tree flag not set\n", address);
267 return STATUS_INTERNAL_ERROR;
268 }
269 } else {
270 ERR("could not find valid EXTENT_ITEM for address %llx\n", address);
271 return STATUS_INTERNAL_ERROR;
272 }
273
274 Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
275 if (!NT_SUCCESS(Status)) {
276 ERR("add_metadata_reloc returned %08x\n", Status);
277 return Status;
278 }
279
280 return STATUS_SUCCESS;
281 }
282
283 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
284 LIST_ENTRY newlist, *le;
285
286 if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
287 return;
288
289 // insertion sort
290
291 InitializeListHead(&newlist);
292
293 while (!IsListEmpty(&mr->refs)) {
294 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
295 BOOL inserted = FALSE;
296
297 if (ref->type == TYPE_TREE_BLOCK_REF)
298 ref->hash = ref->tbr.offset;
299 else if (ref->type == TYPE_SHARED_BLOCK_REF)
300 ref->hash = ref->parent->new_address;
301
302 le = newlist.Flink;
303 while (le != &newlist) {
304 metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
305
306 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
307 InsertHeadList(le->Blink, &ref->list_entry);
308 inserted = TRUE;
309 break;
310 }
311
312 le = le->Flink;
313 }
314
315 if (!inserted)
316 InsertTailList(&newlist, &ref->list_entry);
317 }
318
319 newlist.Flink->Blink = &mr->refs;
320 newlist.Blink->Flink = &mr->refs;
321 mr->refs.Flink = newlist.Flink;
322 mr->refs.Blink = newlist.Blink;
323 }
324
325 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
326 NTSTATUS Status;
327 LIST_ENTRY* le;
328 UINT64 rc = 0;
329 UINT16 inline_len;
330 BOOL all_inline = TRUE;
331 metadata_reloc_ref* first_noninline = NULL;
332 EXTENT_ITEM* ei;
333 UINT8* ptr;
334
335 inline_len = sizeof(EXTENT_ITEM);
336 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
337 inline_len += sizeof(EXTENT_ITEM2);
338
339 sort_metadata_reloc_refs(mr);
340
341 le = mr->refs.Flink;
342 while (le != &mr->refs) {
343 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
344 UINT16 extlen = 0;
345
346 rc++;
347
348 if (ref->type == TYPE_TREE_BLOCK_REF)
349 extlen += sizeof(TREE_BLOCK_REF);
350 else if (ref->type == TYPE_SHARED_BLOCK_REF)
351 extlen += sizeof(SHARED_BLOCK_REF);
352
353 if (all_inline) {
354 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
355 all_inline = FALSE;
356 first_noninline = ref;
357 } else
358 inline_len += extlen + 1;
359 }
360
361 le = le->Flink;
362 }
363
364 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
365 if (!ei) {
366 ERR("out of memory\n");
367 return STATUS_INSUFFICIENT_RESOURCES;
368 }
369
370 ei->refcount = rc;
371 ei->generation = mr->ei->generation;
372 ei->flags = mr->ei->flags;
373 ptr = (UINT8*)&ei[1];
374
375 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
376 EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
377
378 ei2->firstitem = *(KEY*)&mr->data[1];
379 ei2->level = mr->data->level;
380
381 ptr += sizeof(EXTENT_ITEM2);
382 }
383
384 le = mr->refs.Flink;
385 while (le != &mr->refs) {
386 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
387
388 if (ref == first_noninline)
389 break;
390
391 *ptr = ref->type;
392 ptr++;
393
394 if (ref->type == TYPE_TREE_BLOCK_REF) {
395 TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
396
397 tbr->offset = ref->tbr.offset;
398
399 ptr += sizeof(TREE_BLOCK_REF);
400 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
401 SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
402
403 sbr->offset = ref->parent->new_address;
404
405 ptr += sizeof(SHARED_BLOCK_REF);
406 }
407
408 le = le->Flink;
409 }
410
411 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
412 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
413 else
414 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
415
416 if (!NT_SUCCESS(Status)) {
417 ERR("insert_tree_item returned %08x\n", Status);
418 ExFreePool(ei);
419 return Status;
420 }
421
422 if (!all_inline) {
423 le = &first_noninline->list_entry;
424
425 while (le != &mr->refs) {
426 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
427
428 if (ref->type == TYPE_TREE_BLOCK_REF) {
429 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
430 if (!NT_SUCCESS(Status)) {
431 ERR("insert_tree_item returned %08x\n", Status);
432 return Status;
433 }
434 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
435 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
436 if (!NT_SUCCESS(Status)) {
437 ERR("insert_tree_item returned %08x\n", Status);
438 return Status;
439 }
440 }
441
442 le = le->Flink;
443 }
444 }
445
446 if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
447 if (mr->data->level > 0) {
448 UINT16 i;
449 internal_node* in = (internal_node*)&mr->data[1];
450
451 for (i = 0; i < mr->data->num_items; i++) {
452 UINT64 sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
453
454 if (sbrrc > 0) {
455 SHARED_BLOCK_REF sbr;
456
457 sbr.offset = mr->new_address;
458
459 Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
460 if (!NT_SUCCESS(Status)) {
461 ERR("increase_extent_refcount returned %08x\n", Status);
462 return Status;
463 }
464
465 sbr.offset = mr->address;
466
467 Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
468 sbr.offset, FALSE, NULL);
469 if (!NT_SUCCESS(Status)) {
470 ERR("decrease_extent_refcount returned %08x\n", Status);
471 return Status;
472 }
473 }
474 }
475 } else {
476 UINT16 i;
477 leaf_node* ln = (leaf_node*)&mr->data[1];
478
479 for (i = 0; i < mr->data->num_items; i++) {
480 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
481 EXTENT_DATA* ed = (EXTENT_DATA*)((UINT8*)mr->data + sizeof(tree_header) + ln[i].offset);
482
483 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
484 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
485
486 if (ed2->size > 0) { // not sparse
487 UINT32 sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
488
489 if (sdrrc > 0) {
490 SHARED_DATA_REF sdr;
491 chunk* c;
492
493 sdr.offset = mr->new_address;
494 sdr.count = sdrrc;
495
496 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
497 if (!NT_SUCCESS(Status)) {
498 ERR("increase_extent_refcount returned %08x\n", Status);
499 return Status;
500 }
501
502 sdr.offset = mr->address;
503
504 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
505 sdr.offset, FALSE, NULL);
506 if (!NT_SUCCESS(Status)) {
507 ERR("decrease_extent_refcount returned %08x\n", Status);
508 return Status;
509 }
510
511 c = get_chunk_from_address(Vcb, ed2->address);
512
513 if (c) {
514 // check changed_extents
515
516 ExAcquireResourceExclusiveLite(&c->changed_extents_lock, TRUE);
517
518 le = c->changed_extents.Flink;
519
520 while (le != &c->changed_extents) {
521 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
522
523 if (ce->address == ed2->address) {
524 LIST_ENTRY* le2;
525
526 le2 = ce->refs.Flink;
527 while (le2 != &ce->refs) {
528 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
529
530 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
531 cer->sdr.offset = mr->new_address;
532 break;
533 }
534
535 le2 = le2->Flink;
536 }
537
538 le2 = ce->old_refs.Flink;
539 while (le2 != &ce->old_refs) {
540 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
541
542 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
543 cer->sdr.offset = mr->new_address;
544 break;
545 }
546
547 le2 = le2->Flink;
548 }
549
550 break;
551 }
552
553 le = le->Flink;
554 }
555
556 ExReleaseResourceLite(&c->changed_extents_lock);
557 }
558 }
559 }
560 }
561 }
562 }
563 }
564 }
565
566 return STATUS_SUCCESS;
567 }
568
569 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
570 LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
571 LIST_ENTRY tree_writes, *le;
572 NTSTATUS Status;
573 traverse_ptr tp;
574 UINT8 level, max_level = 0;
575 chunk* newchunk = NULL;
576
577 InitializeListHead(&tree_writes);
578
579 le = items->Flink;
580 while (le != items) {
581 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
582 LIST_ENTRY* le2;
583 chunk* pc;
584
585 mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
586 if (!mr->data) {
587 ERR("out of memory\n");
588 return STATUS_INSUFFICIENT_RESOURCES;
589 }
590
591 Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, TRUE, (UINT8*)mr->data,
592 c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, FALSE, NormalPagePriority);
593 if (!NT_SUCCESS(Status)) {
594 ERR("read_data returned %08x\n", Status);
595 return Status;
596 }
597
598 if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
599 mr->system = TRUE;
600
601 if (data_items && mr->data->level == 0) {
602 le2 = data_items->Flink;
603 while (le2 != data_items) {
604 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
605 leaf_node* ln = (leaf_node*)&mr->data[1];
606 UINT16 i;
607
608 for (i = 0; i < mr->data->num_items; i++) {
609 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
610 EXTENT_DATA* ed = (EXTENT_DATA*)((UINT8*)mr->data + sizeof(tree_header) + ln[i].offset);
611
612 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
613 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
614
615 if (ed2->address == dr->address)
616 ed2->address = dr->new_address;
617 }
618 }
619 }
620
621 le2 = le2->Flink;
622 }
623 }
624
625 if (mr->data->level > max_level)
626 max_level = mr->data->level;
627
628 le2 = mr->refs.Flink;
629 while (le2 != &mr->refs) {
630 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
631
632 if (ref->type == TYPE_TREE_BLOCK_REF) {
633 KEY* firstitem;
634 root* r = NULL;
635 LIST_ENTRY* le3;
636 tree* t;
637
638 firstitem = (KEY*)&mr->data[1];
639
640 le3 = Vcb->roots.Flink;
641 while (le3 != &Vcb->roots) {
642 root* r2 = CONTAINING_RECORD(le3, root, list_entry);
643
644 if (r2->id == ref->tbr.offset) {
645 r = r2;
646 break;
647 }
648
649 le3 = le3->Flink;
650 }
651
652 if (!r) {
653 ERR("could not find subvol with id %llx\n", ref->tbr.offset);
654 return STATUS_INTERNAL_ERROR;
655 }
656
657 Status = find_item_to_level(Vcb, r, &tp, firstitem, FALSE, mr->data->level + 1, NULL);
658 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
659 ERR("find_item_to_level returned %08x\n", Status);
660 return Status;
661 }
662
663 t = tp.tree;
664 while (t && t->header.level < mr->data->level + 1) {
665 t = t->parent;
666 }
667
668 if (!t)
669 ref->top = TRUE;
670 else {
671 metadata_reloc* mr2;
672
673 Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
674 if (!NT_SUCCESS(Status)) {
675 ERR("add_metadata_reloc_parent returned %08x\n", Status);
676 return Status;
677 }
678
679 ref->parent = mr2;
680 }
681 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
682 metadata_reloc* mr2;
683
684 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
685 if (!NT_SUCCESS(Status)) {
686 ERR("add_metadata_reloc_parent returned %08x\n", Status);
687 return Status;
688 }
689
690 ref->parent = mr2;
691 }
692
693 le2 = le2->Flink;
694 }
695
696 le = le->Flink;
697 }
698
699 le = items->Flink;
700 while (le != items) {
701 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
702 LIST_ENTRY* le2;
703 UINT32 hash;
704
705 mr->t = NULL;
706
707 hash = calc_crc32c(0xffffffff, (UINT8*)&mr->address, sizeof(UINT64));
708
709 le2 = Vcb->trees_ptrs[hash >> 24];
710
711 if (le2) {
712 while (le2 != &Vcb->trees_hash) {
713 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
714
715 if (t->header.address == mr->address) {
716 mr->t = t;
717 break;
718 } else if (t->hash > hash)
719 break;
720
721 le2 = le2->Flink;
722 }
723 }
724
725 le = le->Flink;
726 }
727
728 for (level = 0; level <= max_level; level++) {
729 le = items->Flink;
730 while (le != items) {
731 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
732
733 if (mr->data->level == level) {
734 BOOL done = FALSE;
735 LIST_ENTRY* le2;
736 tree_write* tw;
737 UINT64 flags;
738 tree* t3;
739
740 if (mr->system)
741 flags = Vcb->system_flags;
742 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
743 flags = Vcb->data_flags;
744 else
745 flags = Vcb->metadata_flags;
746
747 if (newchunk) {
748 ExAcquireResourceExclusiveLite(&newchunk->lock, TRUE);
749
750 if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
751 newchunk->used += Vcb->superblock.node_size;
752 space_list_subtract(newchunk, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
753 done = TRUE;
754 }
755
756 ExReleaseResourceLite(&newchunk->lock);
757 }
758
759 if (!done) {
760 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
761
762 le2 = Vcb->chunks.Flink;
763 while (le2 != &Vcb->chunks) {
764 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
765
766 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
767 ExAcquireResourceExclusiveLite(&c2->lock, TRUE);
768
769 if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
770 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
771 c2->used += Vcb->superblock.node_size;
772 space_list_subtract(c2, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
773 ExReleaseResourceLite(&c2->lock);
774 newchunk = c2;
775 done = TRUE;
776 break;
777 }
778 }
779
780 ExReleaseResourceLite(&c2->lock);
781 }
782
783 le2 = le2->Flink;
784 }
785
786 // allocate new chunk if necessary
787 if (!done) {
788 Status = alloc_chunk(Vcb, flags, &newchunk, FALSE);
789
790 if (!NT_SUCCESS(Status)) {
791 ERR("alloc_chunk returned %08x\n", Status);
792 ExReleaseResourceLite(&Vcb->chunk_lock);
793 goto end;
794 }
795
796 ExAcquireResourceExclusiveLite(&newchunk->lock, TRUE);
797
798 newchunk->balance_num = Vcb->balance.balance_num;
799
800 if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
801 ExReleaseResourceLite(&newchunk->lock);
802 ExReleaseResourceLite(&Vcb->chunk_lock);
803 ERR("could not find address in new chunk\n");
804 Status = STATUS_DISK_FULL;
805 goto end;
806 } else {
807 newchunk->used += Vcb->superblock.node_size;
808 space_list_subtract(newchunk, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
809 }
810
811 ExReleaseResourceLite(&newchunk->lock);
812 }
813
814 ExReleaseResourceLite(&Vcb->chunk_lock);
815 }
816
817 // update parents
818 le2 = mr->refs.Flink;
819 while (le2 != &mr->refs) {
820 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
821
822 if (ref->parent) {
823 UINT16 i;
824 internal_node* in = (internal_node*)&ref->parent->data[1];
825
826 for (i = 0; i < ref->parent->data->num_items; i++) {
827 if (in[i].address == mr->address) {
828 in[i].address = mr->new_address;
829 break;
830 }
831 }
832
833 if (ref->parent->t) {
834 LIST_ENTRY* le3;
835
836 le3 = ref->parent->t->itemlist.Flink;
837 while (le3 != &ref->parent->t->itemlist) {
838 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
839
840 if (!td->inserted && td->treeholder.address == mr->address)
841 td->treeholder.address = mr->new_address;
842
843 le3 = le3->Flink;
844 }
845 }
846 } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
847 LIST_ENTRY* le3;
848 root* r = NULL;
849
850 // alter ROOT_ITEM
851
852 le3 = Vcb->roots.Flink;
853 while (le3 != &Vcb->roots) {
854 root* r2 = CONTAINING_RECORD(le3, root, list_entry);
855
856 if (r2->id == ref->tbr.offset) {
857 r = r2;
858 break;
859 }
860
861 le3 = le3->Flink;
862 }
863
864 if (r) {
865 r->treeholder.address = mr->new_address;
866
867 if (r == Vcb->root_root)
868 Vcb->superblock.root_tree_addr = mr->new_address;
869 else if (r == Vcb->chunk_root)
870 Vcb->superblock.chunk_tree_addr = mr->new_address;
871 else if (r->root_item.block_number == mr->address) {
872 KEY searchkey;
873 ROOT_ITEM* ri;
874
875 r->root_item.block_number = mr->new_address;
876
877 searchkey.obj_id = r->id;
878 searchkey.obj_type = TYPE_ROOT_ITEM;
879 searchkey.offset = 0xffffffffffffffff;
880
881 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
882 if (!NT_SUCCESS(Status)) {
883 ERR("find_item returned %08x\n", Status);
884 goto end;
885 }
886
887 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
888 ERR("could not find ROOT_ITEM for tree %llx\n", searchkey.obj_id);
889 Status = STATUS_INTERNAL_ERROR;
890 goto end;
891 }
892
893 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
894 if (!ri) {
895 ERR("out of memory\n");
896 Status = STATUS_INSUFFICIENT_RESOURCES;
897 goto end;
898 }
899
900 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
901
902 Status = delete_tree_item(Vcb, &tp);
903 if (!NT_SUCCESS(Status)) {
904 ERR("delete_tree_item returned %08x\n", Status);
905 goto end;
906 }
907
908 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
909 if (!NT_SUCCESS(Status)) {
910 ERR("insert_tree_item returned %08x\n", Status);
911 goto end;
912 }
913 }
914 }
915 }
916
917 le2 = le2->Flink;
918 }
919
920 mr->data->address = mr->new_address;
921
922 t3 = mr->t;
923
924 while (t3) {
925 UINT8 h;
926 BOOL inserted;
927 tree* t4 = NULL;
928
929 // check if tree loaded more than once
930 if (t3->list_entry.Flink != &Vcb->trees_hash) {
931 tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
932
933 if (nt->header.address == t3->header.address)
934 t4 = nt;
935 }
936
937 t3->header.address = mr->new_address;
938
939 h = t3->hash >> 24;
940
941 if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
942 if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
943 Vcb->trees_ptrs[h] = NULL;
944 else {
945 tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
946
947 if (t2->hash >> 24 == h)
948 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
949 else
950 Vcb->trees_ptrs[h] = NULL;
951 }
952 }
953
954 RemoveEntryList(&t3->list_entry_hash);
955
956 t3->hash = calc_crc32c(0xffffffff, (UINT8*)&t3->header.address, sizeof(UINT64));
957 h = t3->hash >> 24;
958
959 if (!Vcb->trees_ptrs[h]) {
960 UINT8 h2 = h;
961
962 le2 = Vcb->trees_hash.Flink;
963
964 if (h2 > 0) {
965 h2--;
966 do {
967 if (Vcb->trees_ptrs[h2]) {
968 le2 = Vcb->trees_ptrs[h2];
969 break;
970 }
971
972 h2--;
973 } while (h2 > 0);
974 }
975 } else
976 le2 = Vcb->trees_ptrs[h];
977
978 inserted = FALSE;
979 while (le2 != &Vcb->trees_hash) {
980 tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
981
982 if (t2->hash >= t3->hash) {
983 InsertHeadList(le2->Blink, &t3->list_entry_hash);
984 inserted = TRUE;
985 break;
986 }
987
988 le2 = le2->Flink;
989 }
990
991 if (!inserted)
992 InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
993
994 if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
995 Vcb->trees_ptrs[h] = &t3->list_entry_hash;
996
997 if (data_items && level == 0) {
998 le2 = data_items->Flink;
999
1000 while (le2 != data_items) {
1001 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1002 LIST_ENTRY* le3 = t3->itemlist.Flink;
1003
1004 while (le3 != &t3->itemlist) {
1005 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1006
1007 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1008 EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1009
1010 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1011 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1012
1013 if (ed2->address == dr->address)
1014 ed2->address = dr->new_address;
1015 }
1016 }
1017
1018 le3 = le3->Flink;
1019 }
1020
1021 le2 = le2->Flink;
1022 }
1023 }
1024
1025 t3 = t4;
1026 }
1027
1028 *((UINT32*)mr->data) = ~calc_crc32c(0xffffffff, (UINT8*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1029
1030 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1031 if (!tw) {
1032 ERR("out of memory\n");
1033 Status = STATUS_INSUFFICIENT_RESOURCES;
1034 goto end;
1035 }
1036
1037 tw->address = mr->new_address;
1038 tw->length = Vcb->superblock.node_size;
1039 tw->data = (UINT8*)mr->data;
1040
1041 if (IsListEmpty(&tree_writes))
1042 InsertTailList(&tree_writes, &tw->list_entry);
1043 else {
1044 BOOL inserted = FALSE;
1045
1046 le2 = tree_writes.Flink;
1047 while (le2 != &tree_writes) {
1048 tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1049
1050 if (tw2->address > tw->address) {
1051 InsertHeadList(le2->Blink, &tw->list_entry);
1052 inserted = TRUE;
1053 break;
1054 }
1055
1056 le2 = le2->Flink;
1057 }
1058
1059 if (!inserted)
1060 InsertTailList(&tree_writes, &tw->list_entry);
1061 }
1062 }
1063
1064 le = le->Flink;
1065 }
1066 }
1067
1068 Status = do_tree_writes(Vcb, &tree_writes, TRUE);
1069 if (!NT_SUCCESS(Status)) {
1070 ERR("do_tree_writes returned %08x\n", Status);
1071 goto end;
1072 }
1073
1074 le = items->Flink;
1075 while (le != items) {
1076 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1077
1078 Status = add_metadata_reloc_extent_item(Vcb, mr);
1079 if (!NT_SUCCESS(Status)) {
1080 ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1081 goto end;
1082 }
1083
1084 le = le->Flink;
1085 }
1086
1087 Status = STATUS_SUCCESS;
1088
1089 end:
1090 while (!IsListEmpty(&tree_writes)) {
1091 tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1092 ExFreePool(tw);
1093 }
1094
1095 return Status;
1096 }
1097
1098 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, BOOL* changed) {
1099 KEY searchkey;
1100 traverse_ptr tp;
1101 NTSTATUS Status;
1102 BOOL b;
1103 LIST_ENTRY items, rollback;
1104 UINT32 loaded = 0;
1105
1106 TRACE("chunk %llx\n", c->offset);
1107
1108 InitializeListHead(&rollback);
1109 InitializeListHead(&items);
1110
1111 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
1112
1113 searchkey.obj_id = c->offset;
1114 searchkey.obj_type = TYPE_METADATA_ITEM;
1115 searchkey.offset = 0xffffffffffffffff;
1116
1117 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
1118 if (!NT_SUCCESS(Status)) {
1119 ERR("find_item returned %08x\n", Status);
1120 goto end;
1121 }
1122
1123 do {
1124 traverse_ptr next_tp;
1125
1126 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1127 break;
1128
1129 if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1130 BOOL tree = FALSE, skinny = FALSE;
1131
1132 if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1133 tree = TRUE;
1134 skinny = TRUE;
1135 } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1136 tp.item->size >= sizeof(EXTENT_ITEM)) {
1137 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1138
1139 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1140 tree = TRUE;
1141 }
1142
1143 if (tree) {
1144 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1145
1146 if (!NT_SUCCESS(Status)) {
1147 ERR("add_metadata_reloc returned %08x\n", Status);
1148 goto end;
1149 }
1150
1151 loaded++;
1152
1153 if (loaded >= 64) // only do 64 at a time
1154 break;
1155 }
1156 }
1157
1158 b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
1159
1160 if (b)
1161 tp = next_tp;
1162 } while (b);
1163
1164 if (IsListEmpty(&items)) {
1165 *changed = FALSE;
1166 Status = STATUS_SUCCESS;
1167 goto end;
1168 } else
1169 *changed = TRUE;
1170
1171 Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1172 if (!NT_SUCCESS(Status)) {
1173 ERR("write_metadata_items returned %08x\n", Status);
1174 goto end;
1175 }
1176
1177 Status = STATUS_SUCCESS;
1178
1179 Vcb->need_write = TRUE;
1180
1181 end:
1182 if (NT_SUCCESS(Status)) {
1183 Status = do_write(Vcb, NULL);
1184 if (!NT_SUCCESS(Status))
1185 ERR("do_write returned %08x\n", Status);
1186 }
1187
1188 if (NT_SUCCESS(Status))
1189 clear_rollback(&rollback);
1190 else
1191 do_rollback(Vcb, &rollback);
1192
1193 free_trees(Vcb);
1194
1195 ExReleaseResourceLite(&Vcb->tree_lock);
1196
1197 while (!IsListEmpty(&items)) {
1198 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1199
1200 while (!IsListEmpty(&mr->refs)) {
1201 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1202
1203 ExFreePool(ref);
1204 }
1205
1206 ExFreePool(mr);
1207 }
1208
1209 return Status;
1210 }
1211
1212 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1213 data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1214 NTSTATUS Status;
1215 LIST_ENTRY* le;
1216 KEY searchkey;
1217 traverse_ptr tp;
1218 root* r = NULL;
1219 metadata_reloc* mr;
1220 UINT64 last_tree = 0;
1221 data_reloc_ref* ref;
1222
1223 le = Vcb->roots.Flink;
1224 while (le != &Vcb->roots) {
1225 root* r2 = CONTAINING_RECORD(le, root, list_entry);
1226
1227 if (r2->id == edr->root) {
1228 r = r2;
1229 break;
1230 }
1231
1232 le = le->Flink;
1233 }
1234
1235 if (!r) {
1236 ERR("could not find subvol %llx\n", edr->count);
1237 return STATUS_INTERNAL_ERROR;
1238 }
1239
1240 searchkey.obj_id = edr->objid;
1241 searchkey.obj_type = TYPE_EXTENT_DATA;
1242 searchkey.offset = 0;
1243
1244 Status = find_item(Vcb, r, &tp, &searchkey, FALSE, NULL);
1245 if (!NT_SUCCESS(Status)) {
1246 ERR("find_item returned %08x\n", Status);
1247 return Status;
1248 }
1249
1250 if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1251 traverse_ptr tp2;
1252
1253 if (find_next_item(Vcb, &tp, &tp2, FALSE, NULL))
1254 tp = tp2;
1255 else {
1256 ERR("could not find EXTENT_DATA for inode %llx in root %llx\n", searchkey.obj_id, r->id);
1257 return STATUS_INTERNAL_ERROR;
1258 }
1259 }
1260
1261 ref = NULL;
1262
1263 while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1264 traverse_ptr tp2;
1265
1266 if (tp.item->size >= sizeof(EXTENT_DATA)) {
1267 EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1268
1269 if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1270 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1271
1272 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1273 if (ref && last_tree == tp.tree->header.address)
1274 ref->edr.count++;
1275 else {
1276 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1277 if (!ref) {
1278 ERR("out of memory\n");
1279 return STATUS_INSUFFICIENT_RESOURCES;
1280 }
1281
1282 ref->type = TYPE_EXTENT_DATA_REF;
1283 RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1284 ref->edr.count = 1;
1285
1286 Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1287 if (!NT_SUCCESS(Status)) {
1288 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1289 ExFreePool(ref);
1290 return Status;
1291 }
1292
1293 last_tree = tp.tree->header.address;
1294 ref->parent = mr;
1295
1296 InsertTailList(&dr->refs, &ref->list_entry);
1297 }
1298 }
1299 }
1300 }
1301
1302 if (find_next_item(Vcb, &tp, &tp2, FALSE, NULL))
1303 tp = tp2;
1304 else
1305 break;
1306 }
1307
1308 return STATUS_SUCCESS;
1309 }
1310
1311 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1312 traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1313 NTSTATUS Status;
1314 data_reloc* dr;
1315 EXTENT_ITEM* ei;
1316 UINT16 len;
1317 UINT64 inline_rc;
1318 UINT8* ptr;
1319
1320 dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1321 if (!dr) {
1322 ERR("out of memory\n");
1323 return STATUS_INSUFFICIENT_RESOURCES;
1324 }
1325
1326 dr->address = tp->item->key.obj_id;
1327 dr->size = tp->item->key.offset;
1328 dr->ei = (EXTENT_ITEM*)tp->item->data;
1329 InitializeListHead(&dr->refs);
1330
1331 Status = delete_tree_item(Vcb, tp);
1332 if (!NT_SUCCESS(Status)) {
1333 ERR("delete_tree_item returned %08x\n", Status);
1334 return Status;
1335 }
1336
1337 if (!c)
1338 c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1339
1340 if (c) {
1341 ExAcquireResourceExclusiveLite(&c->lock, TRUE);
1342
1343 c->used -= tp->item->key.offset;
1344
1345 space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1346
1347 ExReleaseResourceLite(&c->lock);
1348 }
1349
1350 ei = (EXTENT_ITEM*)tp->item->data;
1351 inline_rc = 0;
1352
1353 len = tp->item->size - sizeof(EXTENT_ITEM);
1354 ptr = (UINT8*)tp->item->data + sizeof(EXTENT_ITEM);
1355
1356 while (len > 0) {
1357 UINT8 secttype = *ptr;
1358 UINT16 sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1359
1360 len--;
1361
1362 if (sectlen > len) {
1363 ERR("(%llx,%x,%llx): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1364 return STATUS_INTERNAL_ERROR;
1365 }
1366
1367 if (sectlen == 0) {
1368 ERR("(%llx,%x,%llx): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1369 return STATUS_INTERNAL_ERROR;
1370 }
1371
1372 if (secttype == TYPE_EXTENT_DATA_REF) {
1373 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(UINT8));
1374
1375 inline_rc += edr->count;
1376
1377 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1378 if (!NT_SUCCESS(Status)) {
1379 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1380 return Status;
1381 }
1382 } else if (secttype == TYPE_SHARED_DATA_REF) {
1383 metadata_reloc* mr;
1384 data_reloc_ref* ref;
1385
1386 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1387 if (!ref) {
1388 ERR("out of memory\n");
1389 return STATUS_INSUFFICIENT_RESOURCES;
1390 }
1391
1392 ref->type = TYPE_SHARED_DATA_REF;
1393 RtlCopyMemory(&ref->sdr, ptr + sizeof(UINT8), sizeof(SHARED_DATA_REF));
1394 inline_rc += ref->sdr.count;
1395
1396 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1397 if (!NT_SUCCESS(Status)) {
1398 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1399 ExFreePool(ref);
1400 return Status;
1401 }
1402
1403 ref->parent = mr;
1404
1405 InsertTailList(&dr->refs, &ref->list_entry);
1406 } else {
1407 ERR("unexpected tree type %x\n", secttype);
1408 return STATUS_INTERNAL_ERROR;
1409 }
1410
1411
1412 len -= sectlen;
1413 ptr += sizeof(UINT8) + sectlen;
1414 }
1415
1416 if (inline_rc < ei->refcount) { // look for non-inline entries
1417 traverse_ptr tp2 = *tp, next_tp;
1418
1419 while (find_next_item(Vcb, &tp2, &next_tp, FALSE, NULL)) {
1420 tp2 = next_tp;
1421
1422 if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1423 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1424 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1425 if (!NT_SUCCESS(Status)) {
1426 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1427 return Status;
1428 }
1429
1430 Status = delete_tree_item(Vcb, &tp2);
1431 if (!NT_SUCCESS(Status)) {
1432 ERR("delete_tree_item returned %08x\n", Status);
1433 return Status;
1434 }
1435 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(UINT32)) {
1436 metadata_reloc* mr;
1437 data_reloc_ref* ref;
1438
1439 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1440 if (!ref) {
1441 ERR("out of memory\n");
1442 return STATUS_INSUFFICIENT_RESOURCES;
1443 }
1444
1445 ref->type = TYPE_SHARED_DATA_REF;
1446 ref->sdr.offset = tp2.item->key.offset;
1447 ref->sdr.count = *((UINT32*)tp2.item->data);
1448
1449 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1450 if (!NT_SUCCESS(Status)) {
1451 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1452 ExFreePool(ref);
1453 return Status;
1454 }
1455
1456 ref->parent = mr;
1457 InsertTailList(&dr->refs, &ref->list_entry);
1458
1459 Status = delete_tree_item(Vcb, &tp2);
1460 if (!NT_SUCCESS(Status)) {
1461 ERR("delete_tree_item returned %08x\n", Status);
1462 return Status;
1463 }
1464 }
1465 } else
1466 break;
1467 }
1468 }
1469
1470 InsertTailList(items, &dr->list_entry);
1471
1472 return STATUS_SUCCESS;
1473 }
1474
1475 static void sort_data_reloc_refs(data_reloc* dr) {
1476 LIST_ENTRY newlist, *le;
1477
1478 if (IsListEmpty(&dr->refs))
1479 return;
1480
1481 // insertion sort
1482
1483 InitializeListHead(&newlist);
1484
1485 while (!IsListEmpty(&dr->refs)) {
1486 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1487 BOOL inserted = FALSE;
1488
1489 if (ref->type == TYPE_EXTENT_DATA_REF)
1490 ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1491 else if (ref->type == TYPE_SHARED_DATA_REF)
1492 ref->hash = ref->parent->new_address;
1493
1494 le = newlist.Flink;
1495 while (le != &newlist) {
1496 data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1497
1498 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1499 InsertHeadList(le->Blink, &ref->list_entry);
1500 inserted = TRUE;
1501 break;
1502 }
1503
1504 le = le->Flink;
1505 }
1506
1507 if (!inserted)
1508 InsertTailList(&newlist, &ref->list_entry);
1509 }
1510
1511 le = newlist.Flink;
1512 while (le != &newlist) {
1513 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1514
1515 if (le->Flink != &newlist) {
1516 data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1517
1518 if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1519 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1520 RemoveEntryList(&ref2->list_entry);
1521 ref->edr.count += ref2->edr.count;
1522 ExFreePool(ref2);
1523 continue;
1524 }
1525 }
1526
1527 le = le->Flink;
1528 }
1529
1530 newlist.Flink->Blink = &dr->refs;
1531 newlist.Blink->Flink = &dr->refs;
1532 dr->refs.Flink = newlist.Flink;
1533 dr->refs.Blink = newlist.Blink;
1534 }
1535
1536 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1537 NTSTATUS Status;
1538 LIST_ENTRY* le;
1539 UINT64 rc = 0;
1540 UINT16 inline_len;
1541 BOOL all_inline = TRUE;
1542 data_reloc_ref* first_noninline = NULL;
1543 EXTENT_ITEM* ei;
1544 UINT8* ptr;
1545
1546 inline_len = sizeof(EXTENT_ITEM);
1547
1548 sort_data_reloc_refs(dr);
1549
1550 le = dr->refs.Flink;
1551 while (le != &dr->refs) {
1552 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1553 UINT16 extlen = 0;
1554
1555 if (ref->type == TYPE_EXTENT_DATA_REF) {
1556 extlen += sizeof(EXTENT_DATA_REF);
1557 rc += ref->edr.count;
1558 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1559 extlen += sizeof(SHARED_DATA_REF);
1560 rc++;
1561 }
1562
1563 if (all_inline) {
1564 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1565 all_inline = FALSE;
1566 first_noninline = ref;
1567 } else
1568 inline_len += extlen + 1;
1569 }
1570
1571 le = le->Flink;
1572 }
1573
1574 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1575 if (!ei) {
1576 ERR("out of memory\n");
1577 return STATUS_INSUFFICIENT_RESOURCES;
1578 }
1579
1580 ei->refcount = rc;
1581 ei->generation = dr->ei->generation;
1582 ei->flags = dr->ei->flags;
1583 ptr = (UINT8*)&ei[1];
1584
1585 le = dr->refs.Flink;
1586 while (le != &dr->refs) {
1587 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1588
1589 if (ref == first_noninline)
1590 break;
1591
1592 *ptr = ref->type;
1593 ptr++;
1594
1595 if (ref->type == TYPE_EXTENT_DATA_REF) {
1596 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1597
1598 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1599
1600 ptr += sizeof(EXTENT_DATA_REF);
1601 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1602 SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1603
1604 sdr->offset = ref->parent->new_address;
1605 sdr->count = ref->sdr.count;
1606
1607 ptr += sizeof(SHARED_DATA_REF);
1608 }
1609
1610 le = le->Flink;
1611 }
1612
1613 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1614 if (!NT_SUCCESS(Status)) {
1615 ERR("insert_tree_item returned %08x\n", Status);
1616 return Status;
1617 }
1618
1619 if (!all_inline) {
1620 le = &first_noninline->list_entry;
1621
1622 while (le != &dr->refs) {
1623 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1624
1625 if (ref->type == TYPE_EXTENT_DATA_REF) {
1626 EXTENT_DATA_REF* edr;
1627
1628 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1629 if (!edr) {
1630 ERR("out of memory\n");
1631 return STATUS_INSUFFICIENT_RESOURCES;
1632 }
1633
1634 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1635
1636 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1637 if (!NT_SUCCESS(Status)) {
1638 ERR("insert_tree_item returned %08x\n", Status);
1639 return Status;
1640 }
1641 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1642 UINT32* sdr;
1643
1644 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(UINT32), ALLOC_TAG);
1645 if (!sdr) {
1646 ERR("out of memory\n");
1647 return STATUS_INSUFFICIENT_RESOURCES;
1648 }
1649
1650 *sdr = ref->sdr.count;
1651
1652 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(UINT32), NULL, NULL);
1653 if (!NT_SUCCESS(Status)) {
1654 ERR("insert_tree_item returned %08x\n", Status);
1655 return Status;
1656 }
1657 }
1658
1659 le = le->Flink;
1660 }
1661 }
1662
1663 return STATUS_SUCCESS;
1664 }
1665
1666 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, BOOL* changed) {
1667 KEY searchkey;
1668 traverse_ptr tp;
1669 NTSTATUS Status;
1670 BOOL b;
1671 LIST_ENTRY items, metadata_items, rollback, *le;
1672 UINT64 loaded = 0, num_loaded = 0;
1673 chunk* newchunk = NULL;
1674 UINT8* data = NULL;
1675
1676 TRACE("chunk %llx\n", c->offset);
1677
1678 InitializeListHead(&rollback);
1679 InitializeListHead(&items);
1680 InitializeListHead(&metadata_items);
1681
1682 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
1683
1684 searchkey.obj_id = c->offset;
1685 searchkey.obj_type = TYPE_EXTENT_ITEM;
1686 searchkey.offset = 0xffffffffffffffff;
1687
1688 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
1689 if (!NT_SUCCESS(Status)) {
1690 ERR("find_item returned %08x\n", Status);
1691 goto end;
1692 }
1693
1694 do {
1695 traverse_ptr next_tp;
1696
1697 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1698 break;
1699
1700 if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1701 BOOL tree = FALSE;
1702
1703 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1704 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1705
1706 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1707 tree = TRUE;
1708 }
1709
1710 if (!tree) {
1711 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1712
1713 if (!NT_SUCCESS(Status)) {
1714 ERR("add_data_reloc returned %08x\n", Status);
1715 goto end;
1716 }
1717
1718 loaded += tp.item->key.offset;
1719 num_loaded++;
1720
1721 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1722 break;
1723 }
1724 }
1725
1726 b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
1727
1728 if (b)
1729 tp = next_tp;
1730 } while (b);
1731
1732 if (IsListEmpty(&items)) {
1733 *changed = FALSE;
1734 Status = STATUS_SUCCESS;
1735 goto end;
1736 } else
1737 *changed = TRUE;
1738
1739 data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1740 if (!data) {
1741 ERR("out of memory\n");
1742 Status = STATUS_INSUFFICIENT_RESOURCES;
1743 goto end;
1744 }
1745
1746 le = items.Flink;
1747 while (le != &items) {
1748 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1749 BOOL done = FALSE;
1750 LIST_ENTRY* le2;
1751 UINT32* csum;
1752 RTL_BITMAP bmp;
1753 ULONG* bmparr;
1754 ULONG runlength, index, lastoff;
1755
1756 if (newchunk) {
1757 ExAcquireResourceExclusiveLite(&newchunk->lock, TRUE);
1758
1759 if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1760 newchunk->used += dr->size;
1761 space_list_subtract(newchunk, FALSE, dr->new_address, dr->size, &rollback);
1762 done = TRUE;
1763 }
1764
1765 ExReleaseResourceLite(&newchunk->lock);
1766 }
1767
1768 if (!done) {
1769 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
1770
1771 le2 = Vcb->chunks.Flink;
1772 while (le2 != &Vcb->chunks) {
1773 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1774
1775 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1776 ExAcquireResourceExclusiveLite(&c2->lock, TRUE);
1777
1778 if ((c2->chunk_item->size - c2->used) >= dr->size) {
1779 if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1780 c2->used += dr->size;
1781 space_list_subtract(c2, FALSE, dr->new_address, dr->size, &rollback);
1782 ExReleaseResourceLite(&c2->lock);
1783 newchunk = c2;
1784 done = TRUE;
1785 break;
1786 }
1787 }
1788
1789 ExReleaseResourceLite(&c2->lock);
1790 }
1791
1792 le2 = le2->Flink;
1793 }
1794
1795 // allocate new chunk if necessary
1796 if (!done) {
1797 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, FALSE);
1798
1799 if (!NT_SUCCESS(Status)) {
1800 ERR("alloc_chunk returned %08x\n", Status);
1801 ExReleaseResourceLite(&Vcb->chunk_lock);
1802 goto end;
1803 }
1804
1805 ExAcquireResourceExclusiveLite(&newchunk->lock, TRUE);
1806
1807 newchunk->balance_num = Vcb->balance.balance_num;
1808
1809 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1810 ExReleaseResourceLite(&newchunk->lock);
1811 ExReleaseResourceLite(&Vcb->chunk_lock);
1812 ERR("could not find address in new chunk\n");
1813 Status = STATUS_DISK_FULL;
1814 goto end;
1815 } else {
1816 newchunk->used += dr->size;
1817 space_list_subtract(newchunk, FALSE, dr->new_address, dr->size, &rollback);
1818 }
1819
1820 ExReleaseResourceLite(&newchunk->lock);
1821 }
1822
1823 ExReleaseResourceLite(&Vcb->chunk_lock);
1824 }
1825
1826 dr->newchunk = newchunk;
1827
1828 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align((dr->size / Vcb->superblock.sector_size) + 1, sizeof(ULONG)), ALLOC_TAG);
1829 if (!bmparr) {
1830 ERR("out of memory\n");
1831 Status = STATUS_INSUFFICIENT_RESOURCES;
1832 goto end;
1833 }
1834
1835 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(UINT32) / Vcb->superblock.sector_size), ALLOC_TAG);
1836 if (!csum) {
1837 ERR("out of memory\n");
1838 ExFreePool(bmparr);
1839 Status = STATUS_INSUFFICIENT_RESOURCES;
1840 goto end;
1841 }
1842
1843 RtlInitializeBitMap(&bmp, bmparr, (ULONG)(dr->size / Vcb->superblock.sector_size));
1844 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1845
1846 searchkey.obj_id = EXTENT_CSUM_ID;
1847 searchkey.obj_type = TYPE_EXTENT_CSUM;
1848 searchkey.offset = dr->address;
1849
1850 Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, FALSE, NULL);
1851 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1852 ERR("find_item returned %08x\n", Status);
1853 ExFreePool(csum);
1854 ExFreePool(bmparr);
1855 goto end;
1856 }
1857
1858 if (Status != STATUS_NOT_FOUND) {
1859 do {
1860 traverse_ptr next_tp;
1861
1862 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1863 if (tp.item->key.offset >= dr->address + dr->size)
1864 break;
1865 else if (tp.item->size >= sizeof(UINT32) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(UINT32)) >= dr->address) {
1866 UINT64 cs = max(dr->address, tp.item->key.offset);
1867 UINT64 ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(UINT32)));
1868
1869 RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1870 tp.item->data + ((cs - tp.item->key.offset) * sizeof(UINT32) / Vcb->superblock.sector_size),
1871 (ULONG)((ce - cs) * sizeof(UINT32) / Vcb->superblock.sector_size));
1872
1873 RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1874
1875 if (ce == dr->address + dr->size)
1876 break;
1877 }
1878 }
1879
1880 if (find_next_item(Vcb, &tp, &next_tp, FALSE, NULL))
1881 tp = next_tp;
1882 else
1883 break;
1884 } while (TRUE);
1885 }
1886
1887 lastoff = 0;
1888 runlength = RtlFindFirstRunClear(&bmp, &index);
1889
1890 while (runlength != 0) {
1891 if (index > lastoff) {
1892 ULONG off = lastoff;
1893 ULONG size = index - lastoff;
1894
1895 // handle no csum run
1896 do {
1897 ULONG rl;
1898
1899 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1900 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1901 else
1902 rl = size;
1903
1904 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, FALSE, data,
1905 c, NULL, NULL, 0, FALSE, NormalPagePriority);
1906 if (!NT_SUCCESS(Status)) {
1907 ERR("read_data returned %08x\n", Status);
1908 ExFreePool(csum);
1909 ExFreePool(bmparr);
1910 goto end;
1911 }
1912
1913 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1914 NULL, newchunk, FALSE, 0, NormalPagePriority);
1915 if (!NT_SUCCESS(Status)) {
1916 ERR("write_data_complete returned %08x\n", Status);
1917 ExFreePool(csum);
1918 ExFreePool(bmparr);
1919 goto end;
1920 }
1921
1922 size -= rl;
1923 off += rl;
1924 } while (size > 0);
1925 }
1926
1927 add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1928 add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1929
1930 // handle csum run
1931 do {
1932 ULONG rl;
1933
1934 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1935 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1936 else
1937 rl = runlength;
1938
1939 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], FALSE, data,
1940 c, NULL, NULL, 0, FALSE, NormalPagePriority);
1941 if (!NT_SUCCESS(Status)) {
1942 ERR("read_data returned %08x\n", Status);
1943 ExFreePool(csum);
1944 ExFreePool(bmparr);
1945 goto end;
1946 }
1947
1948 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1949 NULL, newchunk, FALSE, 0, NormalPagePriority);
1950 if (!NT_SUCCESS(Status)) {
1951 ERR("write_data_complete returned %08x\n", Status);
1952 ExFreePool(csum);
1953 ExFreePool(bmparr);
1954 goto end;
1955 }
1956
1957 runlength -= rl;
1958 index += rl;
1959 } while (runlength > 0);
1960
1961 lastoff = index;
1962 runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1963 }
1964
1965 ExFreePool(csum);
1966 ExFreePool(bmparr);
1967
1968 // handle final nocsum run
1969 if (lastoff < dr->size / Vcb->superblock.sector_size) {
1970 ULONG off = lastoff;
1971 ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1972
1973 do {
1974 ULONG rl;
1975
1976 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1977 rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1978 else
1979 rl = size;
1980
1981 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, FALSE, data,
1982 c, NULL, NULL, 0, FALSE, NormalPagePriority);
1983 if (!NT_SUCCESS(Status)) {
1984 ERR("read_data returned %08x\n", Status);
1985 ExFreePool(csum);
1986 ExFreePool(bmparr);
1987 goto end;
1988 }
1989
1990 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1991 NULL, newchunk, FALSE, 0, NormalPagePriority);
1992 if (!NT_SUCCESS(Status)) {
1993 ERR("write_data_complete returned %08x\n", Status);
1994 ExFreePool(csum);
1995 ExFreePool(bmparr);
1996 goto end;
1997 }
1998
1999 size -= rl;
2000 off += rl;
2001 } while (size > 0);
2002 }
2003
2004 le = le->Flink;
2005 }
2006
2007 ExFreePool(data);
2008 data = NULL;
2009
2010 Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2011 if (!NT_SUCCESS(Status)) {
2012 ERR("write_metadata_items returned %08x\n", Status);
2013 goto end;
2014 }
2015
2016 le = items.Flink;
2017 while (le != &items) {
2018 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2019
2020 Status = add_data_reloc_extent_item(Vcb, dr);
2021 if (!NT_SUCCESS(Status)) {
2022 ERR("add_data_reloc_extent_item returned %08x\n", Status);
2023 goto end;
2024 }
2025
2026 le = le->Flink;
2027 }
2028
2029 le = c->changed_extents.Flink;
2030 while (le != &c->changed_extents) {
2031 LIST_ENTRY *le2, *le3;
2032 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2033
2034 le3 = le->Flink;
2035
2036 le2 = items.Flink;
2037 while (le2 != &items) {
2038 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2039
2040 if (ce->address == dr->address) {
2041 ce->address = dr->new_address;
2042 RemoveEntryList(&ce->list_entry);
2043 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2044 break;
2045 }
2046
2047 le2 = le2->Flink;
2048 }
2049
2050 le = le3;
2051 }
2052
2053 Status = STATUS_SUCCESS;
2054
2055 Vcb->need_write = TRUE;
2056
2057 end:
2058 if (NT_SUCCESS(Status)) {
2059 // update extents in cache inodes before we flush
2060 le = Vcb->chunks.Flink;
2061 while (le != &Vcb->chunks) {
2062 chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2063
2064 if (c2->cache) {
2065 LIST_ENTRY* le2;
2066
2067 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, TRUE);
2068
2069 le2 = c2->cache->extents.Flink;
2070 while (le2 != &c2->cache->extents) {
2071 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2072
2073 if (!ext->ignore) {
2074 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2075 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2076
2077 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2078 LIST_ENTRY* le3 = items.Flink;
2079 while (le3 != &items) {
2080 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2081
2082 if (ed2->address == dr->address) {
2083 ed2->address = dr->new_address;
2084 break;
2085 }
2086
2087 le3 = le3->Flink;
2088 }
2089 }
2090 }
2091 }
2092
2093 le2 = le2->Flink;
2094 }
2095
2096 ExReleaseResourceLite(c2->cache->Header.Resource);
2097 }
2098
2099 le = le->Flink;
2100 }
2101
2102 Status = do_write(Vcb, NULL);
2103 if (!NT_SUCCESS(Status))
2104 ERR("do_write returned %08x\n", Status);
2105 }
2106
2107 if (NT_SUCCESS(Status)) {
2108 clear_rollback(&rollback);
2109
2110 // update open FCBs
2111 // FIXME - speed this up(?)
2112
2113 ExAcquireResourceSharedLite(&Vcb->fcb_lock, TRUE);
2114
2115 le = Vcb->all_fcbs.Flink;
2116 while (le != &Vcb->all_fcbs) {
2117 struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2118 LIST_ENTRY* le2;
2119
2120 ExAcquireResourceExclusiveLite(fcb->Header.Resource, TRUE);
2121
2122 le2 = fcb->extents.Flink;
2123 while (le2 != &fcb->extents) {
2124 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2125
2126 if (!ext->ignore) {
2127 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2128 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2129
2130 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2131 LIST_ENTRY* le3 = items.Flink;
2132 while (le3 != &items) {
2133 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2134
2135 if (ed2->address == dr->address) {
2136 ed2->address = dr->new_address;
2137 break;
2138 }
2139
2140 le3 = le3->Flink;
2141 }
2142 }
2143 }
2144 }
2145
2146 le2 = le2->Flink;
2147 }
2148
2149 ExReleaseResourceLite(fcb->Header.Resource);
2150
2151 le = le->Flink;
2152 }
2153
2154 ExReleaseResourceLite(&Vcb->fcb_lock);
2155 } else
2156 do_rollback(Vcb, &rollback);
2157
2158 free_trees(Vcb);
2159
2160 ExReleaseResourceLite(&Vcb->tree_lock);
2161
2162 if (data)
2163 ExFreePool(data);
2164
2165 while (!IsListEmpty(&items)) {
2166 data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2167
2168 while (!IsListEmpty(&dr->refs)) {
2169 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2170
2171 ExFreePool(ref);
2172 }
2173
2174 ExFreePool(dr);
2175 }
2176
2177 while (!IsListEmpty(&metadata_items)) {
2178 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2179
2180 while (!IsListEmpty(&mr->refs)) {
2181 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2182
2183 ExFreePool(ref);
2184 }
2185
2186 ExFreePool(mr);
2187 }
2188
2189 return Status;
2190 }
2191
2192 static __inline UINT64 get_chunk_dup_type(chunk* c) {
2193 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2194 return BLOCK_FLAG_RAID0;
2195 else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2196 return BLOCK_FLAG_RAID1;
2197 else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2198 return BLOCK_FLAG_DUPLICATE;
2199 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2200 return BLOCK_FLAG_RAID10;
2201 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2202 return BLOCK_FLAG_RAID5;
2203 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2204 return BLOCK_FLAG_RAID6;
2205 else
2206 return BLOCK_FLAG_SINGLE;
2207 }
2208
2209 static BOOL should_balance_chunk(device_extension* Vcb, UINT8 sort, chunk* c) {
2210 btrfs_balance_opts* opts;
2211
2212 opts = &Vcb->balance.opts[sort];
2213
2214 if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2215 return FALSE;
2216
2217 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2218 UINT64 type = get_chunk_dup_type(c);
2219
2220 if (!(type & opts->profiles))
2221 return FALSE;
2222 }
2223
2224 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2225 UINT16 i;
2226 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2227 BOOL b = FALSE;
2228
2229 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2230 if (cis[i].dev_id == opts->devid) {
2231 b = TRUE;
2232 break;
2233 }
2234 }
2235
2236 if (!b)
2237 return FALSE;
2238 }
2239
2240 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2241 UINT16 i, factor;
2242 UINT64 physsize;
2243 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2244 BOOL b = FALSE;
2245
2246 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2247 factor = c->chunk_item->num_stripes;
2248 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2249 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2250 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2251 factor = c->chunk_item->num_stripes - 1;
2252 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2253 factor = c->chunk_item->num_stripes - 2;
2254 else // SINGLE, DUPLICATE, RAID1
2255 factor = 1;
2256
2257 physsize = c->chunk_item->size / factor;
2258
2259 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2260 if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2261 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2262 b = TRUE;
2263 break;
2264 }
2265 }
2266
2267 if (!b)
2268 return FALSE;
2269 }
2270
2271 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2272 if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2273 return FALSE;
2274 }
2275
2276 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2277 if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2278 return FALSE;
2279 }
2280
2281 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2282 UINT64 usage = c->used * 100 / c->chunk_item->size;
2283
2284 // usage == 0 should mean completely empty, not just that usage rounds to 0%
2285 if (c->used > 0 && usage == 0)
2286 usage = 1;
2287
2288 if (usage < opts->usage_start || usage > opts->usage_end)
2289 return FALSE;
2290 }
2291
2292 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2293 UINT64 type = get_chunk_dup_type(c);
2294
2295 if (type == opts->convert)
2296 return FALSE;
2297 }
2298
2299 return TRUE;
2300 }
2301
2302 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2303 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2304 args->profiles = opts->profiles;
2305 args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2306 }
2307
2308 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2309 if (args->usage_start == 0) {
2310 args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2311 args->usage_start = opts->usage_start;
2312 args->usage_end = opts->usage_end;
2313 } else {
2314 args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2315 args->usage = opts->usage_end;
2316 }
2317 }
2318
2319 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2320 args->devid = opts->devid;
2321 args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2322 }
2323
2324 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2325 args->drange_start = opts->drange_start;
2326 args->drange_end = opts->drange_end;
2327 args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2328 }
2329
2330 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2331 args->vrange_start = opts->vrange_start;
2332 args->vrange_end = opts->vrange_end;
2333 args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2334 }
2335
2336 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2337 args->convert = opts->convert;
2338 args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2339
2340 if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2341 args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2342 }
2343
2344 if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2345 if (args->limit_start == 0) {
2346 args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2347 args->limit_start = (UINT32)opts->limit_start;
2348 args->limit_end = (UINT32)opts->limit_end;
2349 } else {
2350 args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2351 args->limit = opts->limit_end;
2352 }
2353 }
2354
2355 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2356 args->stripes_start = opts->stripes_start;
2357 args->stripes_end = opts->stripes_end;
2358 args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2359 }
2360 }
2361
2362 static NTSTATUS add_balance_item(device_extension* Vcb) {
2363 KEY searchkey;
2364 traverse_ptr tp;
2365 NTSTATUS Status;
2366 BALANCE_ITEM* bi;
2367
2368 searchkey.obj_id = BALANCE_ITEM_ID;
2369 searchkey.obj_type = TYPE_TEMP_ITEM;
2370 searchkey.offset = 0;
2371
2372 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
2373
2374 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
2375 if (!NT_SUCCESS(Status)) {
2376 ERR("find_item returned %08x\n", Status);
2377 goto end;
2378 }
2379
2380 if (!keycmp(tp.item->key, searchkey)) {
2381 Status = delete_tree_item(Vcb, &tp);
2382 if (!NT_SUCCESS(Status)) {
2383 ERR("delete_tree_item returned %08x\n", Status);
2384 goto end;
2385 }
2386 }
2387
2388 bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2389 if (!bi) {
2390 ERR("out of memory\n");
2391 Status = STATUS_INSUFFICIENT_RESOURCES;
2392 goto end;
2393 }
2394
2395 RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2396
2397 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2398 bi->flags |= BALANCE_FLAGS_DATA;
2399 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2400 }
2401
2402 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2403 bi->flags |= BALANCE_FLAGS_METADATA;
2404 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2405 }
2406
2407 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2408 bi->flags |= BALANCE_FLAGS_SYSTEM;
2409 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2410 }
2411
2412 Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2413 if (!NT_SUCCESS(Status)) {
2414 ERR("insert_tree_item returned %08x\n", Status);
2415 ExFreePool(bi);
2416 goto end;
2417 }
2418
2419 Status = STATUS_SUCCESS;
2420
2421 end:
2422 if (NT_SUCCESS(Status)) {
2423 Status = do_write(Vcb, NULL);
2424 if (!NT_SUCCESS(Status))
2425 ERR("do_write returned %08x\n", Status);
2426 }
2427
2428 free_trees(Vcb);
2429
2430 ExReleaseResourceLite(&Vcb->tree_lock);
2431
2432 return Status;
2433 }
2434
2435 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2436 KEY searchkey;
2437 traverse_ptr tp;
2438 NTSTATUS Status;
2439
2440 searchkey.obj_id = BALANCE_ITEM_ID;
2441 searchkey.obj_type = TYPE_TEMP_ITEM;
2442 searchkey.offset = 0;
2443
2444 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
2445
2446 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
2447 if (!NT_SUCCESS(Status)) {
2448 ERR("find_item returned %08x\n", Status);
2449 goto end;
2450 }
2451
2452 if (!keycmp(tp.item->key, searchkey)) {
2453 Status = delete_tree_item(Vcb, &tp);
2454 if (!NT_SUCCESS(Status)) {
2455 ERR("delete_tree_item returned %08x\n", Status);
2456 goto end;
2457 }
2458
2459 Status = do_write(Vcb, NULL);
2460 if (!NT_SUCCESS(Status)) {
2461 ERR("do_write returned %08x\n", Status);
2462 goto end;
2463 }
2464
2465 free_trees(Vcb);
2466 }
2467
2468 Status = STATUS_SUCCESS;
2469
2470 end:
2471 ExReleaseResourceLite(&Vcb->tree_lock);
2472
2473 return Status;
2474 }
2475
2476 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2477 opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2478
2479 if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2480 opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2481 opts->profiles = args->profiles;
2482 }
2483
2484 if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2485 opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2486
2487 opts->usage_start = 0;
2488 opts->usage_end = (UINT8)args->usage;
2489 } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2490 opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2491
2492 opts->usage_start = (UINT8)args->usage_start;
2493 opts->usage_end = (UINT8)args->usage_end;
2494 }
2495
2496 if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2497 opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2498 opts->devid = args->devid;
2499 }
2500
2501 if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2502 opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2503 opts->drange_start = args->drange_start;
2504 opts->drange_end = args->drange_end;
2505 }
2506
2507 if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2508 opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2509 opts->vrange_start = args->vrange_start;
2510 opts->vrange_end = args->vrange_end;
2511 }
2512
2513 if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2514 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2515
2516 opts->limit_start = 0;
2517 opts->limit_end = args->limit;
2518 } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2519 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2520
2521 opts->limit_start = args->limit_start;
2522 opts->limit_end = args->limit_end;
2523 }
2524
2525 if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2526 opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2527
2528 opts->stripes_start = (UINT16)args->stripes_start;
2529 opts->stripes_end = (UINT16)args->stripes_end;
2530 }
2531
2532 if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2533 opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2534 opts->convert = args->convert;
2535
2536 if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2537 opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2538 }
2539 }
2540
2541 static NTSTATUS remove_superblocks(device* dev) {
2542 NTSTATUS Status;
2543 superblock* sb;
2544 int i = 0;
2545
2546 sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2547 if (!sb) {
2548 ERR("out of memory\n");
2549 return STATUS_INSUFFICIENT_RESOURCES;
2550 }
2551
2552 RtlZeroMemory(sb, sizeof(superblock));
2553
2554 while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2555 Status = write_data_phys(dev->devobj, superblock_addrs[i], sb, sizeof(superblock));
2556
2557 if (!NT_SUCCESS(Status)) {
2558 ExFreePool(sb);
2559 return Status;
2560 }
2561
2562 i++;
2563 }
2564
2565 ExFreePool(sb);
2566
2567 return STATUS_SUCCESS;
2568 }
2569
2570 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2571 KEY searchkey;
2572 traverse_ptr tp;
2573 NTSTATUS Status;
2574 LIST_ENTRY* le;
2575 volume_device_extension* vde;
2576
2577 if (Vcb->need_write) {
2578 Status = do_write(Vcb, NULL);
2579
2580 if (!NT_SUCCESS(Status))
2581 ERR("do_write returned %08x\n", Status);
2582 } else
2583 Status = STATUS_SUCCESS;
2584
2585 free_trees(Vcb);
2586
2587 if (!NT_SUCCESS(Status))
2588 return Status;
2589
2590 // remove entry in chunk tree
2591
2592 searchkey.obj_id = 1;
2593 searchkey.obj_type = TYPE_DEV_ITEM;
2594 searchkey.offset = dev->devitem.dev_id;
2595
2596 Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, FALSE, NULL);
2597 if (!NT_SUCCESS(Status)) {
2598 ERR("find_item returned %08x\n", Status);
2599 return Status;
2600 }
2601
2602 if (!keycmp(searchkey, tp.item->key)) {
2603 Status = delete_tree_item(Vcb, &tp);
2604
2605 if (!NT_SUCCESS(Status)) {
2606 ERR("delete_tree_item returned %08x\n", Status);
2607 return Status;
2608 }
2609 }
2610
2611 // remove stats entry in device tree
2612
2613 searchkey.obj_id = 0;
2614 searchkey.obj_type = TYPE_DEV_STATS;
2615 searchkey.offset = dev->devitem.dev_id;
2616
2617 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, FALSE, NULL);
2618 if (!NT_SUCCESS(Status)) {
2619 ERR("find_item returned %08x\n", Status);
2620 return Status;
2621 }
2622
2623 if (!keycmp(searchkey, tp.item->key)) {
2624 Status = delete_tree_item(Vcb, &tp);
2625
2626 if (!NT_SUCCESS(Status)) {
2627 ERR("delete_tree_item returned %08x\n", Status);
2628 return Status;
2629 }
2630 }
2631
2632 // update superblock
2633
2634 Vcb->superblock.num_devices--;
2635 Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2636 Vcb->devices_loaded--;
2637
2638 RemoveEntryList(&dev->list_entry);
2639
2640 // flush
2641
2642 Status = do_write(Vcb, NULL);
2643 if (!NT_SUCCESS(Status))
2644 ERR("do_write returned %08x\n", Status);
2645
2646 free_trees(Vcb);
2647
2648 if (!NT_SUCCESS(Status))
2649 return Status;
2650
2651 if (!dev->readonly && dev->devobj) {
2652 Status = remove_superblocks(dev);
2653 if (!NT_SUCCESS(Status))
2654 WARN("remove_superblocks returned %08x\n", Status);
2655 }
2656
2657 // remove entry in volume list
2658
2659 vde = Vcb->vde;
2660
2661 if (dev->devobj) {
2662 pdo_device_extension* pdode = vde->pdode;
2663
2664 ExAcquireResourceExclusiveLite(&pdode->child_lock, TRUE);
2665
2666 le = pdode->children.Flink;
2667 while (le != &pdode->children) {
2668 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2669
2670 if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2671 PFILE_OBJECT FileObject;
2672 PDEVICE_OBJECT mountmgr;
2673 UNICODE_STRING mmdevpath;
2674
2675 pdode->children_loaded--;
2676
2677 if (vc->had_drive_letter) { // re-add entry to mountmgr
2678 RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2679 Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2680 if (!NT_SUCCESS(Status))
2681 ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2682 else {
2683 MOUNTDEV_NAME mdn;
2684
2685 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), TRUE, NULL);
2686 if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2687 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2688 else {
2689 MOUNTDEV_NAME* mdn2;
2690 ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2691
2692 mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2693 if (!mdn2)
2694 ERR("out of memory\n");
2695 else {
2696 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, TRUE, NULL);
2697 if (!NT_SUCCESS(Status))
2698 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2699 else {
2700 UNICODE_STRING name;
2701
2702 name.Buffer = mdn2->Name;
2703 name.Length = name.MaximumLength = mdn2->NameLength;
2704
2705 Status = mountmgr_add_drive_letter(mountmgr, &name);
2706 if (!NT_SUCCESS(Status))
2707 WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2708 }
2709
2710 ExFreePool(mdn2);
2711 }
2712 }
2713
2714 ObDereferenceObject(FileObject);
2715 }
2716 }
2717
2718 ExFreePool(vc->pnp_name.Buffer);
2719 RemoveEntryList(&vc->list_entry);
2720 ExFreePool(vc);
2721
2722 ObDereferenceObject(vc->fileobj);
2723
2724 break;
2725 }
2726
2727 le = le->Flink;
2728 }
2729
2730 if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2731 vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2732
2733 le = pdode->children.Flink;
2734 while (le != &pdode->children) {
2735 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2736
2737 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2738 vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2739 break;
2740 }
2741
2742 le = le->Flink;
2743 }
2744 }
2745
2746 pdode->num_children = Vcb->superblock.num_devices;
2747
2748 ExReleaseResourceLite(&pdode->child_lock);
2749
2750 // free dev
2751
2752 if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2753 trim_whole_device(dev);
2754 }
2755
2756 while (!IsListEmpty(&dev->space)) {
2757 LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2758 space* s = CONTAINING_RECORD(le2, space, list_entry);
2759
2760 ExFreePool(s);
2761 }
2762
2763 ExFreePool(dev);
2764
2765 if (Vcb->trim) {
2766 Vcb->trim = FALSE;
2767
2768 le = Vcb->devices.Flink;
2769 while (le != &Vcb->devices) {
2770 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2771
2772 if (dev2->trim) {
2773 Vcb->trim = TRUE;
2774 break;
2775 }
2776
2777 le = le->Flink;
2778 }
2779 }
2780
2781 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2782
2783 return STATUS_SUCCESS;
2784 }
2785
2786 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2787 DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2788 DEVICE_DATA_SET_RANGE* ranges;
2789 ULONG datalen, i;
2790 KEY searchkey;
2791 traverse_ptr tp;
2792 NTSTATUS Status;
2793 BOOL b;
2794 UINT64 lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2795 LIST_ENTRY* le;
2796
2797 dev->num_trim_entries = 0;
2798
2799 searchkey.obj_id = dev->devitem.dev_id;
2800 searchkey.obj_type = TYPE_DEV_EXTENT;
2801 searchkey.offset = 0;
2802
2803 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, FALSE, NULL);
2804 if (!NT_SUCCESS(Status)) {
2805 ERR("find_item returned %08x\n", Status);
2806 return;
2807 }
2808
2809 do {
2810 traverse_ptr next_tp;
2811
2812 if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2813 if (tp.item->size >= sizeof(DEV_EXTENT)) {
2814 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2815
2816 if (tp.item->key.offset > lastoff)
2817 add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2818
2819 lastoff = tp.item->key.offset + de->length;
2820 } else {
2821 ERR("(%llx,%x,%llx) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2822 return;
2823 }
2824 }
2825
2826 b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
2827
2828 if (b) {
2829 tp = next_tp;
2830 if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2831 break;
2832 }
2833 } while (b);
2834
2835 if (lastoff < dev->devitem.num_bytes)
2836 add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2837
2838 if (dev->num_trim_entries == 0)
2839 return;
2840
2841 datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(UINT64)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2842
2843 dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2844 if (!dmdsa) {
2845 ERR("out of memory\n");
2846 goto end;
2847 }
2848
2849 dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2850 dmdsa->Action = DeviceDsmAction_Trim;
2851 dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2852 dmdsa->ParameterBlockOffset = 0;
2853 dmdsa->ParameterBlockLength = 0;
2854 dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(UINT64));
2855 dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2856
2857 ranges = (DEVICE_DATA_SET_RANGE*)((UINT8*)dmdsa + dmdsa->DataSetRangesOffset);
2858
2859 i = 0;
2860 le = dev->trim_list.Flink;
2861 while (le != &dev->trim_list) {
2862 space* s = CONTAINING_RECORD(le, space, list_entry);
2863
2864 ranges[i].StartingOffset = s->address;
2865 ranges[i].LengthInBytes = s->size;
2866 i++;
2867
2868 le = le->Flink;
2869 }
2870
2871 Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, TRUE, NULL);
2872 if (!NT_SUCCESS(Status))
2873 WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2874
2875 ExFreePool(dmdsa);
2876
2877 end:
2878 while (!IsListEmpty(&dev->trim_list)) {
2879 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2880 ExFreePool(s);
2881 }
2882
2883 dev->num_trim_entries = 0;
2884 }
2885
2886 static NTSTATUS try_consolidation(device_extension* Vcb, UINT64 flags, chunk** newchunk) {
2887 NTSTATUS Status;
2888 BOOL changed;
2889 LIST_ENTRY* le;
2890 chunk* rc;
2891
2892 // FIXME - allow with metadata chunks?
2893
2894 while (TRUE) {
2895 rc = NULL;
2896
2897 ExAcquireResourceSharedLite(&Vcb->tree_lock, TRUE);
2898
2899 ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
2900
2901 // choose the least-used chunk we haven't looked at yet
2902 le = Vcb->chunks.Flink;
2903 while (le != &Vcb->chunks) {
2904 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2905
2906 // FIXME - skip full-size chunks over e.g. 90% full?
2907 if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2908 rc = c;
2909
2910 le = le->Flink;
2911 }
2912
2913 ExReleaseResourceLite(&Vcb->chunk_lock);
2914
2915 if (!rc) {
2916 ExReleaseResourceLite(&Vcb->tree_lock);
2917 break;
2918 }
2919
2920 if (rc->list_entry_balance.Flink) {
2921 RemoveEntryList(&rc->list_entry_balance);
2922 Vcb->balance.chunks_left--;
2923 }
2924
2925 rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2926 rc->reloc = TRUE;
2927
2928 ExReleaseResourceLite(&Vcb->tree_lock);
2929
2930 do {
2931 changed = FALSE;