[BTRFS]
[reactos.git] / reactos / drivers / filesystems / btrfs / read.c
index 694eea5..f2739cd 100644 (file)
@@ -52,10 +52,15 @@ typedef struct {
     UINT16 firstoff, startoffstripe, sectors_per_stripe, stripes_cancel;
     UINT32* csum;
     BOOL tree;
+    BOOL check_nocsum_parity;
     read_data_stripe* stripes;
     KSPIN_LOCK spin_lock;
 } read_data_context;
 
+extern BOOL diskacc;
+extern tPsUpdateDiskCounters PsUpdateDiskCounters;
+extern tCcCopyReadEx CcCopyReadEx;
+
 static NTSTATUS STDCALL read_data_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
     read_data_stripe* stripe = conptr;
     read_data_context* context = (read_data_context*)stripe->context;
@@ -103,7 +108,7 @@ static NTSTATUS STDCALL read_data_completion(PDEVICE_OBJECT DeviceObject, PIRP I
         } else if (context->type == BLOCK_FLAG_RAID5) {
             stripe->status = ReadDataStatus_Success;
             
-            if (stripes_left > 0 && stripes_left == context->stripes_cancel && (context->csum || context->tree)) {
+            if (stripes_left > 0 && stripes_left == context->stripes_cancel && (context->csum || context->tree || !context->check_nocsum_parity)) {
                 for (i = 0; i < context->num_stripes; i++) {
                     if (context->stripes[i].status == ReadDataStatus_Pending) {
                         context->stripes[i].status = ReadDataStatus_Cancelling;
@@ -115,7 +120,7 @@ static NTSTATUS STDCALL read_data_completion(PDEVICE_OBJECT DeviceObject, PIRP I
         } else if (context->type == BLOCK_FLAG_RAID6) {
             stripe->status = ReadDataStatus_Success;
 
-            if (stripes_left > 0 && stripes_left == context->stripes_cancel && (context->csum || context->tree)) {
+            if (stripes_left > 0 && stripes_left == context->stripes_cancel && (context->csum || context->tree || !context->check_nocsum_parity)) {
                 for (i = 0; i < context->num_stripes; i++) {
                     if (context->stripes[i].status == ReadDataStatus_Pending) {
                         context->stripes[i].status = ReadDataStatus_Cancelling;
     return STATUS_SUCCESS;
 }
 
-NTSTATUS STDCALL read_data(device_extension* Vcb, UINT64 addr, UINT32 length, UINT32* csum, BOOL is_tree, UINT8* buf, chunk* c, chunk** pc, PIRP Irp) {
-    CHUNK_ITEM* ci;
-    CHUNK_ITEM_STRIPE* cis;
-    read_data_context* context;
-    UINT64 i, type, offset;
+static NTSTATUS check_csum(device_extension* Vcb, UINT8* data, UINT32 sectors, UINT32* csum) {
     NTSTATUS Status;
-    device** devices;
-    UINT64 *stripestart = NULL, *stripeend = NULL;
-    UINT32 firststripesize;
-    UINT16 startoffstripe, allowed_missing, missing_devices = 0;
-#ifdef DEBUG_STATS
-    LARGE_INTEGER time1, time2;
-#endif
+    calc_job* cj;
+    UINT32* csum2;
+    
+    // From experimenting, it seems that 40 sectors is roughly the crossover
+    // point where offloading the crc32 calculation becomes worth it.
+    
+    if (sectors < 40) {
+        ULONG j;
+        
+        for (j = 0; j < sectors; j++) {
+            UINT32 crc32 = ~calc_crc32c(0xffffffff, data + (j * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+            
+            if (crc32 != csum[j]) {
+                return STATUS_CRC_ERROR;
+            }
+        }
+        
+        return STATUS_SUCCESS;
+    }
+    
+    csum2 = ExAllocatePoolWithTag(PagedPool, sizeof(UINT32) * sectors, ALLOC_TAG);
+    if (!csum2) {
+        ERR("out of memory\n");
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
     
-    Status = verify_vcb(Vcb, Irp);
+    Status = add_calc_job(Vcb, data, sectors, csum2, &cj);
     if (!NT_SUCCESS(Status)) {
-        ERR("verify_vcb returned %08x\n", Status);
+        ERR("add_calc_job returned %08x\n", Status);
         return Status;
     }
     
-    if (Vcb->log_to_phys_loaded) {
-        if (!c) {
-            c = get_chunk_from_address(Vcb, addr);
-            
-            if (!c) {
-                ERR("get_chunk_from_address failed\n");
-                return STATUS_INTERNAL_ERROR;
+    KeWaitForSingleObject(&cj->event, Executive, KernelMode, FALSE, NULL);
+    
+    if (RtlCompareMemory(csum2, csum, sectors * sizeof(UINT32)) != sectors * sizeof(UINT32)) {
+        free_calc_job(cj);
+        ExFreePool(csum2);
+        return STATUS_CRC_ERROR;
+    }
+    
+    free_calc_job(cj);
+    ExFreePool(csum2);
+    
+    return STATUS_SUCCESS;
+}
+
+static NTSTATUS read_data_dup(device_extension* Vcb, UINT8* buf, UINT64 addr, UINT32 length, PIRP Irp, read_data_context* context,
+                              CHUNK_ITEM* ci, device** devices, UINT64 *stripestart, UINT64 *stripeend) {
+    UINT64 i;
+    BOOL checksum_error = FALSE;
+    UINT16 cancelled = 0;
+    NTSTATUS Status;
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (context->stripes[i].status == ReadDataStatus_Success) {
+            if (context->tree) {
+                tree_header* th = (tree_header*)context->stripes[i].buf;
+                UINT32 crc32;
+                
+                crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, context->buflen - sizeof(th->csum));
+                
+                if (th->address != context->address || crc32 != *((UINT32*)th->csum)) {
+                    context->stripes[i].status = ReadDataStatus_CRCError;
+                    checksum_error = TRUE;
+                }
+            } else if (context->csum) {
+#ifdef DEBUG_STATS
+                LARGE_INTEGER time1, time2;
+                
+                time1 = KeQueryPerformanceCounter(NULL);
+#endif
+                Status = check_csum(Vcb, context->stripes[i].buf, context->stripes[i].Irp->IoStatus.Information / context->sector_size, context->csum);
+                
+                if (Status == STATUS_CRC_ERROR) {
+                    context->stripes[i].status = ReadDataStatus_CRCError;
+                    checksum_error = TRUE;
+                    break;
+                } else if (!NT_SUCCESS(Status)) {
+                    ERR("check_csum returned %08x\n", Status);
+                    return Status;
+                }
+#ifdef DEBUG_STATS
+                time2 = KeQueryPerformanceCounter(NULL);
+                
+                Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
+#endif
             }
+        } else if (context->stripes[i].status == ReadDataStatus_Cancelled) {
+            cancelled++;
         }
+    }
+    
+    if (checksum_error) {
+        CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
         
-        ci = c->chunk_item;
-        offset = c->offset;
-        devices = c->devices;
-           
-        if (pc)
-            *pc = c;
-    } else {
-        LIST_ENTRY* le = Vcb->sys_chunks.Flink;
-        
-        ci = NULL;
+        // FIXME - update dev stats
         
-        while (le != &Vcb->sys_chunks) {
-            sys_chunk* sc = CONTAINING_RECORD(le, sys_chunk, list_entry);
+        if (cancelled > 0) {
+#ifdef DEBUG_STATS
+            LARGE_INTEGER time1, time2;
+#endif
+            context->stripes_left = 0;
             
-            if (sc->key.obj_id == 0x100 && sc->key.obj_type == TYPE_CHUNK_ITEM && sc->key.offset <= addr) {
-                CHUNK_ITEM* chunk_item = sc->data;
-                
-                if ((addr - sc->key.offset) < chunk_item->size && chunk_item->num_stripes > 0) {
-                    ci = chunk_item;
-                    offset = sc->key.offset;
-                    cis = (CHUNK_ITEM_STRIPE*)&chunk_item[1];
+            for (i = 0; i < ci->num_stripes; i++) {
+                if (context->stripes[i].status == ReadDataStatus_Cancelled) {
+                    PIO_STACK_LOCATION IrpSp;
                     
-                    devices = ExAllocatePoolWithTag(PagedPool, sizeof(device*) * ci->num_stripes, ALLOC_TAG);
-                    if (!devices) {
-                        ERR("out of memory\n");
-                        return STATUS_INSUFFICIENT_RESOURCES;
+                    // re-run Irp that we cancelled
+                    
+                    if (context->stripes[i].Irp) {
+                        if (devices[i]->devobj->Flags & DO_DIRECT_IO) {
+                            MmUnlockPages(context->stripes[i].Irp->MdlAddress);
+                            IoFreeMdl(context->stripes[i].Irp->MdlAddress);
+                        }
+                        IoFreeIrp(context->stripes[i].Irp);
                     }
                     
-                    for (i = 0; i < ci->num_stripes; i++) {
-                        devices[i] = find_device_from_uuid(Vcb, &cis[i].dev_uuid);
+                    if (!Irp) {
+                        context->stripes[i].Irp = IoAllocateIrp(devices[i]->devobj->StackSize, FALSE);
+                        
+                        if (!context->stripes[i].Irp) {
+                            ERR("IoAllocateIrp failed\n");
+                            return STATUS_INSUFFICIENT_RESOURCES;
+                        }
+                    } else {
+                        context->stripes[i].Irp = IoMakeAssociatedIrp(Irp, devices[i]->devobj->StackSize);
+                        
+                        if (!context->stripes[i].Irp) {
+                            ERR("IoMakeAssociatedIrp failed\n");
+                            return STATUS_INSUFFICIENT_RESOURCES;
+                        }
                     }
                     
-                    break;
+                    IrpSp = IoGetNextIrpStackLocation(context->stripes[i].Irp);
+                    IrpSp->MajorFunction = IRP_MJ_READ;
+                    
+                    if (devices[i]->devobj->Flags & DO_BUFFERED_IO) {
+                        FIXME("FIXME - buffered IO\n");
+                    } else if (devices[i]->devobj->Flags & DO_DIRECT_IO) {
+                        context->stripes[i].Irp->MdlAddress = IoAllocateMdl(context->stripes[i].buf, stripeend[i] - stripestart[i], FALSE, FALSE, NULL);
+                        if (!context->stripes[i].Irp->MdlAddress) {
+                            ERR("IoAllocateMdl failed\n");
+                            return STATUS_INSUFFICIENT_RESOURCES;
+                        }
+                        
+                        MmProbeAndLockPages(context->stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
+                    } else {
+                        context->stripes[i].Irp->UserBuffer = context->stripes[i].buf;
+                    }
+
+                    IrpSp->Parameters.Read.Length = stripeend[i] - stripestart[i];
+                    IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[i] + cis[i].offset;
+                    
+                    context->stripes[i].Irp->UserIosb = &context->stripes[i].iosb;
+                    
+                    IoSetCompletionRoutine(context->stripes[i].Irp, read_data_completion, &context->stripes[i], TRUE, TRUE, TRUE);
+                    
+                    context->stripes_left++;
+                    context->stripes[i].status = ReadDataStatus_Pending;
                 }
             }
             
-            le = le->Flink;
+            context->stripes_cancel = 0;
+            KeClearEvent(&context->Event);
+            
+#ifdef DEBUG_STATS
+            if (!context->tree)
+                time1 = KeQueryPerformanceCounter(NULL);
+#endif
+
+            for (i = 0; i < ci->num_stripes; i++) {
+                if (context->stripes[i].status == ReadDataStatus_Pending) {
+                    IoCallDriver(devices[i]->devobj, context->stripes[i].Irp);
+                }
+            }
+            
+            KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
+            
+#ifdef DEBUG_STATS
+            if (!context->tree) {
+                time2 = KeQueryPerformanceCounter(NULL);
+                
+                Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
+            }
+#endif
+            for (i = 0; i < ci->num_stripes; i++) {
+                if (context->stripes[i].status == ReadDataStatus_Success) {
+                    if (context->tree) {
+                        tree_header* th = (tree_header*)context->stripes[i].buf;
+                        UINT32 crc32;
+                        
+                        crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, context->buflen - sizeof(th->csum));
+                        
+                        if (th->address != context->address || crc32 != *((UINT32*)th->csum))
+                            context->stripes[i].status = ReadDataStatus_CRCError;
+                    } else if (context->csum) {
+                        NTSTATUS Status;
+#ifdef DEBUG_STATS
+                        time1 = KeQueryPerformanceCounter(NULL);
+#endif
+                        Status = check_csum(Vcb, context->stripes[i].buf, context->stripes[i].Irp->IoStatus.Information / Vcb->superblock.sector_size, context->csum);
+                        
+                        if (Status == STATUS_CRC_ERROR)
+                            context->stripes[i].status = ReadDataStatus_CRCError;
+                        else if (!NT_SUCCESS(Status)) {
+                            ERR("check_csum returned %08x\n", Status);
+                            return Status;
+                        }
+#ifdef DEBUG_STATS
+                        time2 = KeQueryPerformanceCounter(NULL);
+                        
+                        Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
+#endif
+                    }
+                }
+            }
         }
         
-        if (!ci) {
-            ERR("could not find chunk for %llx in bootstrap\n", addr);
-            return STATUS_INTERNAL_ERROR;
+        for (i = 0; i < ci->num_stripes; i++) {
+            if (context->stripes[i].status == ReadDataStatus_Success) {
+                RtlCopyMemory(buf, context->stripes[i].buf, length);
+                goto raid1write;
+            }
         }
         
-        if (pc)
-            *pc = NULL;
+        if (context->tree || ci->num_stripes == 1) { // unable to recover from checksum error
+            ERR("unrecoverable checksum error at %llx\n", addr);
+            
+#ifdef _DEBUG
+            if (context->tree) {
+                for (i = 0; i < ci->num_stripes; i++) {
+                    if (context->stripes[i].status == ReadDataStatus_CRCError) {
+                        tree_header* th = (tree_header*)context->stripes[i].buf;
+                        UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, context->buflen - sizeof(th->csum));
+                        
+                        if (crc32 != *((UINT32*)th->csum)) {
+                            WARN("crc32 was %08x, expected %08x\n", crc32, *((UINT32*)th->csum));
+                            return STATUS_CRC_ERROR;
+                        } else if (addr != th->address) {
+                            WARN("address of tree was %llx, not %llx as expected\n", th->address, addr);
+                            return STATUS_CRC_ERROR;
+                        }
+                    }
+                }
+            }
+#endif
+            return STATUS_CRC_ERROR;
+        }
+        
+        // checksum errors on both stripes - we need to check sector by sector
+        
+        for (i = 0; i < (stripeend[0] - stripestart[0]) / context->sector_size; i++) {
+            UINT16 j;
+            BOOL success = FALSE;
+#ifdef DEBUG_STATS
+            LARGE_INTEGER time1, time2;
+            
+            time1 = KeQueryPerformanceCounter(NULL);
+#endif
+            
+            for (j = 0; j < ci->num_stripes; j++) {
+                if (context->stripes[j].status == ReadDataStatus_CRCError) {
+                    UINT32 crc32 = ~calc_crc32c(0xffffffff, context->stripes[j].buf + (i * context->sector_size), context->sector_size);
+                    
+                    if (crc32 == context->csum[i]) {
+                        RtlCopyMemory(buf + (i * context->sector_size), context->stripes[j].buf + (i * context->sector_size), context->sector_size);
+                        success = TRUE;
+                        break;
+                    }
+                }
+            }
+            
+#ifdef DEBUG_STATS
+            time2 = KeQueryPerformanceCounter(NULL);
+
+            Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
+#endif
+            if (!success) {
+                ERR("unrecoverable checksum error at %llx\n", addr + (i * context->sector_size));
+                return STATUS_CRC_ERROR;
+            }
+        }
+        
+raid1write:
+        // write good data over bad
+        
+        if (!Vcb->readonly) {
+            for (i = 0; i < ci->num_stripes; i++) {
+                if (context->stripes[i].status == ReadDataStatus_CRCError && devices[i] && !devices[i]->readonly) {
+                    Status = write_data_phys(devices[i]->devobj, cis[i].offset + stripestart[i], buf, length);
+                    
+                    if (!NT_SUCCESS(Status))
+                        WARN("write_data_phys returned %08x\n", Status);
+                }
+            }
+        }
+        
+        return STATUS_SUCCESS;
     }
     
-    if (ci->type & BLOCK_FLAG_DUPLICATE) {
-        type = BLOCK_FLAG_DUPLICATE;
-        allowed_missing = 0;
-    } else if (ci->type & BLOCK_FLAG_RAID0) {
-        type = BLOCK_FLAG_RAID0;
-        allowed_missing = 0;
-    } else if (ci->type & BLOCK_FLAG_RAID1) {
-        type = BLOCK_FLAG_DUPLICATE;
-        allowed_missing = 1;
-    } else if (ci->type & BLOCK_FLAG_RAID10) {
-        type = BLOCK_FLAG_RAID10;
-        allowed_missing = 1;
-    } else if (ci->type & BLOCK_FLAG_RAID5) {
-        type = BLOCK_FLAG_RAID5;
-        allowed_missing = 1;
-    } else if (ci->type & BLOCK_FLAG_RAID6) {
-        type = BLOCK_FLAG_RAID6;
-        allowed_missing = 2;
-    } else { // SINGLE
-        type = BLOCK_FLAG_DUPLICATE;
-        allowed_missing = 0;
+    // check if any of the stripes succeeded
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (context->stripes[i].status == ReadDataStatus_Success) {
+            RtlCopyMemory(buf, context->stripes[i].buf, length);
+            return STATUS_SUCCESS;
+        }
     }
-
-    cis = (CHUNK_ITEM_STRIPE*)&ci[1];
-
-    context = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_context), ALLOC_TAG);
-    if (!context) {
-        ERR("out of memory\n");
-        return STATUS_INSUFFICIENT_RESOURCES;
+    
+    // failing that, return the first error we encountered
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (context->stripes[i].status == ReadDataStatus_Error)
+            return context->stripes[i].iosb.Status;
     }
     
-    RtlZeroMemory(context, sizeof(read_data_context));
-    KeInitializeEvent(&context->Event, NotificationEvent, FALSE);
+    // if we somehow get here, return STATUS_INTERNAL_ERROR
     
-    context->stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe) * ci->num_stripes, ALLOC_TAG);
-    if (!context->stripes) {
+    return STATUS_INTERNAL_ERROR;
+}
+
+static NTSTATUS read_data_raid0(device_extension* Vcb, UINT8* buf, UINT64 addr, UINT32 length, read_data_context* context,
+                                CHUNK_ITEM* ci, UINT64* stripestart, UINT64* stripeend, UINT16 startoffstripe) {
+    UINT64 i;
+    UINT32 pos, *stripeoff;
+    UINT8 stripe;
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (context->stripes[i].status == ReadDataStatus_Error) {
+            WARN("stripe %llu returned error %08x\n", i, context->stripes[i].iosb.Status); 
+            return context->stripes[i].iosb.Status;
+        }
+    }
+    
+    pos = 0;
+    stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT32) * ci->num_stripes, ALLOC_TAG);
+    if (!stripeoff) {
         ERR("out of memory\n");
-        ExFreePool(context);
         return STATUS_INSUFFICIENT_RESOURCES;
     }
     
-    RtlZeroMemory(context->stripes, sizeof(read_data_stripe) * ci->num_stripes);
+    RtlZeroMemory(stripeoff, sizeof(UINT32) * ci->num_stripes);
     
-    context->buflen = length;
-    context->num_stripes = ci->num_stripes;
-    context->stripes_left = context->num_stripes;
-    context->sector_size = Vcb->superblock.sector_size;
-    context->csum = csum;
-    context->tree = is_tree;
-    context->type = type;
+    stripe = startoffstripe;
+    while (pos < length) {
+        if (pos == 0) {
+            UINT32 readlen = min(stripeend[stripe] - stripestart[stripe], ci->stripe_length - (stripestart[stripe] % ci->stripe_length));
+            
+            RtlCopyMemory(buf, context->stripes[stripe].buf, readlen);
+            stripeoff[stripe] += readlen;
+            pos += readlen;
+        } else if (length - pos < ci->stripe_length) {
+            RtlCopyMemory(buf + pos, &context->stripes[stripe].buf[stripeoff[stripe]], length - pos);
+            pos = length;
+        } else {
+            RtlCopyMemory(buf + pos, &context->stripes[stripe].buf[stripeoff[stripe]], ci->stripe_length);
+            stripeoff[stripe] += ci->stripe_length;
+            pos += ci->stripe_length;
+        }
+        
+        stripe = (stripe + 1) % ci->num_stripes;
+    }
     
-    stripestart = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT64) * ci->num_stripes, ALLOC_TAG);
-    if (!stripestart) {
-        ERR("out of memory\n");
-        ExFreePool(context);
-        return STATUS_INSUFFICIENT_RESOURCES;
+    ExFreePool(stripeoff);
+    
+    // FIXME - handle the case where one of the stripes doesn't read everything, i.e. Irp->IoStatus.Information is short
+    
+    if (context->tree) { // shouldn't happen, as trees shouldn't cross stripe boundaries
+        tree_header* th = (tree_header*)buf;
+        UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
+        
+        if (crc32 != *((UINT32*)th->csum)) {
+            WARN("crc32 was %08x, expected %08x\n", crc32, *((UINT32*)th->csum));
+            return STATUS_CRC_ERROR;
+        } else if (addr != th->address) {
+            WARN("address of tree was %llx, not %llx as expected\n", th->address, addr);
+            return STATUS_CRC_ERROR;
+        }
+    } else if (context->csum) {
+        NTSTATUS Status;
+#ifdef DEBUG_STATS
+        LARGE_INTEGER time1, time2;
+        
+        time1 = KeQueryPerformanceCounter(NULL);
+#endif
+        Status = check_csum(Vcb, buf, length / Vcb->superblock.sector_size, context->csum);
+        
+        if (Status == STATUS_CRC_ERROR) {
+            WARN("checksum error\n");
+            return Status;
+        } else if (!NT_SUCCESS(Status)) {
+            ERR("check_csum returned %08x\n", Status);
+            return Status;
+        }
+#ifdef DEBUG_STATS
+        time2 = KeQueryPerformanceCounter(NULL);
+        
+        Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
+#endif
     }
     
-    stripeend = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT64) * ci->num_stripes, ALLOC_TAG);
-    if (!stripeend) {
+    return STATUS_SUCCESS;    
+}
+
+static NTSTATUS read_data_raid10(device_extension* Vcb, UINT8* buf, UINT64 addr, UINT32 length, PIRP Irp, read_data_context* context,
+                                 CHUNK_ITEM* ci, device** devices, UINT64* stripestart, UINT64* stripeend, UINT16 startoffstripe) {
+    UINT64 i;
+    NTSTATUS Status;
+    BOOL checksum_error = FALSE;
+    UINT32 pos, *stripeoff;
+    UINT8 stripe;
+    read_data_stripe** stripes;
+
+    stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
+    if (!stripes) {
         ERR("out of memory\n");
-        ExFreePool(stripestart);
-        ExFreePool(context);
         return STATUS_INSUFFICIENT_RESOURCES;
     }
     
-    if (type == BLOCK_FLAG_RAID0) {
-        UINT64 startoff, endoff;
-        UINT16 endoffstripe;
+    RtlZeroMemory(stripes, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes);
+    
+    for (i = 0; i < ci->num_stripes; i += ci->sub_stripes) {
+        UINT16 j;
         
-        get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes, &startoff, &startoffstripe);
-        get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes, &endoff, &endoffstripe);
+        for (j = 0; j < ci->sub_stripes; j++) {
+            if (context->stripes[i+j].status == ReadDataStatus_Success) {
+                stripes[i / ci->sub_stripes] = &context->stripes[i+j];
+                break;
+            }
+        }
         
-        for (i = 0; i < ci->num_stripes; i++) {
-            if (startoffstripe > i) {
-                stripestart[i] = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
-            } else if (startoffstripe == i) {
-                stripestart[i] = startoff;
-            } else {
-                stripestart[i] = startoff - (startoff % ci->stripe_length);
+        if (!stripes[i / ci->sub_stripes]) {
+            for (j = 0; j < ci->sub_stripes; j++) {
+                if (context->stripes[i+j].status == ReadDataStatus_Error) {
+                    // both stripes must have errored if we get here
+                    WARN("stripe %llu returned error %08x\n", i+j, context->stripes[i+j].iosb.Status);
+                    ExFreePool(stripes);
+                    return context->stripes[i].iosb.Status;
+                }
             }
+        }
+    }
+    
+    pos = 0;
+    stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT32) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
+    if (!stripeoff) {
+        ERR("out of memory\n");
+        ExFreePool(stripes);
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+    
+    RtlZeroMemory(stripeoff, sizeof(UINT32) * ci->num_stripes / ci->sub_stripes);
+    
+    stripe = startoffstripe / ci->sub_stripes;
+    while (pos < length) {
+        if (pos == 0) {
+            UINT32 readlen = min(stripeend[stripe * ci->sub_stripes] - stripestart[stripe * ci->sub_stripes], ci->stripe_length - (stripestart[stripe * ci->sub_stripes] % ci->stripe_length));
             
-            if (endoffstripe > i) {
-                stripeend[i] = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
-            } else if (endoffstripe == i) {
-                stripeend[i] = endoff + 1;
-            } else {
-                stripeend[i] = endoff - (endoff % ci->stripe_length);
-            }
+            RtlCopyMemory(buf, stripes[stripe]->buf, readlen);
+            stripeoff[stripe] += readlen;
+            pos += readlen;
+        } else if (length - pos < ci->stripe_length) {
+            RtlCopyMemory(buf + pos, &stripes[stripe]->buf[stripeoff[stripe]], length - pos);
+            
+            pos = length;
+        } else {
+            RtlCopyMemory(buf + pos, &stripes[stripe]->buf[stripeoff[stripe]], ci->stripe_length);
+            stripeoff[stripe] += ci->stripe_length;
+
+            pos += ci->stripe_length;
         }
-    } else if (type == BLOCK_FLAG_RAID10) {
-        UINT64 startoff, endoff;
-        UINT16 endoffstripe, j;
         
-        get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &startoff, &startoffstripe);
-        get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &endoff, &endoffstripe);
+        stripe = (stripe + 1) % (ci->num_stripes / ci->sub_stripes);
+    }
+    
+    if (context->tree) {
+        tree_header* th = (tree_header*)buf;
+        UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
         
-        if ((ci->num_stripes % ci->sub_stripes) != 0) {
-            ERR("chunk %llx: num_stripes %x was not a multiple of sub_stripes %x!\n", offset, ci->num_stripes, ci->sub_stripes);
-            Status = STATUS_INTERNAL_ERROR;
-            goto exit;
+        if (crc32 != *((UINT32*)th->csum)) {
+            WARN("crc32 was %08x, expected %08x\n", crc32, *((UINT32*)th->csum));
+            checksum_error = TRUE;
+            stripes[startoffstripe]->status = ReadDataStatus_CRCError;
+        } else if (addr != th->address) {
+            WARN("address of tree was %llx, not %llx as expected\n", th->address, addr);
+            checksum_error = TRUE;
+            stripes[startoffstripe]->status = ReadDataStatus_CRCError;
         }
+    } else if (context->csum) {
+        NTSTATUS Status;
+#ifdef DEBUG_STATS
+        LARGE_INTEGER time1, time2;
         
-        context->firstoff = (startoff % ci->stripe_length) / Vcb->superblock.sector_size;
-        context->startoffstripe = startoffstripe;
-        context->sectors_per_stripe = ci->stripe_length / Vcb->superblock.sector_size;
-        
-        startoffstripe *= ci->sub_stripes;
-        endoffstripe *= ci->sub_stripes;
+        time1 = KeQueryPerformanceCounter(NULL);
+#endif
+        Status = check_csum(Vcb, buf, length / Vcb->superblock.sector_size, context->csum);
         
-        for (i = 0; i < ci->num_stripes; i += ci->sub_stripes) {
-            if (startoffstripe > i) {
-                stripestart[i] = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
-            } else if (startoffstripe == i) {
-                stripestart[i] = startoff;
-            } else {
-                stripestart[i] = startoff - (startoff % ci->stripe_length);
-            }
-            
-            if (endoffstripe > i) {
-                stripeend[i] = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
-            } else if (endoffstripe == i) {
-                stripeend[i] = endoff + 1;
-            } else {
-                stripeend[i] = endoff - (endoff % ci->stripe_length);
-            }
-            
-            for (j = 1; j < ci->sub_stripes; j++) {
-                stripestart[i+j] = stripestart[i];
-                stripeend[i+j] = stripeend[i];
-            }
+        if (Status == STATUS_CRC_ERROR)
+            checksum_error = TRUE;
+        else if (!NT_SUCCESS(Status)) {
+            ERR("check_csum returned %08x\n", Status);
+            return Status;
         }
+#ifdef DEBUG_STATS
+        time2 = KeQueryPerformanceCounter(NULL);
         
-        context->stripes_cancel = 1;
-    } else if (type == BLOCK_FLAG_DUPLICATE) {
-        for (i = 0; i < ci->num_stripes; i++) {
-            stripestart[i] = addr - offset;
-            stripeend[i] = stripestart[i] + length;
-        }
+        Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
+#endif
+    }
+    
+    if (checksum_error) {
+        CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
+#ifdef DEBUG_STATS
+        LARGE_INTEGER time1, time2;
+#endif
         
-        context->stripes_cancel = ci->num_stripes - 1;
-    } else if (type == BLOCK_FLAG_RAID5) {
-        UINT64 startoff, endoff;
-        UINT16 endoffstripe;
-        UINT64 start = 0xffffffffffffffff, end = 0;
+        // FIXME - update dev stats
         
-        get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 1, &startoff, &startoffstripe);
-        get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 1, &endoff, &endoffstripe);
+        WARN("checksum error\n");
         
-        for (i = 0; i < ci->num_stripes - 1; i++) {
-            UINT64 ststart, stend;
-            
-            if (startoffstripe > i) {
-                ststart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
-            } else if (startoffstripe == i) {
-                ststart = startoff;
-            } else {
-                ststart = startoff - (startoff % ci->stripe_length);
-            }
-              
-            if (endoffstripe > i) {
-                stend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
-            } else if (endoffstripe == i) {
-                stend = endoff + 1;
-            } else {
-                stend = endoff - (endoff % ci->stripe_length);
-            }
+        if (!context->tree) {
+            RtlZeroMemory(stripeoff, sizeof(UINT32) * ci->num_stripes / ci->sub_stripes);
             
-            if (ststart != stend) {
-                if (ststart < start) {
-                    start = ststart;
-                    firststripesize = ci->stripe_length - (ststart % ci->stripe_length);
+            // find out which stripe the error was on
+            pos = 0;
+            stripe = startoffstripe / ci->sub_stripes;
+            while (pos < length) {
+                if (pos == 0) {
+                    UINT32 readlen = min(stripeend[stripe * ci->sub_stripes] - stripestart[stripe * ci->sub_stripes], ci->stripe_length - (stripestart[stripe * ci->sub_stripes] % ci->stripe_length));
+                    
+                    stripeoff[stripe] += readlen;
+                    pos += readlen;
+
+                    for (i = 0; i < readlen / Vcb->superblock.sector_size; i++) {
+                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+                    
+                        if (crc32 != context->csum[i])
+                            stripes[stripe]->status = ReadDataStatus_CRCError;
+                    }
+                } else if (length - pos < ci->stripe_length) {
+                    for (i = 0; i < (length - pos) / Vcb->superblock.sector_size; i++) {
+                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + pos + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+                    
+                        if (crc32 != context->csum[(pos / Vcb->superblock.sector_size) + i])
+                            stripes[stripe]->status = ReadDataStatus_CRCError;
+                    }
+                    
+                    pos = length;
+                } else {
+                    stripeoff[stripe] += ci->stripe_length;
+                    
+                    for (i = 0; i < ci->stripe_length / Vcb->superblock.sector_size; i++) {
+                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + pos + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+                    
+                        if (crc32 != context->csum[(pos / Vcb->superblock.sector_size) + i])
+                            stripes[stripe]->status = ReadDataStatus_CRCError;
+                    }
+                    
+                    pos += ci->stripe_length;
                 }
                 
-                if (stend > end)
-                    end = stend;
+                stripe = (stripe + 1) % (ci->num_stripes / ci->sub_stripes);
             }
         }
         
+        context->stripes_left = 0;
+        
         for (i = 0; i < ci->num_stripes; i++) {
-            stripestart[i] = start;
-            stripeend[i] = end;
+            if (context->stripes[i].status == ReadDataStatus_CRCError) {
+                UINT16 other_stripe = (i % 1) ? (i - 1) : (i + 1);
+                
+                if (context->stripes[other_stripe].status == ReadDataStatus_Cancelled) {
+                    PIO_STACK_LOCATION IrpSp;
+                    
+                    // re-run Irp that we cancelled
+                    
+                    if (context->stripes[other_stripe].Irp) {
+                        if (devices[other_stripe]->devobj->Flags & DO_DIRECT_IO) {
+                            MmUnlockPages(context->stripes[other_stripe].Irp->MdlAddress);
+                            IoFreeMdl(context->stripes[other_stripe].Irp->MdlAddress);
+                        }
+                        IoFreeIrp(context->stripes[other_stripe].Irp);
+                    }
+                    
+                    if (!Irp) {
+                        context->stripes[other_stripe].Irp = IoAllocateIrp(devices[other_stripe]->devobj->StackSize, FALSE);
+                        
+                        if (!context->stripes[other_stripe].Irp) {
+                            ERR("IoAllocateIrp failed\n");
+                            return STATUS_INSUFFICIENT_RESOURCES;
+                        }
+                    } else {
+                        context->stripes[other_stripe].Irp = IoMakeAssociatedIrp(Irp, devices[other_stripe]->devobj->StackSize);
+                        
+                        if (!context->stripes[other_stripe].Irp) {
+                            ERR("IoMakeAssociatedIrp failed\n");
+                            return STATUS_INSUFFICIENT_RESOURCES;
+                        }
+                    }
+                    
+                    IrpSp = IoGetNextIrpStackLocation(context->stripes[other_stripe].Irp);
+                    IrpSp->MajorFunction = IRP_MJ_READ;
+                    
+                    if (devices[other_stripe]->devobj->Flags & DO_BUFFERED_IO) {
+                        FIXME("FIXME - buffered IO\n");
+                    } else if (devices[other_stripe]->devobj->Flags & DO_DIRECT_IO) {
+                        context->stripes[other_stripe].Irp->MdlAddress = IoAllocateMdl(context->stripes[other_stripe].buf, stripeend[other_stripe] - stripestart[other_stripe], FALSE, FALSE, NULL);
+                        if (!context->stripes[other_stripe].Irp->MdlAddress) {
+                            ERR("IoAllocateMdl failed\n");
+                            return STATUS_INSUFFICIENT_RESOURCES;
+                        }
+                        
+                        MmProbeAndLockPages(context->stripes[other_stripe].Irp->MdlAddress, KernelMode, IoWriteAccess);
+                    } else {
+                        context->stripes[other_stripe].Irp->UserBuffer = context->stripes[other_stripe].buf;
+                    }
+
+                    IrpSp->Parameters.Read.Length = stripeend[other_stripe] - stripestart[other_stripe];
+                    IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[other_stripe] + cis[other_stripe].offset;
+                    
+                    context->stripes[other_stripe].Irp->UserIosb = &context->stripes[other_stripe].iosb;
+                    
+                    IoSetCompletionRoutine(context->stripes[other_stripe].Irp, read_data_completion, &context->stripes[other_stripe], TRUE, TRUE, TRUE);
+                    
+                    context->stripes_left++;
+                    context->stripes[other_stripe].status = ReadDataStatus_Pending;
+                }
+            }
         }
         
-        context->stripes_cancel = Vcb->options.raid5_recalculation;
-    } else if (type == BLOCK_FLAG_RAID6) {
-        UINT64 startoff, endoff;
-        UINT16 endoffstripe;
-        UINT64 start = 0xffffffffffffffff, end = 0;
+        if (context->stripes_left == 0) {
+            WARN("could not recover from checksum error\n");
+            ExFreePool(stripes);
+            ExFreePool(stripeoff);
+            return STATUS_CRC_ERROR;
+        }
         
-        get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 2, &startoff, &startoffstripe);
-        get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 2, &endoff, &endoffstripe);
+        context->stripes_cancel = 0;
+        KeClearEvent(&context->Event);
         
-        for (i = 0; i < ci->num_stripes - 2; i++) {
-            UINT64 ststart, stend;
-            
-            if (startoffstripe > i) {
-                ststart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
-            } else if (startoffstripe == i) {
-                ststart = startoff;
-            } else {
-                ststart = startoff - (startoff % ci->stripe_length);
-            }
-              
-            if (endoffstripe > i) {
-                stend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
-            } else if (endoffstripe == i) {
-                stend = endoff + 1;
-            } else {
-                stend = endoff - (endoff % ci->stripe_length);
+#ifdef DEBUG_STATS
+        if (!context->tree)
+            time1 = KeQueryPerformanceCounter(NULL);
+#endif
+
+        for (i = 0; i < ci->num_stripes; i++) {
+            if (context->stripes[i].status == ReadDataStatus_Pending) {
+                IoCallDriver(devices[i]->devobj, context->stripes[i].Irp);
             }
+        }
+        
+        KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
+        
+#ifdef DEBUG_STATS
+        if (!context->tree) {
+            time2 = KeQueryPerformanceCounter(NULL);
             
-            if (ststart != stend) {
-                if (ststart < start) {
-                    start = ststart;
-                    firststripesize = ci->stripe_length - (ststart % ci->stripe_length);
+            Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
+        }
+#endif
+
+        for (i = 0; i < ci->num_stripes; i++) {
+            if (context->stripes[i].status == ReadDataStatus_CRCError) {
+                UINT16 other_stripe = (i % 1) ? (i - 1) : (i + 1);
+                
+                if (context->stripes[other_stripe].status != ReadDataStatus_Success) {
+                    WARN("could not recover from checksum error\n");
+                    ExFreePool(stripes);
+                    ExFreePool(stripeoff);
+                    return STATUS_CRC_ERROR;
                 }
+            }
+        }
+
+        RtlZeroMemory(stripeoff, sizeof(UINT32) * ci->num_stripes / ci->sub_stripes);
+    
+        pos = 0;
+        stripe = startoffstripe / ci->sub_stripes;
+        while (pos < length) {
+            if (pos == 0) {
+                UINT32 readlen = min(stripeend[stripe * ci->sub_stripes] - stripestart[stripe * ci->sub_stripes], ci->stripe_length - (stripestart[stripe * ci->sub_stripes] % ci->stripe_length));
                 
-                if (stend > end)
-                    end = stend;
+                stripeoff[stripe] += readlen;
+                pos += readlen;
+                
+                if (context->csum && stripes[stripe]->status == ReadDataStatus_CRCError) {
+                    for (i = 0; i < readlen / Vcb->superblock.sector_size; i++) {
+                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+                    
+                        if (crc32 != context->csum[i]) {
+                            UINT16 other_stripe = (stripe * ci->sub_stripes) + (context->stripes[stripe * ci->sub_stripes].status == ReadDataStatus_CRCError ? 1 : 0);
+                            UINT32 crc32b = ~calc_crc32c(0xffffffff, context->stripes[other_stripe].buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+                            
+                            if (crc32b == context->csum[i]) {
+                                RtlCopyMemory(buf + (i * Vcb->superblock.sector_size), context->stripes[other_stripe].buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+                                RtlCopyMemory(stripes[stripe]->buf + (i * Vcb->superblock.sector_size), context->stripes[other_stripe].buf + (i * Vcb->superblock.sector_size),
+                                                Vcb->superblock.sector_size);
+                                stripes[stripe]->rewrite = TRUE;
+                            } else {
+                                WARN("could not recover from checksum error\n");
+                                ExFreePool(stripes);
+                                ExFreePool(stripeoff);
+                                return STATUS_CRC_ERROR;
+                            }
+                        }
+                    }
+                } else if (context->tree) {
+                    UINT16 other_stripe = (stripe * ci->sub_stripes) + (context->stripes[stripe * ci->sub_stripes].status == ReadDataStatus_CRCError ? 1 : 0);
+                    tree_header* th = (tree_header*)buf;
+                    UINT32 crc32;
+                    
+                    RtlCopyMemory(buf, context->stripes[other_stripe].buf, readlen);
+                    
+                    crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
+                    
+                    if (addr != th->address || crc32 != *((UINT32*)th->csum)) {
+                        WARN("could not recover from checksum error\n");
+                        ExFreePool(stripes);
+                        ExFreePool(stripeoff);
+                        return STATUS_CRC_ERROR;
+                    }
+                    
+                    RtlCopyMemory(stripes[stripe]->buf, buf, readlen);
+                    stripes[stripe]->rewrite = TRUE;
+                }
+            } else if (length - pos < ci->stripe_length) {
+                if (context->csum && stripes[stripe]->status == ReadDataStatus_CRCError) {
+                    for (i = 0; i < (length - pos) / Vcb->superblock.sector_size; i++) {
+                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + pos + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+                    
+                        if (crc32 != context->csum[(pos / Vcb->superblock.sector_size) + i]) {
+                            UINT16 other_stripe = (stripe * ci->sub_stripes) + (context->stripes[stripe * ci->sub_stripes].status == ReadDataStatus_CRCError ? 1 : 0);
+                            UINT32 crc32b = ~calc_crc32c(0xffffffff, &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
+                                                            Vcb->superblock.sector_size);
+                            
+                            if (crc32b == context->csum[i]) {
+                                RtlCopyMemory(buf + pos + (i * Vcb->superblock.sector_size),
+                                                &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size);
+                                RtlCopyMemory(&stripes[stripe]->buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
+                                                &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
+                                                Vcb->superblock.sector_size);
+                                stripes[stripe]->rewrite = TRUE;
+                            } else {
+                                WARN("could not recover from checksum error\n");
+                                ExFreePool(stripes);
+                                ExFreePool(stripeoff);
+                                return STATUS_CRC_ERROR;
+                            }
+                        }
+                    }
+                }
+                
+                pos = length;
+            } else {
+                if (context->csum && stripes[stripe]->status == ReadDataStatus_CRCError) {
+                    for (i = 0; i < ci->stripe_length / Vcb->superblock.sector_size; i++) {
+                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + pos + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
+                    
+                        if (crc32 != context->csum[(pos / Vcb->superblock.sector_size) + i]) {
+                            UINT16 other_stripe = (stripe * ci->sub_stripes) + (context->stripes[stripe * ci->sub_stripes].status == ReadDataStatus_CRCError ? 1 : 0);
+                            UINT32 crc32b = ~calc_crc32c(0xffffffff, &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
+                                                            Vcb->superblock.sector_size);
+                            
+                            if (crc32b == context->csum[i]) {
+                                RtlCopyMemory(buf + pos + (i * Vcb->superblock.sector_size),
+                                                &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size);
+                                RtlCopyMemory(&stripes[stripe]->buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
+                                                &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
+                                                Vcb->superblock.sector_size);
+                                stripes[stripe]->rewrite = TRUE;
+                            } else {
+                                WARN("could not recover from checksum error\n");
+                                ExFreePool(stripes);
+                                ExFreePool(stripeoff);
+                                return STATUS_CRC_ERROR;
+                            }
+                        }
+                    }
+                }
+                
+                stripeoff[stripe] += ci->stripe_length;
+                pos += ci->stripe_length;
             }
+            
+            stripe = (stripe + 1) % (ci->num_stripes / ci->sub_stripes);
         }
         
-        for (i = 0; i < ci->num_stripes; i++) {
-            stripestart[i] = start;
-            stripeend[i] = end;
-        }
+        // write good data over bad
         
-        context->stripes_cancel = Vcb->options.raid6_recalculation;
+        if (!Vcb->readonly) {
+            for (i = 0; i < ci->num_stripes; i++) {
+                if (context->stripes[i].rewrite && devices[i] && !devices[i]->readonly) {
+                    Status = write_data_phys(devices[i]->devobj, cis[i].offset + stripestart[i], context->stripes[i].buf, stripeend[i] - stripestart[i]);
+                    
+                    if (!NT_SUCCESS(Status))
+                        WARN("write_data_phys returned %08x\n", Status);
+                }
+            }
+        }
     }
     
-    KeInitializeSpinLock(&context->spin_lock);
+    ExFreePool(stripes);
+    ExFreePool(stripeoff);
     
-    context->address = addr;
+    // FIXME - handle the case where one of the stripes doesn't read everything, i.e. Irp->IoStatus.Information is short
+    
+    return STATUS_SUCCESS;
+}
+
+static NTSTATUS read_data_raid5(device_extension* Vcb, UINT8* buf, UINT64 addr, UINT32 length, PIRP Irp, read_data_context* context, CHUNK_ITEM* ci,
+                                device** devices, UINT64* stripestart, UINT64* stripeend, UINT64 offset, UINT32 firststripesize, BOOL check_nocsum_parity) {
+    UINT32 pos, skip;
+    NTSTATUS Status;
+    int num_errors = 0;
+    UINT64 i, off, stripeoff, origoff;
+    BOOL needs_reconstruct = FALSE;
+    UINT64 reconstruct_stripe;
+    BOOL checksum_error = FALSE;
     
     for (i = 0; i < ci->num_stripes; i++) {
-        if (!devices[i] || stripestart[i] == stripeend[i]) {
-            context->stripes[i].status = ReadDataStatus_MissingDevice;
-            context->stripes[i].buf = NULL;
-            context->stripes_left--;
-            
-            if (!devices[i])
-                missing_devices++;
+        if (context->stripes[i].status == ReadDataStatus_Error) {
+            num_errors++;
+            if (num_errors > 1)
+                break;
         }
     }
-      
-    if (missing_devices > allowed_missing) {
-        ERR("not enough devices to service request (%u missing)\n", missing_devices);
-        Status = STATUS_UNEXPECTED_IO_ERROR;
-        goto exit;
+    
+    if (num_errors > 1) {
+        for (i = 0; i < ci->num_stripes; i++) {
+            if (context->stripes[i].status == ReadDataStatus_Error) {
+                WARN("stripe %llu returned error %08x\n", i, context->stripes[i].iosb.Status); 
+                return context->stripes[i].iosb.Status;
+            }
+        }
     }
     
+    off = addr - offset;
+    off -= off % ((ci->num_stripes - 1) * ci->stripe_length);
+    skip = addr - offset - off;
+    origoff = off;
+    
     for (i = 0; i < ci->num_stripes; i++) {
-        PIO_STACK_LOCATION IrpSp;
+        if (context->stripes[i].status == ReadDataStatus_Cancelled) {
+            if (needs_reconstruct) {
+                ERR("more than one stripe needs reconstruction\n");
+                return STATUS_INTERNAL_ERROR;
+            } else {
+                needs_reconstruct = TRUE;
+                reconstruct_stripe = i;
+            }
+        }
+    }
+    
+    if (needs_reconstruct) {
+        TRACE("reconstructing stripe %u\n", reconstruct_stripe);
         
-        if (devices[i] && stripestart[i] != stripeend[i]) {
-            context->stripes[i].context = (struct read_data_context*)context;
-            context->stripes[i].buf = ExAllocatePoolWithTag(NonPagedPool, stripeend[i] - stripestart[i], ALLOC_TAG);
+        stripeoff = 0;
+        
+        raid5_reconstruct(off, skip, context, ci, &stripeoff, stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe], TRUE, firststripesize, reconstruct_stripe);
+        
+        while (stripeoff < stripeend[0] - stripestart[0]) {
+            off += (ci->num_stripes - 1) * ci->stripe_length;
+            raid5_reconstruct(off, 0, context, ci, &stripeoff, stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe], FALSE, 0, reconstruct_stripe);
+        }
+        
+        off = addr - offset;
+        off -= off % ((ci->num_stripes - 1) * ci->stripe_length);
+    }
+    
+    pos = 0;
+    stripeoff = 0;
+    raid5_decode(off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize);
+    
+    while (pos < length) {
+        off += (ci->num_stripes - 1) * ci->stripe_length;
+        raid5_decode(off, 0, context, ci, &stripeoff, buf, &pos, length, 0);
+    }
+    
+    if (context->tree) {
+        tree_header* th = (tree_header*)buf;
+        UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
+        
+        if (addr != th->address || crc32 != *((UINT32*)th->csum))
+            checksum_error = TRUE;
+    } else if (context->csum) {
+#ifdef DEBUG_STATS
+        LARGE_INTEGER time1, time2;
+        
+        time1 = KeQueryPerformanceCounter(NULL);
+#endif
+        Status = check_csum(Vcb, buf, length / Vcb->superblock.sector_size, context->csum);
+        
+        if (Status == STATUS_CRC_ERROR) {
+            WARN("checksum error\n");
+            checksum_error = TRUE;
+        } else if (!NT_SUCCESS(Status)) {
+            ERR("check_csum returned %08x\n", Status);
+            return Status;
+        }
+
+#ifdef DEBUG_STATS
+        time2 = KeQueryPerformanceCounter(NULL);
+        
+        Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
+#endif
+    }
+    
+    if (checksum_error) {
+        CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
+        
+        if (needs_reconstruct) {
+            PIO_STACK_LOCATION IrpSp;
+#ifdef DEBUG_STATS
+            LARGE_INTEGER time1, time2;
+#endif
             
-            if (!context->stripes[i].buf) {
-                ERR("out of memory\n");
-                Status = STATUS_INSUFFICIENT_RESOURCES;
-                goto exit;
-            }
+            // re-run Irp that we cancelled
             
-            if (type == BLOCK_FLAG_RAID10) {
-                context->stripes[i].stripenum = i / ci->sub_stripes;
+            if (context->stripes[reconstruct_stripe].Irp) {
+                if (devices[reconstruct_stripe]->devobj->Flags & DO_DIRECT_IO) {
+                    MmUnlockPages(context->stripes[reconstruct_stripe].Irp->MdlAddress);
+                    IoFreeMdl(context->stripes[reconstruct_stripe].Irp->MdlAddress);
+                }
+                IoFreeIrp(context->stripes[reconstruct_stripe].Irp);
             }
-
+            
             if (!Irp) {
-                context->stripes[i].Irp = IoAllocateIrp(devices[i]->devobj->StackSize, FALSE);
+                context->stripes[reconstruct_stripe].Irp = IoAllocateIrp(devices[reconstruct_stripe]->devobj->StackSize, FALSE);
                 
-                if (!context->stripes[i].Irp) {
+                if (!context->stripes[reconstruct_stripe].Irp) {
                     ERR("IoAllocateIrp failed\n");
-                    Status = STATUS_INSUFFICIENT_RESOURCES;
-                    goto exit;
+                    return STATUS_INSUFFICIENT_RESOURCES;
                 }
             } else {
-                context->stripes[i].Irp = IoMakeAssociatedIrp(Irp, devices[i]->devobj->StackSize);
+                context->stripes[reconstruct_stripe].Irp = IoMakeAssociatedIrp(Irp, devices[reconstruct_stripe]->devobj->StackSize);
                 
-                if (!context->stripes[i].Irp) {
+                if (!context->stripes[reconstruct_stripe].Irp) {
                     ERR("IoMakeAssociatedIrp failed\n");
-                    Status = STATUS_INSUFFICIENT_RESOURCES;
-                    goto exit;
+                    return STATUS_INSUFFICIENT_RESOURCES;
                 }
             }
             
-            IrpSp = IoGetNextIrpStackLocation(context->stripes[i].Irp);
+            IrpSp = IoGetNextIrpStackLocation(context->stripes[reconstruct_stripe].Irp);
             IrpSp->MajorFunction = IRP_MJ_READ;
             
-            if (devices[i]->devobj->Flags & DO_BUFFERED_IO) {
+            if (devices[reconstruct_stripe]->devobj->Flags & DO_BUFFERED_IO) {
                 FIXME("FIXME - buffered IO\n");
-            } else if (devices[i]->devobj->Flags & DO_DIRECT_IO) {
-                context->stripes[i].Irp->MdlAddress = IoAllocateMdl(context->stripes[i].buf, stripeend[i] - stripestart[i], FALSE, FALSE, NULL);
-                if (!context->stripes[i].Irp->MdlAddress) {
+            } else if (devices[reconstruct_stripe]->devobj->Flags & DO_DIRECT_IO) {
+                context->stripes[reconstruct_stripe].Irp->MdlAddress = IoAllocateMdl(context->stripes[reconstruct_stripe].buf,
+                                                                                        stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe], FALSE, FALSE, NULL);
+                if (!context->stripes[reconstruct_stripe].Irp->MdlAddress) {
                     ERR("IoAllocateMdl failed\n");
-                    Status = STATUS_INSUFFICIENT_RESOURCES;
-                    goto exit;
+                    return STATUS_INSUFFICIENT_RESOURCES;
                 }
                 
-                MmProbeAndLockPages(context->stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
+                MmProbeAndLockPages(context->stripes[reconstruct_stripe].Irp->MdlAddress, KernelMode, IoWriteAccess);
             } else {
-                context->stripes[i].Irp->UserBuffer = context->stripes[i].buf;
+                context->stripes[reconstruct_stripe].Irp->UserBuffer = context->stripes[reconstruct_stripe].buf;
             }
 
-            IrpSp->Parameters.Read.Length = stripeend[i] - stripestart[i];
-            IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[i] + cis[i].offset;
+            IrpSp->Parameters.Read.Length = stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe];
+            IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[reconstruct_stripe] + cis[reconstruct_stripe].offset;
             
-            context->stripes[i].Irp->UserIosb = &context->stripes[i].iosb;
+            context->stripes[reconstruct_stripe].Irp->UserIosb = &context->stripes[reconstruct_stripe].iosb;
             
-            IoSetCompletionRoutine(context->stripes[i].Irp, read_data_completion, &context->stripes[i], TRUE, TRUE, TRUE);
+            IoSetCompletionRoutine(context->stripes[reconstruct_stripe].Irp, read_data_completion, &context->stripes[reconstruct_stripe], TRUE, TRUE, TRUE);
 
-            context->stripes[i].status = ReadDataStatus_Pending;
-        }
-    }
-    
+            context->stripes[reconstruct_stripe].status = ReadDataStatus_Pending;
+            
+            context->stripes_left = 1;
+            KeClearEvent(&context->Event);
+            
 #ifdef DEBUG_STATS
-    if (!is_tree)
-        time1 = KeQueryPerformanceCounter(NULL);
+            if (!context->tree)
+                time1 = KeQueryPerformanceCounter(NULL);
 #endif
-    
-    for (i = 0; i < ci->num_stripes; i++) {
-        if (context->stripes[i].status != ReadDataStatus_MissingDevice) {
-            IoCallDriver(devices[i]->devobj, context->stripes[i].Irp);
-        }
-    }
 
-    KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
-   
+            IoCallDriver(devices[reconstruct_stripe]->devobj, context->stripes[reconstruct_stripe].Irp);
+            
+            KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
+            
 #ifdef DEBUG_STATS
-    if (!is_tree) {
-        time2 = KeQueryPerformanceCounter(NULL);
-        
-        Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
-    }
-#endif
-    
-    // check if any of the devices return a "user-induced" error
-    
-    for (i = 0; i < ci->num_stripes; i++) {
-        if (context->stripes[i].status == ReadDataStatus_Error && IoIsErrorUserInduced(context->stripes[i].iosb.Status)) {
-            if (Irp && context->stripes[i].iosb.Status == STATUS_VERIFY_REQUIRED) {
-                PDEVICE_OBJECT dev;
-                
-                dev = IoGetDeviceToVerify(Irp->Tail.Overlay.Thread);
-                IoSetDeviceToVerify(Irp->Tail.Overlay.Thread, NULL);
-                
-                if (!dev) {
-                    dev = IoGetDeviceToVerify(PsGetCurrentThread());
-                    IoSetDeviceToVerify(PsGetCurrentThread(), NULL);
-                }
-                
-                dev = Vcb->Vpb ? Vcb->Vpb->RealDevice : NULL;
+            if (!context->tree) {
+                time2 = KeQueryPerformanceCounter(NULL);
                 
-                if (dev)
-                    IoVerifyVolume(dev, FALSE);
+                Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
             }
-//             IoSetHardErrorOrVerifyDevice(context->stripes[i].Irp, devices[i]->devobj);
-            
-            Status = context->stripes[i].iosb.Status;
-            goto exit;
-        }
-    }
-    
-    if (type == BLOCK_FLAG_RAID0) {
-        UINT32 pos, *stripeoff;
-        UINT8 stripe;
-        
-        for (i = 0; i < ci->num_stripes; i++) {
-            if (context->stripes[i].status == ReadDataStatus_Error) {
-                WARN("stripe %llu returned error %08x\n", i, context->stripes[i].iosb.Status); 
-                Status = context->stripes[i].iosb.Status;
-                goto exit;
+#endif
+
+            if (context->stripes[reconstruct_stripe].status != ReadDataStatus_Success) {
+                ERR("unrecoverable checksum error\n");
+                return STATUS_CRC_ERROR;
             }
         }
         
-        pos = 0;
-        stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT32) * ci->num_stripes, ALLOC_TAG);
-        if (!stripeoff) {
-            ERR("out of memory\n");
-            Status = STATUS_INSUFFICIENT_RESOURCES;
-            goto exit;
+        if (context->tree) {
+            off = origoff;
+            pos = 0;
+            stripeoff = 0;
+            if (!raid5_decode_with_checksum_metadata(addr, off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize, Vcb->superblock.node_size)) {
+                ERR("unrecoverable metadata checksum error\n");
+                return STATUS_CRC_ERROR;
+            }
+        } else {
+            off = origoff;
+            pos = 0;
+            stripeoff = 0;
+            if (!raid5_decode_with_checksum(off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize, context->csum, Vcb->superblock.sector_size))
+                return STATUS_CRC_ERROR;
+            
+            while (pos < length) {
+                off += (ci->num_stripes - 1) * ci->stripe_length;
+                if (!raid5_decode_with_checksum(off, 0, context, ci, &stripeoff, buf, &pos, length, 0, context->csum, Vcb->superblock.sector_size))
+                    return STATUS_CRC_ERROR;
+            }
         }
         
-        RtlZeroMemory(stripeoff, sizeof(UINT32) * ci->num_stripes);
+        // write good data over bad
         
-        stripe = startoffstripe;
-        while (pos < length) {
-            if (pos == 0) {
-                UINT32 readlen = min(stripeend[stripe] - stripestart[stripe], ci->stripe_length - (stripestart[stripe] % ci->stripe_length));
-                
-                RtlCopyMemory(buf, context->stripes[stripe].buf, readlen);
-                stripeoff[stripe] += readlen;
-                pos += readlen;
-            } else if (length - pos < ci->stripe_length) {
-                RtlCopyMemory(buf + pos, &context->stripes[stripe].buf[stripeoff[stripe]], length - pos);
-                pos = length;
-            } else {
-                RtlCopyMemory(buf + pos, &context->stripes[stripe].buf[stripeoff[stripe]], ci->stripe_length);
-                stripeoff[stripe] += ci->stripe_length;
-                pos += ci->stripe_length;
+        if (!Vcb->readonly) {
+            for (i = 0; i < ci->num_stripes; i++) {
+                if (context->stripes[i].rewrite && devices[i] && !devices[i]->readonly) {
+                    Status = write_data_phys(devices[i]->devobj, cis[i].offset + stripestart[i], context->stripes[i].buf, stripeend[i] - stripestart[i]);
+                    
+                    if (!NT_SUCCESS(Status))
+                        WARN("write_data_phys returned %08x\n", Status);
+                }
             }
-            
-            stripe = (stripe + 1) % ci->num_stripes;
         }
+    }
+    
+    if (check_nocsum_parity && !context->tree && !context->csum) {
+        UINT32* parity_buf;
         
-        ExFreePool(stripeoff);
+        // We are reading a nodatacsum extent. Even though there's no checksum, we
+        // can still identify errors by checking if the parity is consistent.
         
-        // FIXME - handle the case where one of the stripes doesn't read everything, i.e. Irp->IoStatus.Information is short
+        parity_buf = ExAllocatePoolWithTag(NonPagedPool, stripeend[0] - stripestart[0], ALLOC_TAG);
         
-        if (is_tree) { // shouldn't happen, as trees shouldn't cross stripe boundaries
-            tree_header* th = (tree_header*)buf;
-            UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
-            
-            if (addr != th->address || crc32 != *((UINT32*)th->csum)) {
-                WARN("crc32 was %08x, expected %08x\n", crc32, *((UINT32*)th->csum));
-                Status = STATUS_CRC_ERROR;
-                goto exit;
-            }
-        } else if (csum) {
-#ifdef DEBUG_STATS
-            time1 = KeQueryPerformanceCounter(NULL);
-#endif
-            for (i = 0; i < length / Vcb->superblock.sector_size; i++) {
-                UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                
-                if (crc32 != csum[i]) {
-                    WARN("checksum error (%08x != %08x)\n", crc32, csum[i]);
-                    Status = STATUS_CRC_ERROR;
-                    goto exit;
-                }
+        if (!parity_buf) {
+            ERR("out of memory\n");
+            return STATUS_INSUFFICIENT_RESOURCES;
+        }
+        
+        RtlCopyMemory(parity_buf, context->stripes[0].buf, stripeend[0] - stripestart[0]);
+        
+        for (i = 0; i < ci->num_stripes; i++) {
+            do_xor((UINT8*)parity_buf, context->stripes[i].buf, stripeend[0] - stripestart[0]);
+        }
+        
+        for (i = 0; i < (stripeend[0] - stripestart[0]) / sizeof(UINT32); i++) {
+            if (parity_buf[i] != 0) {
+                ERR("parity error on nodatacsum inode\n");
+                ExFreePool(parity_buf);
+                return STATUS_CRC_ERROR;
             }
-#ifdef DEBUG_STATS
-            time2 = KeQueryPerformanceCounter(NULL);
-            
-            Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
-#endif
         }
         
-        Status = STATUS_SUCCESS;
-    } else if (type == BLOCK_FLAG_RAID10) {
-        BOOL checksum_error = FALSE;
-        UINT32 pos, *stripeoff;
-        UINT8 stripe;
-        read_data_stripe** stripes;
+        ExFreePool(parity_buf);
+    }
+    
+    return STATUS_SUCCESS;
+}
 
-        stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
-        if (!stripes) {
-            ERR("out of memory\n");
-            Status = STATUS_INSUFFICIENT_RESOURCES;
-            goto exit;
+static NTSTATUS read_data_raid6(device_extension* Vcb, UINT8* buf, UINT64 addr, UINT32 length, PIRP Irp, read_data_context* context, CHUNK_ITEM* ci,
+                                device** devices, UINT64* stripestart, UINT64* stripeend, UINT64 offset, UINT32 firststripesize, BOOL check_nocsum_parity) {
+    NTSTATUS Status;
+    UINT32 pos, skip;
+    int num_errors = 0;
+    UINT64 i, off, stripeoff, origoff;
+    UINT8 needs_reconstruct = 0;
+    UINT16 missing1, missing2;
+    BOOL checksum_error = FALSE;
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (context->stripes[i].status == ReadDataStatus_Error) {
+            num_errors++;
+            if (num_errors > 2)
+                break;
+        }
+    }
+    
+    if (num_errors > 2) {
+        for (i = 0; i < ci->num_stripes; i++) {
+            if (context->stripes[i].status == ReadDataStatus_Error) {
+                WARN("stripe %llu returned error %08x\n", i, context->stripes[i].iosb.Status); 
+                return context->stripes[i].iosb.Status;
+            }
+        }
+    }
+    
+    off = addr - offset;
+    off -= off % ((ci->num_stripes - 2) * ci->stripe_length);
+    skip = addr - offset - off;
+    origoff = off;
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (context->stripes[i].status == ReadDataStatus_Cancelled) {
+            if (needs_reconstruct == 2) {
+                ERR("more than two stripes need reconstruction\n");
+                return STATUS_INTERNAL_ERROR;
+            } else if (needs_reconstruct == 1) {
+                needs_reconstruct++;
+                missing2 = i;
+            } else {
+                needs_reconstruct++;
+                missing1 = i;
+            }
         }
+    }
+    
+    if (needs_reconstruct > 0) {
+        stripeoff = 0;
         
-        RtlZeroMemory(stripes, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes);
+        if (needs_reconstruct == 2) {
+            TRACE("reconstructing stripes %u and %u\n", missing1, missing2);
         
-        for (i = 0; i < ci->num_stripes; i += ci->sub_stripes) {
-            UINT16 j;
+            raid6_reconstruct2(off, skip, context, ci, &stripeoff, stripeend[missing1] - stripestart[missing1],
+                                TRUE, firststripesize, missing1, missing2);
             
-            for (j = 0; j < ci->sub_stripes; j++) {
-                if (context->stripes[i+j].status == ReadDataStatus_Success) {
-                    stripes[i / ci->sub_stripes] = &context->stripes[i+j];
-                    break;
-                }
+            while (stripeoff < stripeend[0] - stripestart[0]) {
+                off += (ci->num_stripes - 2) * ci->stripe_length;
+                raid6_reconstruct2(off, 0, context, ci, &stripeoff, stripeend[missing1] - stripestart[missing1],
+                                    FALSE, 0, missing1, missing2);
             }
+        } else {
+            TRACE("reconstructing stripe %u\n", missing1);
             
-            if (!stripes[i / ci->sub_stripes]) {
-                for (j = 0; j < ci->sub_stripes; j++) {
-                    if (context->stripes[i+j].status == ReadDataStatus_Error) {
-                        // both stripes must have errored if we get here
-                        WARN("stripe %llu returned error %08x\n", i+j, context->stripes[i+j].iosb.Status);
-                        Status = context->stripes[i].iosb.Status;
-                        ExFreePool(stripes);
-                        goto exit;
-                    }
-                }
+            raid6_reconstruct1(off, skip, context, ci, &stripeoff, stripeend[missing1] - stripestart[missing1], TRUE, firststripesize, missing1);
+            
+            while (stripeoff < stripeend[0] - stripestart[0]) {
+                off += (ci->num_stripes - 2) * ci->stripe_length;
+                raid6_reconstruct1(off, 0, context, ci, &stripeoff, stripeend[missing1] - stripestart[missing1], FALSE, 0, missing1);
             }
         }
         
-        pos = 0;
-        stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT32) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
-        if (!stripeoff) {
+        off = origoff;
+    }
+    
+    if (check_nocsum_parity && !context->tree && !context->csum) {
+        UINT8* scratch;
+        
+        scratch = ExAllocatePoolWithTag(NonPagedPool, ci->stripe_length, ALLOC_TAG);
+        if (!scratch) {
             ERR("out of memory\n");
-            Status = STATUS_INSUFFICIENT_RESOURCES;
-            ExFreePool(stripes);
-            goto exit;
+            return STATUS_INSUFFICIENT_RESOURCES;
         }
         
-        RtlZeroMemory(stripeoff, sizeof(UINT32) * ci->num_stripes / ci->sub_stripes);
+        stripeoff = 0;
+        Status = check_raid6_nocsum_parity(off, skip, context, ci, &stripeoff, stripeend[0] - stripestart[0], TRUE, firststripesize, scratch);
+        if (!NT_SUCCESS(Status)) {
+            ERR("check_raid6_nocsum_parity returned %08x\n", Status);
+            ExFreePool(scratch);
+            return Status;
+        }
+            
+        while (stripeoff < stripeend[0] - stripestart[0]) {
+            off += (ci->num_stripes - 2) * ci->stripe_length;
+            Status = check_raid6_nocsum_parity(off, 0, context, ci, &stripeoff, stripeend[0] - stripestart[0], FALSE, 0, scratch);
+            
+            if (!NT_SUCCESS(Status)) {
+                ERR("check_raid6_nocsum_parity returned %08x\n", Status);
+                ExFreePool(scratch);
+                return Status;
+            }
+        }
         
-        stripe = startoffstripe / ci->sub_stripes;
-        while (pos < length) {
-            if (pos == 0) {
-                UINT32 readlen = min(stripeend[stripe * ci->sub_stripes] - stripestart[stripe * ci->sub_stripes], ci->stripe_length - (stripestart[stripe * ci->sub_stripes] % ci->stripe_length));
-                
-                RtlCopyMemory(buf, stripes[stripe]->buf, readlen);
-                stripeoff[stripe] += readlen;
-                pos += readlen;
-                
-                if (context->csum) {
+        ExFreePool(scratch);
+        
+        off = origoff;
+    }
+    
+    pos = 0;
+    stripeoff = 0;
+    raid6_decode(off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize);
+    
+    while (pos < length) {
+        off += (ci->num_stripes - 2) * ci->stripe_length;
+        raid6_decode(off, 0, context, ci, &stripeoff, buf, &pos, length, 0);
+    }
+    
+    if (context->tree) {
+        tree_header* th = (tree_header*)buf;
+        UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
+        
+        if (addr != th->address || crc32 != *((UINT32*)th->csum))
+            checksum_error = TRUE;
+    } else if (context->csum) {
 #ifdef DEBUG_STATS
-                    time1 = KeQueryPerformanceCounter(NULL);
+        LARGE_INTEGER time1, time2;
+        
+        time1 = KeQueryPerformanceCounter(NULL);
 #endif
-                    for (i = 0; i < readlen / Vcb->superblock.sector_size; i++) {
-                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                    
-                        if (crc32 != csum[i]) {
-                            checksum_error = TRUE;
-                            stripes[stripe]->status = ReadDataStatus_CRCError;
-                        }
-                    }
+        Status = check_csum(Vcb, buf, length / Vcb->superblock.sector_size, context->csum);
+        
+        if (Status == STATUS_CRC_ERROR) {
+            WARN("checksum error\n");
+            checksum_error = TRUE;
+        } else if (!NT_SUCCESS(Status)) {
+            ERR("check_csum returned %08x\n", Status);
+            return Status;
+        }
 #ifdef DEBUG_STATS
-                    time2 = KeQueryPerformanceCounter(NULL);
-                    
-                    Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
+        time2 = KeQueryPerformanceCounter(NULL);
+        
+        Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
 #endif
+    }
+    
+    if (checksum_error) {
+        CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
+        
+        for (i = 0; i < needs_reconstruct; i++) {
+            PIO_STACK_LOCATION IrpSp;
+            UINT16 reconstruct_stripe = i == 0 ? missing1 : missing2;
+            
+            // re-run Irps that we cancelled
+            
+            if (context->stripes[reconstruct_stripe].Irp) {
+                if (devices[reconstruct_stripe]->devobj->Flags & DO_DIRECT_IO) {
+                    MmUnlockPages(context->stripes[reconstruct_stripe].Irp->MdlAddress);
+                    IoFreeMdl(context->stripes[reconstruct_stripe].Irp->MdlAddress);
                 }
-            } else if (length - pos < ci->stripe_length) {
-                RtlCopyMemory(buf + pos, &stripes[stripe]->buf[stripeoff[stripe]], length - pos);
+                IoFreeIrp(context->stripes[reconstruct_stripe].Irp);
+            }
+            
+            if (!Irp) {
+                context->stripes[reconstruct_stripe].Irp = IoAllocateIrp(devices[reconstruct_stripe]->devobj->StackSize, FALSE);
                 
-                if (context->csum) {
-#ifdef DEBUG_STATS
-                    time1 = KeQueryPerformanceCounter(NULL);
-#endif
-                    for (i = 0; i < (length - pos) / Vcb->superblock.sector_size; i++) {
-                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + pos + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                    
-                        if (crc32 != csum[(pos / Vcb->superblock.sector_size) + i]) {
-                            checksum_error = TRUE;
-                            stripes[stripe]->status = ReadDataStatus_CRCError;
-                        }
-                    }
-#ifdef DEBUG_STATS
-                    time2 = KeQueryPerformanceCounter(NULL);
-                    
-                    Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
-#endif
+                if (!context->stripes[reconstruct_stripe].Irp) {
+                    ERR("IoAllocateIrp failed\n");
+                    return STATUS_INSUFFICIENT_RESOURCES;
                 }
-                
-                pos = length;
             } else {
-                RtlCopyMemory(buf + pos, &stripes[stripe]->buf[stripeoff[stripe]], ci->stripe_length);
-                stripeoff[stripe] += ci->stripe_length;
+                context->stripes[reconstruct_stripe].Irp = IoMakeAssociatedIrp(Irp, devices[reconstruct_stripe]->devobj->StackSize);
                 
-                if (context->csum) {
-#ifdef DEBUG_STATS
-                    time1 = KeQueryPerformanceCounter(NULL);
-#endif
-                    for (i = 0; i < ci->stripe_length / Vcb->superblock.sector_size; i++) {
-                        UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + pos + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                    
-                        if (crc32 != csum[(pos / Vcb->superblock.sector_size) + i]) {
-                            checksum_error = TRUE;
-                            stripes[stripe]->status = ReadDataStatus_CRCError;
-                        }
-                    }
-#ifdef DEBUG_STATS
-                    time2 = KeQueryPerformanceCounter(NULL);
-                    
-                    Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
-#endif
+                if (!context->stripes[reconstruct_stripe].Irp) {
+                    ERR("IoMakeAssociatedIrp failed\n");
+                    return STATUS_INSUFFICIENT_RESOURCES;
                 }
-                
-                pos += ci->stripe_length;
             }
             
-            stripe = (stripe + 1) % (ci->num_stripes / ci->sub_stripes);
-        }
-        
-        if (is_tree) {
-            tree_header* th = (tree_header*)buf;
-            UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
+            IrpSp = IoGetNextIrpStackLocation(context->stripes[reconstruct_stripe].Irp);
+            IrpSp->MajorFunction = IRP_MJ_READ;
             
-            if (addr != th->address || crc32 != *((UINT32*)th->csum)) {
-                WARN("crc32 was %08x, expected %08x\n", crc32, *((UINT32*)th->csum));
-                checksum_error = TRUE;
-                stripes[startoffstripe]->status = ReadDataStatus_CRCError;
+            if (devices[reconstruct_stripe]->devobj->Flags & DO_BUFFERED_IO) {
+                FIXME("FIXME - buffered IO\n");
+            } else if (devices[reconstruct_stripe]->devobj->Flags & DO_DIRECT_IO) {
+                context->stripes[reconstruct_stripe].Irp->MdlAddress = IoAllocateMdl(context->stripes[reconstruct_stripe].buf,
+                                                                                        stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe], FALSE, FALSE, NULL);
+                if (!context->stripes[reconstruct_stripe].Irp->MdlAddress) {
+                    ERR("IoAllocateMdl failed\n");
+                    return STATUS_INSUFFICIENT_RESOURCES;
+                }
+                
+                MmProbeAndLockPages(context->stripes[reconstruct_stripe].Irp->MdlAddress, KernelMode, IoWriteAccess);
+            } else {
+                context->stripes[reconstruct_stripe].Irp->UserBuffer = context->stripes[reconstruct_stripe].buf;
             }
-        }
-        
-        if (checksum_error) {
-            // FIXME - update dev stats
-            
-            WARN("checksum error\n");
+
+            IrpSp->Parameters.Read.Length = stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe];
+            IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[reconstruct_stripe] + cis[reconstruct_stripe].offset;
             
-            context->stripes_left = 0;
+            context->stripes[reconstruct_stripe].Irp->UserIosb = &context->stripes[reconstruct_stripe].iosb;
             
-            for (i = 0; i < ci->num_stripes; i++) {
-                if (context->stripes[i].status == ReadDataStatus_CRCError) {
-                    UINT16 other_stripe = (i % 1) ? (i - 1) : (i + 1);
-                    
-                    if (context->stripes[other_stripe].status == ReadDataStatus_Cancelled) {
-                        PIO_STACK_LOCATION IrpSp;
-                        
-                        // re-run Irp that we cancelled
-                        
-                        if (context->stripes[other_stripe].Irp) {
-                            if (devices[other_stripe]->devobj->Flags & DO_DIRECT_IO) {
-                                MmUnlockPages(context->stripes[other_stripe].Irp->MdlAddress);
-                                IoFreeMdl(context->stripes[other_stripe].Irp->MdlAddress);
-                            }
-                            IoFreeIrp(context->stripes[other_stripe].Irp);
-                        }
-                        
-                        if (!Irp) {
-                            context->stripes[other_stripe].Irp = IoAllocateIrp(devices[other_stripe]->devobj->StackSize, FALSE);
-                            
-                            if (!context->stripes[other_stripe].Irp) {
-                                ERR("IoAllocateIrp failed\n");
-                                Status = STATUS_INSUFFICIENT_RESOURCES;
-                                goto exit;
-                            }
-                        } else {
-                            context->stripes[other_stripe].Irp = IoMakeAssociatedIrp(Irp, devices[other_stripe]->devobj->StackSize);
-                            
-                            if (!context->stripes[other_stripe].Irp) {
-                                ERR("IoMakeAssociatedIrp failed\n");
-                                Status = STATUS_INSUFFICIENT_RESOURCES;
-                                goto exit;
-                            }
-                        }
-                        
-                        IrpSp = IoGetNextIrpStackLocation(context->stripes[other_stripe].Irp);
-                        IrpSp->MajorFunction = IRP_MJ_READ;
-                        
-                        if (devices[other_stripe]->devobj->Flags & DO_BUFFERED_IO) {
-                            FIXME("FIXME - buffered IO\n");
-                        } else if (devices[other_stripe]->devobj->Flags & DO_DIRECT_IO) {
-                            context->stripes[other_stripe].Irp->MdlAddress = IoAllocateMdl(context->stripes[other_stripe].buf, stripeend[other_stripe] - stripestart[other_stripe], FALSE, FALSE, NULL);
-                            if (!context->stripes[other_stripe].Irp->MdlAddress) {
-                                ERR("IoAllocateMdl failed\n");
-                                Status = STATUS_INSUFFICIENT_RESOURCES;
-                                goto exit;
-                            }
-                            
-                            MmProbeAndLockPages(context->stripes[other_stripe].Irp->MdlAddress, KernelMode, IoWriteAccess);
-                        } else {
-                            context->stripes[other_stripe].Irp->UserBuffer = context->stripes[other_stripe].buf;
-                        }
+            IoSetCompletionRoutine(context->stripes[reconstruct_stripe].Irp, read_data_completion, &context->stripes[reconstruct_stripe], TRUE, TRUE, TRUE);
 
-                        IrpSp->Parameters.Read.Length = stripeend[other_stripe] - stripestart[other_stripe];
-                        IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[other_stripe] + cis[other_stripe].offset;
-                        
-                        context->stripes[other_stripe].Irp->UserIosb = &context->stripes[other_stripe].iosb;
-                        
-                        IoSetCompletionRoutine(context->stripes[other_stripe].Irp, read_data_completion, &context->stripes[other_stripe], TRUE, TRUE, TRUE);
-                        
-                        context->stripes_left++;
-                        context->stripes[other_stripe].status = ReadDataStatus_Pending;
-                    }
-                }
-            }
-            
-            if (context->stripes_left == 0) {
-                WARN("could not recover from checksum error\n");
-                ExFreePool(stripes);
-                ExFreePool(stripeoff);
-                Status = STATUS_CRC_ERROR;
-                goto exit;
-            }
+            context->stripes[reconstruct_stripe].status = ReadDataStatus_Pending;
+        }
             
-            context->stripes_cancel = 0;
+        if (needs_reconstruct > 0) {
+#ifdef DEBUG_STATS
+            LARGE_INTEGER time1, time2;
+#endif
+            context->stripes_left = needs_reconstruct;
             KeClearEvent(&context->Event);
             
 #ifdef DEBUG_STATS
-            if (!is_tree)
+            if (!context->tree)
                 time1 = KeQueryPerformanceCounter(NULL);
 #endif
-
-            for (i = 0; i < ci->num_stripes; i++) {
-                if (context->stripes[i].status == ReadDataStatus_Pending) {
-                    IoCallDriver(devices[i]->devobj, context->stripes[i].Irp);
-                }
+            
+            for (i = 0; i < needs_reconstruct; i++) {
+                UINT16 reconstruct_stripe = i == 0 ? missing1 : missing2;
+                
+                IoCallDriver(devices[reconstruct_stripe]->devobj, context->stripes[reconstruct_stripe].Irp);
             }
             
             KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
             
 #ifdef DEBUG_STATS
-            if (!is_tree) {
+            if (!context->tree) {
                 time2 = KeQueryPerformanceCounter(NULL);
                 
                 Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
             }
 #endif
 
-            for (i = 0; i < ci->num_stripes; i++) {
-                if (context->stripes[i].status == ReadDataStatus_CRCError) {
-                    UINT16 other_stripe = (i % 1) ? (i - 1) : (i + 1);
-                    
-                    if (context->stripes[other_stripe].status != ReadDataStatus_Success) {
-                        WARN("could not recover from checksum error\n");
-                        ExFreePool(stripes);
-                        ExFreePool(stripeoff);
-                        Status = STATUS_CRC_ERROR;
-                        goto exit;
-                    }
+            for (i = 0; i < needs_reconstruct; i++) {
+                UINT16 reconstruct_stripe = i == 0 ? missing1 : missing2;
+                
+                if (context->stripes[reconstruct_stripe].status != ReadDataStatus_Success) {
+                    ERR("unrecoverable checksum error\n");
+                    return STATUS_CRC_ERROR;
                 }
             }
-
-            RtlZeroMemory(stripeoff, sizeof(UINT32) * ci->num_stripes / ci->sub_stripes);
+        }
+        
+        off = origoff;
         
+        if (context->tree) {
             pos = 0;
-            stripe = startoffstripe / ci->sub_stripes;
-            while (pos < length) {
-                if (pos == 0) {
-                    UINT32 readlen = min(stripeend[stripe * ci->sub_stripes] - stripestart[stripe * ci->sub_stripes], ci->stripe_length - (stripestart[stripe * ci->sub_stripes] % ci->stripe_length));
-                    
-                    stripeoff[stripe] += readlen;
-                    pos += readlen;
-                    
-                    if (context->csum && stripes[stripe]->status == ReadDataStatus_CRCError) {
-                        for (i = 0; i < readlen / Vcb->superblock.sector_size; i++) {
-                            UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                        
-                            if (crc32 != csum[i]) {
-                                UINT16 other_stripe = (stripe * ci->sub_stripes) + (context->stripes[stripe * ci->sub_stripes].status == ReadDataStatus_CRCError ? 1 : 0);
-                                UINT32 crc32b = ~calc_crc32c(0xffffffff, context->stripes[other_stripe].buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                                
-                                if (crc32b == csum[i]) {
-                                    RtlCopyMemory(buf + (i * Vcb->superblock.sector_size), context->stripes[other_stripe].buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                                    RtlCopyMemory(stripes[stripe]->buf + (i * Vcb->superblock.sector_size), context->stripes[other_stripe].buf + (i * Vcb->superblock.sector_size),
-                                                  Vcb->superblock.sector_size);
-                                    stripes[stripe]->rewrite = TRUE;
-                                } else {
-                                    WARN("could not recover from checksum error\n");
-                                    ExFreePool(stripes);
-                                    ExFreePool(stripeoff);
-                                    Status = STATUS_CRC_ERROR;
-                                    goto exit;
-                                }
-                            }
-                        }
-                    } else if (is_tree) {
-                        UINT16 other_stripe = (stripe * ci->sub_stripes) + (context->stripes[stripe * ci->sub_stripes].status == ReadDataStatus_CRCError ? 1 : 0);
-                        tree_header* th = (tree_header*)buf;
-                        UINT32 crc32;
-                        
-                        RtlCopyMemory(buf, context->stripes[other_stripe].buf, readlen);
-                        
-                        crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
-                        
-                        if (addr != th->address || crc32 != *((UINT32*)th->csum)) {
-                            WARN("could not recover from checksum error\n");
-                            ExFreePool(stripes);
-                            ExFreePool(stripeoff);
-                            Status = STATUS_CRC_ERROR;
-                            goto exit;
-                        }
-                        
-                        RtlCopyMemory(stripes[stripe]->buf, buf, readlen);
-                        stripes[stripe]->rewrite = TRUE;
-                    }
-                } else if (length - pos < ci->stripe_length) {
-                    if (context->csum && stripes[stripe]->status == ReadDataStatus_CRCError) {
-                        for (i = 0; i < (length - pos) / Vcb->superblock.sector_size; i++) {
-                            UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + pos + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                        
-                            if (crc32 != csum[(pos / Vcb->superblock.sector_size) + i]) {
-                                UINT16 other_stripe = (stripe * ci->sub_stripes) + (context->stripes[stripe * ci->sub_stripes].status == ReadDataStatus_CRCError ? 1 : 0);
-                                UINT32 crc32b = ~calc_crc32c(0xffffffff, &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
-                                                                Vcb->superblock.sector_size);
-                                
-                                if (crc32b == csum[i]) {
-                                    RtlCopyMemory(buf + pos + (i * Vcb->superblock.sector_size),
-                                                    &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size);
-                                    RtlCopyMemory(&stripes[stripe]->buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
-                                                  &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
-                                                  Vcb->superblock.sector_size);
-                                    stripes[stripe]->rewrite = TRUE;
-                                } else {
-                                    WARN("could not recover from checksum error\n");
-                                    ExFreePool(stripes);
-                                    ExFreePool(stripeoff);
-                                    Status = STATUS_CRC_ERROR;
-                                    goto exit;
-                                }
-                            }
-                        }
-                    }
-                    
-                    pos = length;
-                } else {
-                    if (context->csum && stripes[stripe]->status == ReadDataStatus_CRCError) {
-                        for (i = 0; i < ci->stripe_length / Vcb->superblock.sector_size; i++) {
-                            UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + pos + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                        
-                            if (crc32 != csum[(pos / Vcb->superblock.sector_size) + i]) {
-                                UINT16 other_stripe = (stripe * ci->sub_stripes) + (context->stripes[stripe * ci->sub_stripes].status == ReadDataStatus_CRCError ? 1 : 0);
-                                UINT32 crc32b = ~calc_crc32c(0xffffffff, &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
-                                                                Vcb->superblock.sector_size);
-                                
-                                if (crc32b == csum[i]) {
-                                    RtlCopyMemory(buf + pos + (i * Vcb->superblock.sector_size),
-                                                    &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size);
-                                    RtlCopyMemory(&stripes[stripe]->buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
-                                                  &context->stripes[other_stripe].buf[stripeoff[stripe] + (i * Vcb->superblock.sector_size)],
-                                                  Vcb->superblock.sector_size);
-                                    stripes[stripe]->rewrite = TRUE;
-                                } else {
-                                    WARN("could not recover from checksum error\n");
-                                    ExFreePool(stripes);
-                                    ExFreePool(stripeoff);
-                                    Status = STATUS_CRC_ERROR;
-                                    goto exit;
-                                }
-                            }
-                        }
-                    }
-                    
-                    stripeoff[stripe] += ci->stripe_length;
-                    pos += ci->stripe_length;
-                }
-                
-                stripe = (stripe + 1) % (ci->num_stripes / ci->sub_stripes);
+            stripeoff = 0;
+            if (!raid6_decode_with_checksum_metadata(addr, off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize, Vcb->superblock.node_size)) {
+                ERR("unrecoverable metadata checksum error\n");
+                return STATUS_CRC_ERROR;
             }
+        } else {
+            pos = 0;
+            stripeoff = 0;
+            if (!raid6_decode_with_checksum(off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize, context->csum, Vcb->superblock.sector_size))
+                return STATUS_CRC_ERROR;
             
-            // write good data over bad
-            
-            if (!Vcb->readonly) {
-                for (i = 0; i < ci->num_stripes; i++) {
-                    if (context->stripes[i].rewrite && devices[i] && !devices[i]->readonly) {
-                        Status = write_data_phys(devices[i]->devobj, cis[i].offset + stripestart[i], context->stripes[i].buf, stripeend[i] - stripestart[i]);
-                        
-                        if (!NT_SUCCESS(Status))
-                            WARN("write_data_phys returned %08x\n", Status);
-                    }
-                }
+            while (pos < length) {
+                off += (ci->num_stripes - 1) * ci->stripe_length;
+                if (!raid6_decode_with_checksum(off, 0, context, ci, &stripeoff, buf, &pos, length, 0, context->csum, Vcb->superblock.sector_size))
+                    return STATUS_CRC_ERROR;
             }
         }
-        
-        ExFreePool(stripes);
-        ExFreePool(stripeoff);
-        
-        // FIXME - handle the case where one of the stripes doesn't read everything, i.e. Irp->IoStatus.Information is short
-        
-        Status = STATUS_SUCCESS;
-    } else if (type == BLOCK_FLAG_DUPLICATE) {
-        BOOL checksum_error = FALSE;
-        UINT16 cancelled = 0;
+    }
+    
+    // write good data over bad
+    
+    if (!Vcb->readonly) {
+        CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
         
         for (i = 0; i < ci->num_stripes; i++) {
-            if (context->stripes[i].status == ReadDataStatus_Success) {
-                if (context->tree) {
-                    tree_header* th = (tree_header*)context->stripes[i].buf;
-                    UINT32 crc32;
-                    
-                    crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, context->buflen - sizeof(th->csum));
-                    
-                    if (th->address != context->address || crc32 != *((UINT32*)th->csum)) {
-                        context->stripes[i].status = ReadDataStatus_CRCError;
-                        checksum_error = TRUE;
-                    }
-                } else if (context->csum) {
-                    UINT32 j;
-                    
-#ifdef DEBUG_STATS
-                    time1 = KeQueryPerformanceCounter(NULL);
-#endif
-        
-                    for (j = 0; j < context->stripes[i].Irp->IoStatus.Information / context->sector_size; j++) {
-                        UINT32 crc32 = ~calc_crc32c(0xffffffff, context->stripes[i].buf + (j * context->sector_size), context->sector_size);
-                        
-                        if (crc32 != context->csum[j]) {
-                            context->stripes[i].status = ReadDataStatus_CRCError;
-                            checksum_error = TRUE;
-                            break;
-                        }
-                    }
-#ifdef DEBUG_STATS
-                    time2 = KeQueryPerformanceCounter(NULL);
-                    
-                    Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
-#endif
-                }
-            } else if (context->stripes[i].status == ReadDataStatus_Cancelled) {
-                cancelled++;
+            if (context->stripes[i].rewrite && devices[i] && !devices[i]->readonly) {
+                Status = write_data_phys(devices[i]->devobj, cis[i].offset + stripestart[i], context->stripes[i].buf, stripeend[i] - stripestart[i]);
+                
+                if (!NT_SUCCESS(Status))
+                    WARN("write_data_phys returned %08x\n", Status);
             }
         }
-        
-        if (checksum_error) {
-            // FIXME - update dev stats
-            
-            if (cancelled > 0) {
-                context->stripes_left = 0;
-                
-                for (i = 0; i < ci->num_stripes; i++) {
-                    if (context->stripes[i].status == ReadDataStatus_Cancelled) {
-                        PIO_STACK_LOCATION IrpSp;
-                        
-                        // re-run Irp that we cancelled
-                        
-                        if (context->stripes[i].Irp) {
-                            if (devices[i]->devobj->Flags & DO_DIRECT_IO) {
-                                MmUnlockPages(context->stripes[i].Irp->MdlAddress);
-                                IoFreeMdl(context->stripes[i].Irp->MdlAddress);
-                            }
-                            IoFreeIrp(context->stripes[i].Irp);
-                        }
-                        
-                        if (!Irp) {
-                            context->stripes[i].Irp = IoAllocateIrp(devices[i]->devobj->StackSize, FALSE);
-                            
-                            if (!context->stripes[i].Irp) {
-                                ERR("IoAllocateIrp failed\n");
-                                Status = STATUS_INSUFFICIENT_RESOURCES;
-                                goto exit;
-                            }
-                        } else {
-                            context->stripes[i].Irp = IoMakeAssociatedIrp(Irp, devices[i]->devobj->StackSize);
-                            
-                            if (!context->stripes[i].Irp) {
-                                ERR("IoMakeAssociatedIrp failed\n");
-                                Status = STATUS_INSUFFICIENT_RESOURCES;
-                                goto exit;
-                            }
-                        }
-                        
-                        IrpSp = IoGetNextIrpStackLocation(context->stripes[i].Irp);
-                        IrpSp->MajorFunction = IRP_MJ_READ;
-                        
-                        if (devices[i]->devobj->Flags & DO_BUFFERED_IO) {
-                            FIXME("FIXME - buffered IO\n");
-                        } else if (devices[i]->devobj->Flags & DO_DIRECT_IO) {
-                            context->stripes[i].Irp->MdlAddress = IoAllocateMdl(context->stripes[i].buf, stripeend[i] - stripestart[i], FALSE, FALSE, NULL);
-                            if (!context->stripes[i].Irp->MdlAddress) {
-                                ERR("IoAllocateMdl failed\n");
-                                Status = STATUS_INSUFFICIENT_RESOURCES;
-                                goto exit;
-                            }
-                            
-                            MmProbeAndLockPages(context->stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
-                        } else {
-                            context->stripes[i].Irp->UserBuffer = context->stripes[i].buf;
-                        }
-
-                        IrpSp->Parameters.Read.Length = stripeend[i] - stripestart[i];
-                        IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[i] + cis[i].offset;
-                        
-                        context->stripes[i].Irp->UserIosb = &context->stripes[i].iosb;
-                        
-                        IoSetCompletionRoutine(context->stripes[i].Irp, read_data_completion, &context->stripes[i], TRUE, TRUE, TRUE);
-                        
-                        context->stripes_left++;
-                        context->stripes[i].status = ReadDataStatus_Pending;
-                    }
-                }
-                
-                context->stripes_cancel = 0;
-                KeClearEvent(&context->Event);
-                
-#ifdef DEBUG_STATS
-                if (!is_tree)
-                    time1 = KeQueryPerformanceCounter(NULL);
-#endif
+    }
+    
+    return STATUS_SUCCESS;
+}
 
-                for (i = 0; i < ci->num_stripes; i++) {
-                    if (context->stripes[i].status == ReadDataStatus_Pending) {
-                        IoCallDriver(devices[i]->devobj, context->stripes[i].Irp);
-                    }
-                }
-                
-                KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
-                
-#ifdef DEBUG_STATS
-                if (!is_tree) {
-                    time2 = KeQueryPerformanceCounter(NULL);
-                    
-                    Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
-                }
-#endif
-                for (i = 0; i < ci->num_stripes; i++) {
-                    if (context->stripes[i].status == ReadDataStatus_Success) {
-                        if (context->tree) {
-                            tree_header* th = (tree_header*)context->stripes[i].buf;
-                            UINT32 crc32;
-                            
-                            crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, context->buflen - sizeof(th->csum));
-                            
-                            if (th->address != context->address || crc32 != *((UINT32*)th->csum)) {
-                                context->stripes[i].status = ReadDataStatus_CRCError;
-                                checksum_error = TRUE;
-                            }
-                        } else if (context->csum) {
-                            UINT32 j;
-                            
-#ifdef DEBUG_STATS
-                            time1 = KeQueryPerformanceCounter(NULL);
-#endif
-                            for (j = 0; j < context->stripes[i].Irp->IoStatus.Information / context->sector_size; j++) {
-                                UINT32 crc32 = ~calc_crc32c(0xffffffff, context->stripes[i].buf + (j * context->sector_size), context->sector_size);
-                                
-                                if (crc32 != context->csum[j]) {
-                                    context->stripes[i].status = ReadDataStatus_CRCError;
-                                    checksum_error = TRUE;
-                                    break;
-                                }
-                            }
-#ifdef DEBUG_STATS
-                            time2 = KeQueryPerformanceCounter(NULL);
-                            
-                            Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
-#endif
-                        }
-                    }
-                }
-            }
-            
-            for (i = 0; i < ci->num_stripes; i++) {
-                if (context->stripes[i].status == ReadDataStatus_Success) {
-                    RtlCopyMemory(buf, context->stripes[i].buf, length);
-                    goto raid1write;
-                }
-            }
-            
-            if (context->tree || ci->num_stripes == 1) { // unable to recover from checksum error
-                ERR("unrecoverable checksum error at %llx\n", addr);
-                
-#ifdef _DEBUG
-                if (context->tree) {
-                    for (i = 0; i < ci->num_stripes; i++) {
-                        if (context->stripes[i].status == ReadDataStatus_CRCError) {
-                            tree_header* th = (tree_header*)context->stripes[i].buf;
-                            UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, context->buflen - sizeof(th->csum));
-                            
-                            WARN("crc32 was %08x, expected %08x\n", crc32, *((UINT32*)th->csum));
-                        }
-                    }
-                }
-#endif
-                Status = STATUS_CRC_ERROR;
-                goto exit;
-            }
-            
-            // checksum errors on both stripes - we need to check sector by sector
-            
-            for (i = 0; i < (stripeend[0] - stripestart[0]) / context->sector_size; i++) {
-                UINT16 j;
-                BOOL success = FALSE;
-                
-#ifdef DEBUG_STATS
-                time1 = KeQueryPerformanceCounter(NULL);
-#endif
-                
-                for (j = 0; j < ci->num_stripes; j++) {
-                    if (context->stripes[j].status == ReadDataStatus_CRCError) {
-                        UINT32 crc32 = ~calc_crc32c(0xffffffff, context->stripes[j].buf + (i * context->sector_size), context->sector_size);
-                        
-                        if (crc32 == context->csum[i]) {
-                            RtlCopyMemory(buf + (i * context->sector_size), context->stripes[j].buf + (i * context->sector_size), context->sector_size);
-                            success = TRUE;
-                            break;
-                        }
-                    }
-                }
-                
+NTSTATUS STDCALL read_data(device_extension* Vcb, UINT64 addr, UINT32 length, UINT32* csum, BOOL is_tree, UINT8* buf, chunk* c, chunk** pc,
+                           PIRP Irp, BOOL check_nocsum_parity) {
+    CHUNK_ITEM* ci;
+    CHUNK_ITEM_STRIPE* cis;
+    read_data_context* context;
+    UINT64 i, type, offset;
+    NTSTATUS Status;
+    device** devices;
+    UINT64 *stripestart = NULL, *stripeend = NULL;
+    UINT32 firststripesize;
+    UINT16 startoffstripe, allowed_missing, missing_devices = 0;
 #ifdef DEBUG_STATS
-                time2 = KeQueryPerformanceCounter(NULL);
-
-                Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
+    LARGE_INTEGER time1, time2;
 #endif
-                if (!success) {
-                    ERR("unrecoverable checksum error at %llx\n", addr + (i * context->sector_size));
-                    Status = STATUS_CRC_ERROR;
-                    goto exit;
-                }
-            }
-            
-raid1write:
-            // write good data over bad
-            
-            if (!Vcb->readonly) {
-                for (i = 0; i < ci->num_stripes; i++) {
-                    if (context->stripes[i].status == ReadDataStatus_CRCError && devices[i] && !devices[i]->readonly) {
-                        Status = write_data_phys(devices[i]->devobj, cis[i].offset + stripestart[i], buf, length);
-                        
-                        if (!NT_SUCCESS(Status))
-                            WARN("write_data_phys returned %08x\n", Status);
-                    }
-                }
-            }
-            
-            Status = STATUS_SUCCESS;
-            goto exit;
-        }
-        
-        // check if any of the stripes succeeded
-        
-        for (i = 0; i < ci->num_stripes; i++) {
-            if (context->stripes[i].status == ReadDataStatus_Success) {
-                RtlCopyMemory(buf, context->stripes[i].buf, length);
-                Status = STATUS_SUCCESS;
-                goto exit;
-            }
-        }
-        
-        // failing that, return the first error we encountered
-        
-        for (i = 0; i < ci->num_stripes; i++) {
-            if (context->stripes[i].status == ReadDataStatus_Error) {
-                Status = context->stripes[i].iosb.Status;
-                goto exit;
-            }
-        }
-        
-        // if we somehow get here, return STATUS_INTERNAL_ERROR
-        
-        Status = STATUS_INTERNAL_ERROR;
-    } else if (type == BLOCK_FLAG_RAID5) {
-        UINT32 pos, skip;
-        int num_errors = 0;
-        UINT64 off, stripeoff, origoff;
-        BOOL needs_reconstruct = FALSE;
-        UINT64 reconstruct_stripe;
-        BOOL checksum_error = FALSE;
-        
-        for (i = 0; i < ci->num_stripes; i++) {
-            if (context->stripes[i].status == ReadDataStatus_Error) {
-                num_errors++;
-                if (num_errors > 1)
-                    break;
-            }
-        }
-        
-        if (num_errors > 1) {
-            for (i = 0; i < ci->num_stripes; i++) {
-                if (context->stripes[i].status == ReadDataStatus_Error) {
-                    WARN("stripe %llu returned error %08x\n", i, context->stripes[i].iosb.Status); 
-                    Status = context->stripes[i].iosb.Status;
-                    goto exit;
-                }
-            }
-        }
-        
-        off = addr - offset;
-        off -= off % ((ci->num_stripes - 1) * ci->stripe_length);
-        skip = addr - offset - off;
-        origoff = off;
-        
-        for (i = 0; i < ci->num_stripes; i++) {
-            if (context->stripes[i].status == ReadDataStatus_Cancelled) {
-                if (needs_reconstruct) {
-                    ERR("more than one stripe needs reconstruction\n");
-                    Status = STATUS_INTERNAL_ERROR;
-                    goto exit;
-                } else {
-                    needs_reconstruct = TRUE;
-                    reconstruct_stripe = i;
-                }
-            }
-        }
-        
-        if (needs_reconstruct) {
-            TRACE("reconstructing stripe %u\n", reconstruct_stripe);
-            
-            stripeoff = 0;
-            
-            raid5_reconstruct(off, skip, context, ci, &stripeoff, stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe], TRUE, firststripesize, reconstruct_stripe);
+    
+    if (Vcb->log_to_phys_loaded) {
+        if (!c) {
+            c = get_chunk_from_address(Vcb, addr);
             
-            while (stripeoff < stripeend[0] - stripestart[0]) {
-                off += (ci->num_stripes - 1) * ci->stripe_length;
-                raid5_reconstruct(off, 0, context, ci, &stripeoff, stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe], FALSE, 0, reconstruct_stripe);
+            if (!c) {
+                ERR("get_chunk_from_address failed\n");
+                return STATUS_INTERNAL_ERROR;
             }
-            
-            off = addr - offset;
-            off -= off % ((ci->num_stripes - 1) * ci->stripe_length);
         }
         
-        pos = 0;
-        stripeoff = 0;
-        raid5_decode(off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize);
+        ci = c->chunk_item;
+        offset = c->offset;
+        devices = c->devices;
+           
+        if (pc)
+            *pc = c;
+    } else {
+        LIST_ENTRY* le = Vcb->sys_chunks.Flink;
         
-        while (pos < length) {
-            off += (ci->num_stripes - 1) * ci->stripe_length;
-            raid5_decode(off, 0, context, ci, &stripeoff, buf, &pos, length, 0);
-        }
+        ci = NULL;
         
-        if (is_tree) {
-            tree_header* th = (tree_header*)buf;
-            UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
-            
-            if (addr != th->address || crc32 != *((UINT32*)th->csum))
-                checksum_error = TRUE;
-        } else if (csum) {
-#ifdef DEBUG_STATS
-            time1 = KeQueryPerformanceCounter(NULL);
-#endif
-            for (i = 0; i < length / Vcb->superblock.sector_size; i++) {
-                UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                
-                if (crc32 != csum[i]) {
-                    checksum_error = TRUE;
-                    break;
-                }
-            }
-#ifdef DEBUG_STATS
-            time2 = KeQueryPerformanceCounter(NULL);
+        while (le != &Vcb->sys_chunks) {
+            sys_chunk* sc = CONTAINING_RECORD(le, sys_chunk, list_entry);
             
-            Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
-#endif
-        }
-        
-        if (checksum_error) {
-            if (needs_reconstruct) {
-                PIO_STACK_LOCATION IrpSp;
-                
-                // re-run Irp that we cancelled
-                
-                if (context->stripes[reconstruct_stripe].Irp) {
-                    if (devices[reconstruct_stripe]->devobj->Flags & DO_DIRECT_IO) {
-                        MmUnlockPages(context->stripes[reconstruct_stripe].Irp->MdlAddress);
-                        IoFreeMdl(context->stripes[reconstruct_stripe].Irp->MdlAddress);
-                    }
-                    IoFreeIrp(context->stripes[reconstruct_stripe].Irp);
-                }
+            if (sc->key.obj_id == 0x100 && sc->key.obj_type == TYPE_CHUNK_ITEM && sc->key.offset <= addr) {
+                CHUNK_ITEM* chunk_item = sc->data;
                 
-                if (!Irp) {
-                    context->stripes[reconstruct_stripe].Irp = IoAllocateIrp(devices[reconstruct_stripe]->devobj->StackSize, FALSE);
+                if ((addr - sc->key.offset) < chunk_item->size && chunk_item->num_stripes > 0) {
+                    ci = chunk_item;
+                    offset = sc->key.offset;
+                    cis = (CHUNK_ITEM_STRIPE*)&chunk_item[1];
                     
-                    if (!context->stripes[reconstruct_stripe].Irp) {
-                        ERR("IoAllocateIrp failed\n");
-                        Status = STATUS_INSUFFICIENT_RESOURCES;
-                        goto exit;
+                    devices = ExAllocatePoolWithTag(PagedPool, sizeof(device*) * ci->num_stripes, ALLOC_TAG);
+                    if (!devices) {
+                        ERR("out of memory\n");
+                        return STATUS_INSUFFICIENT_RESOURCES;
                     }
-                } else {
-                    context->stripes[reconstruct_stripe].Irp = IoMakeAssociatedIrp(Irp, devices[reconstruct_stripe]->devobj->StackSize);
                     
-                    if (!context->stripes[reconstruct_stripe].Irp) {
-                        ERR("IoMakeAssociatedIrp failed\n");
-                        Status = STATUS_INSUFFICIENT_RESOURCES;
-                        goto exit;
-                    }
-                }
-                
-                IrpSp = IoGetNextIrpStackLocation(context->stripes[reconstruct_stripe].Irp);
-                IrpSp->MajorFunction = IRP_MJ_READ;
-                
-                if (devices[reconstruct_stripe]->devobj->Flags & DO_BUFFERED_IO) {
-                    FIXME("FIXME - buffered IO\n");
-                } else if (devices[reconstruct_stripe]->devobj->Flags & DO_DIRECT_IO) {
-                    context->stripes[reconstruct_stripe].Irp->MdlAddress = IoAllocateMdl(context->stripes[reconstruct_stripe].buf,
-                                                                                         stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe], FALSE, FALSE, NULL);
-                    if (!context->stripes[reconstruct_stripe].Irp->MdlAddress) {
-                        ERR("IoAllocateMdl failed\n");
-                        Status = STATUS_INSUFFICIENT_RESOURCES;
-                        goto exit;
+                    for (i = 0; i < ci->num_stripes; i++) {
+                        devices[i] = find_device_from_uuid(Vcb, &cis[i].dev_uuid);
                     }
                     
-                    MmProbeAndLockPages(context->stripes[reconstruct_stripe].Irp->MdlAddress, KernelMode, IoWriteAccess);
-                } else {
-                    context->stripes[reconstruct_stripe].Irp->UserBuffer = context->stripes[reconstruct_stripe].buf;
+                    break;
                 }
+            }
+            
+            le = le->Flink;
+        }
+        
+        if (!ci) {
+            ERR("could not find chunk for %llx in bootstrap\n", addr);
+            return STATUS_INTERNAL_ERROR;
+        }
+        
+        if (pc)
+            *pc = NULL;
+    }
+    
+    if (ci->type & BLOCK_FLAG_DUPLICATE) {
+        type = BLOCK_FLAG_DUPLICATE;
+        allowed_missing = 0;
+    } else if (ci->type & BLOCK_FLAG_RAID0) {
+        type = BLOCK_FLAG_RAID0;
+        allowed_missing = 0;
+    } else if (ci->type & BLOCK_FLAG_RAID1) {
+        type = BLOCK_FLAG_DUPLICATE;
+        allowed_missing = 1;
+    } else if (ci->type & BLOCK_FLAG_RAID10) {
+        type = BLOCK_FLAG_RAID10;
+        allowed_missing = 1;
+    } else if (ci->type & BLOCK_FLAG_RAID5) {
+        type = BLOCK_FLAG_RAID5;
+        allowed_missing = 1;
+    } else if (ci->type & BLOCK_FLAG_RAID6) {
+        type = BLOCK_FLAG_RAID6;
+        allowed_missing = 2;
+    } else { // SINGLE
+        type = BLOCK_FLAG_DUPLICATE;
+        allowed_missing = 0;
+    }
 
-                IrpSp->Parameters.Read.Length = stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe];
-                IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[reconstruct_stripe] + cis[reconstruct_stripe].offset;
-                
-                context->stripes[reconstruct_stripe].Irp->UserIosb = &context->stripes[reconstruct_stripe].iosb;
-                
-                IoSetCompletionRoutine(context->stripes[reconstruct_stripe].Irp, read_data_completion, &context->stripes[reconstruct_stripe], TRUE, TRUE, TRUE);
+    cis = (CHUNK_ITEM_STRIPE*)&ci[1];
 
-                context->stripes[reconstruct_stripe].status = ReadDataStatus_Pending;
-                
-                context->stripes_left = 1;
-                KeClearEvent(&context->Event);
-                
-#ifdef DEBUG_STATS
-                if (!is_tree)
-                    time1 = KeQueryPerformanceCounter(NULL);
-#endif
+    context = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_context), ALLOC_TAG);
+    if (!context) {
+        ERR("out of memory\n");
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
     
-                IoCallDriver(devices[reconstruct_stripe]->devobj, context->stripes[reconstruct_stripe].Irp);
-                
-                KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
-                
-#ifdef DEBUG_STATS
-                if (!is_tree) {
-                    time2 = KeQueryPerformanceCounter(NULL);
-                    
-                    Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
-                }
-#endif
+    RtlZeroMemory(context, sizeof(read_data_context));
+    KeInitializeEvent(&context->Event, NotificationEvent, FALSE);
     
-                if (context->stripes[reconstruct_stripe].status != ReadDataStatus_Success) {
-                    ERR("unrecoverable checksum error\n");
-                    Status = STATUS_CRC_ERROR;
-                    goto exit;
-                }
-            }
-            
-            if (context->tree) {
-                off = origoff;
-                pos = 0;
-                stripeoff = 0;
-                if (!raid5_decode_with_checksum_metadata(addr, off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize, Vcb->superblock.node_size)) {
-                    ERR("unrecoverable metadata checksum error\n");
-                    Status = STATUS_CRC_ERROR;
-                    goto exit;
-                }
+    context->stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe) * ci->num_stripes, ALLOC_TAG);
+    if (!context->stripes) {
+        ERR("out of memory\n");
+        ExFreePool(context);
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+    
+    RtlZeroMemory(context->stripes, sizeof(read_data_stripe) * ci->num_stripes);
+    
+    context->buflen = length;
+    context->num_stripes = ci->num_stripes;
+    context->stripes_left = context->num_stripes;
+    context->sector_size = Vcb->superblock.sector_size;
+    context->csum = csum;
+    context->tree = is_tree;
+    context->type = type;
+    context->check_nocsum_parity = check_nocsum_parity;
+    
+    stripestart = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT64) * ci->num_stripes, ALLOC_TAG);
+    if (!stripestart) {
+        ERR("out of memory\n");
+        ExFreePool(context);
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+    
+    stripeend = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT64) * ci->num_stripes, ALLOC_TAG);
+    if (!stripeend) {
+        ERR("out of memory\n");
+        ExFreePool(stripestart);
+        ExFreePool(context);
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+    
+    if (type == BLOCK_FLAG_RAID0) {
+        UINT64 startoff, endoff;
+        UINT16 endoffstripe;
+        
+        get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes, &startoff, &startoffstripe);
+        get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes, &endoff, &endoffstripe);
+        
+        for (i = 0; i < ci->num_stripes; i++) {
+            if (startoffstripe > i) {
+                stripestart[i] = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
+            } else if (startoffstripe == i) {
+                stripestart[i] = startoff;
             } else {
-                off = origoff;
-                pos = 0;
-                stripeoff = 0;
-                if (!raid5_decode_with_checksum(off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize, csum, Vcb->superblock.sector_size)) {
-                    Status = STATUS_CRC_ERROR;
-                    goto exit;
-                }
-                
-                while (pos < length) {
-                    off += (ci->num_stripes - 1) * ci->stripe_length;
-                    if (!raid5_decode_with_checksum(off, 0, context, ci, &stripeoff, buf, &pos, length, 0, csum, Vcb->superblock.sector_size)) {
-                        Status = STATUS_CRC_ERROR;
-                        goto exit;
-                    }
-                }
+                stripestart[i] = startoff - (startoff % ci->stripe_length);
             }
             
-            // write good data over bad
-            
-            if (!Vcb->readonly) {
-                for (i = 0; i < ci->num_stripes; i++) {
-                    if (context->stripes[i].rewrite && devices[i] && !devices[i]->readonly) {
-                        Status = write_data_phys(devices[i]->devobj, cis[i].offset + stripestart[i], context->stripes[i].buf, stripeend[i] - stripestart[i]);
-                        
-                        if (!NT_SUCCESS(Status))
-                            WARN("write_data_phys returned %08x\n", Status);
-                    }
-                }
+            if (endoffstripe > i) {
+                stripeend[i] = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
+            } else if (endoffstripe == i) {
+                stripeend[i] = endoff + 1;
+            } else {
+                stripeend[i] = endoff - (endoff % ci->stripe_length);
             }
         }
+    } else if (type == BLOCK_FLAG_RAID10) {
+        UINT64 startoff, endoff;
+        UINT16 endoffstripe, j;
         
-        if (!context->tree && !context->csum) {
-            UINT32* parity_buf;
-            
-            // We are reading a nodatacsum extent. Even though there's no checksum, we
-            // can still identify errors by checking if the parity is consistent.
-            
-            parity_buf = ExAllocatePoolWithTag(NonPagedPool, stripeend[0] - stripestart[0], ALLOC_TAG);
-            
-            if (!parity_buf) {
-                ERR("out of memory\n");
-                Status = STATUS_INSUFFICIENT_RESOURCES;
-                goto exit;
+        get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &startoff, &startoffstripe);
+        get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &endoff, &endoffstripe);
+        
+        if ((ci->num_stripes % ci->sub_stripes) != 0) {
+            ERR("chunk %llx: num_stripes %x was not a multiple of sub_stripes %x!\n", offset, ci->num_stripes, ci->sub_stripes);
+            Status = STATUS_INTERNAL_ERROR;
+            goto exit;
+        }
+        
+        context->firstoff = (startoff % ci->stripe_length) / Vcb->superblock.sector_size;
+        context->startoffstripe = startoffstripe;
+        context->sectors_per_stripe = ci->stripe_length / Vcb->superblock.sector_size;
+        
+        startoffstripe *= ci->sub_stripes;
+        endoffstripe *= ci->sub_stripes;
+        
+        for (i = 0; i < ci->num_stripes; i += ci->sub_stripes) {
+            if (startoffstripe > i) {
+                stripestart[i] = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
+            } else if (startoffstripe == i) {
+                stripestart[i] = startoff;
+            } else {
+                stripestart[i] = startoff - (startoff % ci->stripe_length);
             }
             
-            RtlCopyMemory(parity_buf, context->stripes[0].buf, stripeend[0] - stripestart[0]);
-            
-            for (i = 0; i < ci->num_stripes; i++) {
-                do_xor((UINT8*)parity_buf, context->stripes[i].buf, stripeend[0] - stripestart[0]);
+            if (endoffstripe > i) {
+                stripeend[i] = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
+            } else if (endoffstripe == i) {
+                stripeend[i] = endoff + 1;
+            } else {
+                stripeend[i] = endoff - (endoff % ci->stripe_length);
             }
             
-            for (i = 0; i < (stripeend[0] - stripestart[0]) / sizeof(UINT32); i++) {
-                if (parity_buf[i] != 0) {
-                    ERR("parity error on nodatacsum inode\n");
-                    ExFreePool(parity_buf);
-                    Status = STATUS_CRC_ERROR;
-                    goto exit;
-                }
+            for (j = 1; j < ci->sub_stripes; j++) {
+                stripestart[i+j] = stripestart[i];
+                stripeend[i+j] = stripeend[i];
             }
-            
-            ExFreePool(parity_buf);
         }
         
-        Status = STATUS_SUCCESS;
-    } else if (type == BLOCK_FLAG_RAID6) {
-        UINT32 pos, skip;
-        int num_errors = 0;
-        UINT64 off, stripeoff, origoff;
-        UINT8 needs_reconstruct = 0;
-        UINT16 missing1, missing2;
-        BOOL checksum_error = FALSE;
-        
+        context->stripes_cancel = 1;
+    } else if (type == BLOCK_FLAG_DUPLICATE) {
         for (i = 0; i < ci->num_stripes; i++) {
-            if (context->stripes[i].status == ReadDataStatus_Error) {
-                num_errors++;
-                if (num_errors > 2)
-                    break;
-            }
-        }
-        
-        if (num_errors > 2) {
-            for (i = 0; i < ci->num_stripes; i++) {
-                if (context->stripes[i].status == ReadDataStatus_Error) {
-                    WARN("stripe %llu returned error %08x\n", i, context->stripes[i].iosb.Status); 
-                    Status = context->stripes[i].iosb.Status;
-                    goto exit;
-                }
-            }
+            stripestart[i] = addr - offset;
+            stripeend[i] = stripestart[i] + length;
         }
         
-        off = addr - offset;
-        off -= off % ((ci->num_stripes - 2) * ci->stripe_length);
-        skip = addr - offset - off;
-        origoff = off;
+        context->stripes_cancel = ci->num_stripes - 1;
+    } else if (type == BLOCK_FLAG_RAID5) {
+        UINT64 startoff, endoff;
+        UINT16 endoffstripe;
+        UINT64 start = 0xffffffffffffffff, end = 0;
         
-        for (i = 0; i < ci->num_stripes; i++) {
-            if (context->stripes[i].status == ReadDataStatus_Cancelled) {
-                if (needs_reconstruct == 2) {
-                    ERR("more than two stripes need reconstruction\n");
-                    Status = STATUS_INTERNAL_ERROR;
-                    goto exit;
-                } else if (needs_reconstruct == 1) {
-                    needs_reconstruct++;
-                    missing2 = i;
-                } else {
-                    needs_reconstruct++;
-                    missing1 = i;
-                }
-            }
-        }
+        get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 1, &startoff, &startoffstripe);
+        get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 1, &endoff, &endoffstripe);
         
-        if (needs_reconstruct > 0) {
-            stripeoff = 0;
+        for (i = 0; i < ci->num_stripes - 1; i++) {
+            UINT64 ststart, stend;
             
-            if (needs_reconstruct == 2) {
-                TRACE("reconstructing stripes %u and %u\n", missing1, missing2);
+            if (startoffstripe > i) {
+                ststart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
+            } else if (startoffstripe == i) {
+                ststart = startoff;
+            } else {
+                ststart = startoff - (startoff % ci->stripe_length);
+            }
+              
+            if (endoffstripe > i) {
+                stend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
+            } else if (endoffstripe == i) {
+                stend = endoff + 1;
+            } else {
+                stend = endoff - (endoff % ci->stripe_length);
+            }
             
-                raid6_reconstruct2(off, skip, context, ci, &stripeoff, stripeend[missing1] - stripestart[missing1],
-                                   TRUE, firststripesize, missing1, missing2);
-                
-                while (stripeoff < stripeend[0] - stripestart[0]) {
-                    off += (ci->num_stripes - 2) * ci->stripe_length;
-                    raid6_reconstruct2(off, 0, context, ci, &stripeoff, stripeend[missing1] - stripestart[missing1],
-                                       FALSE, 0, missing1, missing2);
+            if (ststart != stend) {
+                if (ststart < start) {
+                    start = ststart;
+                    firststripesize = ci->stripe_length - (ststart % ci->stripe_length);
                 }
-            } else {
-                TRACE("reconstructing stripe %u\n", missing1);
-                
-                raid6_reconstruct1(off, skip, context, ci, &stripeoff, stripeend[missing1] - stripestart[missing1], TRUE, firststripesize, missing1);
                 
-                while (stripeoff < stripeend[0] - stripestart[0]) {
-                    off += (ci->num_stripes - 2) * ci->stripe_length;
-                    raid6_reconstruct1(off, 0, context, ci, &stripeoff, stripeend[missing1] - stripestart[missing1], FALSE, 0, missing1);
-                }
+                if (stend > end)
+                    end = stend;
             }
-            
-            off = origoff;
         }
         
-        if (!context->tree && !context->csum) {
-            UINT8* scratch;
+        for (i = 0; i < ci->num_stripes; i++) {
+            stripestart[i] = start;
+            stripeend[i] = end;
+        }
+        
+        context->stripes_cancel = Vcb->options.raid5_recalculation;
+    } else if (type == BLOCK_FLAG_RAID6) {
+        UINT64 startoff, endoff;
+        UINT16 endoffstripe;
+        UINT64 start = 0xffffffffffffffff, end = 0;
+        
+        get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 2, &startoff, &startoffstripe);
+        get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 2, &endoff, &endoffstripe);
+        
+        for (i = 0; i < ci->num_stripes - 2; i++) {
+            UINT64 ststart, stend;
             
-            scratch = ExAllocatePoolWithTag(NonPagedPool, ci->stripe_length, ALLOC_TAG);
-            if (!scratch) {
-                ERR("out of memory\n");
-                Status = STATUS_INSUFFICIENT_RESOURCES;
-                goto exit;
+            if (startoffstripe > i) {
+                ststart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
+            } else if (startoffstripe == i) {
+                ststart = startoff;
+            } else {
+                ststart = startoff - (startoff % ci->stripe_length);
             }
-            
-            stripeoff = 0;
-            Status = check_raid6_nocsum_parity(off, skip, context, ci, &stripeoff, stripeend[0] - stripestart[0], TRUE, firststripesize, scratch);
-            if (!NT_SUCCESS(Status)) {
-                ERR("check_raid6_nocsum_parity returned %08x\n", Status);
-                ExFreePool(scratch);
-                goto exit;
+              
+            if (endoffstripe > i) {
+                stend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
+            } else if (endoffstripe == i) {
+                stend = endoff + 1;
+            } else {
+                stend = endoff - (endoff % ci->stripe_length);
             }
-                
-            while (stripeoff < stripeend[0] - stripestart[0]) {
-                off += (ci->num_stripes - 2) * ci->stripe_length;
-                Status = check_raid6_nocsum_parity(off, 0, context, ci, &stripeoff, stripeend[0] - stripestart[0], FALSE, 0, scratch);
-                
-                if (!NT_SUCCESS(Status)) {
-                    ERR("check_raid6_nocsum_parity returned %08x\n", Status);
-                    ExFreePool(scratch);
-                    goto exit;
+            
+            if (ststart != stend) {
+                if (ststart < start) {
+                    start = ststart;
+                    firststripesize = ci->stripe_length - (ststart % ci->stripe_length);
                 }
+                
+                if (stend > end)
+                    end = stend;
             }
-            
-            ExFreePool(scratch);
-            
-            off = origoff;
         }
         
-        pos = 0;
-        stripeoff = 0;
-        raid6_decode(off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize);
-        
-        while (pos < length) {
-            off += (ci->num_stripes - 2) * ci->stripe_length;
-            raid6_decode(off, 0, context, ci, &stripeoff, buf, &pos, length, 0);
+        for (i = 0; i < ci->num_stripes; i++) {
+            stripestart[i] = start;
+            stripeend[i] = end;
         }
         
-        if (is_tree) {
-            tree_header* th = (tree_header*)buf;
-            UINT32 crc32 = ~calc_crc32c(0xffffffff, (UINT8*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
-            
-            if (addr != th->address || crc32 != *((UINT32*)th->csum))
-                checksum_error = TRUE;
-        } else if (csum) {
-#ifdef DEBUG_STATS
-            time1 = KeQueryPerformanceCounter(NULL);
-#endif
-            for (i = 0; i < length / Vcb->superblock.sector_size; i++) {
-                UINT32 crc32 = ~calc_crc32c(0xffffffff, buf + (i * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
-                
-                if (crc32 != csum[i]) {
-                    checksum_error = TRUE;
-                    break;
-                }
-            }
-#ifdef DEBUG_STATS
-            time2 = KeQueryPerformanceCounter(NULL);
+        context->stripes_cancel = Vcb->options.raid6_recalculation;
+    }
+    
+    KeInitializeSpinLock(&context->spin_lock);
+    
+    context->address = addr;
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (!devices[i] || stripestart[i] == stripeend[i]) {
+            context->stripes[i].status = ReadDataStatus_MissingDevice;
+            context->stripes[i].buf = NULL;
+            context->stripes_left--;
             
-            Vcb->stats.read_csum_time += time2.QuadPart - time1.QuadPart;
-#endif
+            if (!devices[i])
+                missing_devices++;
         }
-        
-        if (checksum_error) {
-            for (i = 0; i < needs_reconstruct; i++) {
-                PIO_STACK_LOCATION IrpSp;
-                UINT16 reconstruct_stripe = i == 0 ? missing1 : missing2;
-                
-                // re-run Irps that we cancelled
-                
-                if (context->stripes[reconstruct_stripe].Irp) {
-                    if (devices[reconstruct_stripe]->devobj->Flags & DO_DIRECT_IO) {
-                        MmUnlockPages(context->stripes[reconstruct_stripe].Irp->MdlAddress);
-                        IoFreeMdl(context->stripes[reconstruct_stripe].Irp->MdlAddress);
-                    }
-                    IoFreeIrp(context->stripes[reconstruct_stripe].Irp);
-                }
-                
-                if (!Irp) {
-                    context->stripes[reconstruct_stripe].Irp = IoAllocateIrp(devices[reconstruct_stripe]->devobj->StackSize, FALSE);
-                    
-                    if (!context->stripes[reconstruct_stripe].Irp) {
-                        ERR("IoAllocateIrp failed\n");
-                        Status = STATUS_INSUFFICIENT_RESOURCES;
-                        goto exit;
-                    }
-                } else {
-                    context->stripes[reconstruct_stripe].Irp = IoMakeAssociatedIrp(Irp, devices[reconstruct_stripe]->devobj->StackSize);
-                    
-                    if (!context->stripes[reconstruct_stripe].Irp) {
-                        ERR("IoMakeAssociatedIrp failed\n");
-                        Status = STATUS_INSUFFICIENT_RESOURCES;
-                        goto exit;
-                    }
-                }
-                
-                IrpSp = IoGetNextIrpStackLocation(context->stripes[reconstruct_stripe].Irp);
-                IrpSp->MajorFunction = IRP_MJ_READ;
-                
-                if (devices[reconstruct_stripe]->devobj->Flags & DO_BUFFERED_IO) {
-                    FIXME("FIXME - buffered IO\n");
-                } else if (devices[reconstruct_stripe]->devobj->Flags & DO_DIRECT_IO) {
-                    context->stripes[reconstruct_stripe].Irp->MdlAddress = IoAllocateMdl(context->stripes[reconstruct_stripe].buf,
-                                                                                         stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe], FALSE, FALSE, NULL);
-                    if (!context->stripes[reconstruct_stripe].Irp->MdlAddress) {
-                        ERR("IoAllocateMdl failed\n");
-                        Status = STATUS_INSUFFICIENT_RESOURCES;
-                        goto exit;
-                    }
-                    
-                    MmProbeAndLockPages(context->stripes[reconstruct_stripe].Irp->MdlAddress, KernelMode, IoWriteAccess);
-                } else {
-                    context->stripes[reconstruct_stripe].Irp->UserBuffer = context->stripes[reconstruct_stripe].buf;
-                }
-
-                IrpSp->Parameters.Read.Length = stripeend[reconstruct_stripe] - stripestart[reconstruct_stripe];
-                IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[reconstruct_stripe] + cis[reconstruct_stripe].offset;
-                
-                context->stripes[reconstruct_stripe].Irp->UserIosb = &context->stripes[reconstruct_stripe].iosb;
-                
-                IoSetCompletionRoutine(context->stripes[reconstruct_stripe].Irp, read_data_completion, &context->stripes[reconstruct_stripe], TRUE, TRUE, TRUE);
-
-                context->stripes[reconstruct_stripe].status = ReadDataStatus_Pending;
+    }
+      
+    if (missing_devices > allowed_missing) {
+        ERR("not enough devices to service request (%u missing)\n", missing_devices);
+        Status = STATUS_UNEXPECTED_IO_ERROR;
+        goto exit;
+    }
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        PIO_STACK_LOCATION IrpSp;
+        
+        if (devices[i] && stripestart[i] != stripeend[i]) {
+            context->stripes[i].context = (struct read_data_context*)context;
+            context->stripes[i].buf = ExAllocatePoolWithTag(NonPagedPool, stripeend[i] - stripestart[i], ALLOC_TAG);
+            
+            if (!context->stripes[i].buf) {
+                ERR("out of memory\n");
+                Status = STATUS_INSUFFICIENT_RESOURCES;
+                goto exit;
             }
-             
-            if (needs_reconstruct > 0) {
-                context->stripes_left = needs_reconstruct;
-                KeClearEvent(&context->Event);
-                
-#ifdef DEBUG_STATS
-                if (!is_tree)
-                    time1 = KeQueryPerformanceCounter(NULL);
-#endif
+            
+            if (type == BLOCK_FLAG_RAID10) {
+                context->stripes[i].stripenum = i / ci->sub_stripes;
+            }
+
+            if (!Irp) {
+                context->stripes[i].Irp = IoAllocateIrp(devices[i]->devobj->StackSize, FALSE);
                 
-                for (i = 0; i < needs_reconstruct; i++) {
-                    UINT16 reconstruct_stripe = i == 0 ? missing1 : missing2;
-                    
-                    IoCallDriver(devices[reconstruct_stripe]->devobj, context->stripes[reconstruct_stripe].Irp);
+                if (!context->stripes[i].Irp) {
+                    ERR("IoAllocateIrp failed\n");
+                    Status = STATUS_INSUFFICIENT_RESOURCES;
+                    goto exit;
                 }
+            } else {
+                context->stripes[i].Irp = IoMakeAssociatedIrp(Irp, devices[i]->devobj->StackSize);
                 
-                KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
-                
-#ifdef DEBUG_STATS
-                if (!is_tree) {
-                    time2 = KeQueryPerformanceCounter(NULL);
-                    
-                    Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
-                }
-#endif
-    
-                for (i = 0; i < needs_reconstruct; i++) {
-                    UINT16 reconstruct_stripe = i == 0 ? missing1 : missing2;
-                    
-                    if (context->stripes[reconstruct_stripe].status != ReadDataStatus_Success) {
-                        ERR("unrecoverable checksum error\n");
-                        Status = STATUS_CRC_ERROR;
-                        goto exit;
-                    }
+                if (!context->stripes[i].Irp) {
+                    ERR("IoMakeAssociatedIrp failed\n");
+                    Status = STATUS_INSUFFICIENT_RESOURCES;
+                    goto exit;
                 }
             }
             
-            off = origoff;
+            IrpSp = IoGetNextIrpStackLocation(context->stripes[i].Irp);
+            IrpSp->MajorFunction = IRP_MJ_READ;
             
-            if (context->tree) {
-                pos = 0;
-                stripeoff = 0;
-                if (!raid6_decode_with_checksum_metadata(addr, off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize, Vcb->superblock.node_size)) {
-                    ERR("unrecoverable metadata checksum error\n");
-                    Status = STATUS_CRC_ERROR;
-                    goto exit;
-                }
-            } else {
-                pos = 0;
-                stripeoff = 0;
-                if (!raid6_decode_with_checksum(off, skip, context, ci, &stripeoff, buf, &pos, length, firststripesize, csum, Vcb->superblock.sector_size)) {
-                    Status = STATUS_CRC_ERROR;
+            if (devices[i]->devobj->Flags & DO_BUFFERED_IO) {
+                FIXME("FIXME - buffered IO\n");
+            } else if (devices[i]->devobj->Flags & DO_DIRECT_IO) {
+                context->stripes[i].Irp->MdlAddress = IoAllocateMdl(context->stripes[i].buf, stripeend[i] - stripestart[i], FALSE, FALSE, NULL);
+                if (!context->stripes[i].Irp->MdlAddress) {
+                    ERR("IoAllocateMdl failed\n");
+                    Status = STATUS_INSUFFICIENT_RESOURCES;
                     goto exit;
                 }
                 
-                while (pos < length) {
-                    off += (ci->num_stripes - 1) * ci->stripe_length;
-                    if (!raid6_decode_with_checksum(off, 0, context, ci, &stripeoff, buf, &pos, length, 0, csum, Vcb->superblock.sector_size)) {
-                        Status = STATUS_CRC_ERROR;
-                        goto exit;
-                    }
-                }
+                MmProbeAndLockPages(context->stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
+            } else {
+                context->stripes[i].Irp->UserBuffer = context->stripes[i].buf;
             }
+
+            IrpSp->Parameters.Read.Length = stripeend[i] - stripestart[i];
+            IrpSp->Parameters.Read.ByteOffset.QuadPart = stripestart[i] + cis[i].offset;
+            
+            context->stripes[i].Irp->UserIosb = &context->stripes[i].iosb;
+            
+            IoSetCompletionRoutine(context->stripes[i].Irp, read_data_completion, &context->stripes[i], TRUE, TRUE, TRUE);
+
+            context->stripes[i].status = ReadDataStatus_Pending;
         }
+    }
+    
+#ifdef DEBUG_STATS
+    if (!is_tree)
+        time1 = KeQueryPerformanceCounter(NULL);
+#endif
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (context->stripes[i].status != ReadDataStatus_MissingDevice) {
+            IoCallDriver(devices[i]->devobj, context->stripes[i].Irp);
+        }
+    }
+
+    KeWaitForSingleObject(&context->Event, Executive, KernelMode, FALSE, NULL);
+   
+#ifdef DEBUG_STATS
+    if (!is_tree) {
+        time2 = KeQueryPerformanceCounter(NULL);
         
-        // write good data over bad
-        
-        if (!Vcb->readonly) {
-            for (i = 0; i < ci->num_stripes; i++) {
-                if (context->stripes[i].rewrite && devices[i] && !devices[i]->readonly) {
-                    Status = write_data_phys(devices[i]->devobj, cis[i].offset + stripestart[i], context->stripes[i].buf, stripeend[i] - stripestart[i]);
-                    
-                    if (!NT_SUCCESS(Status))
-                        WARN("write_data_phys returned %08x\n", Status);
+        Vcb->stats.read_disk_time += time2.QuadPart - time1.QuadPart;
+    }
+#endif
+    
+    // check if any of the devices return a "user-induced" error
+    
+    for (i = 0; i < ci->num_stripes; i++) {
+        if (context->stripes[i].status == ReadDataStatus_Error && IoIsErrorUserInduced(context->stripes[i].iosb.Status)) {
+            if (Irp && context->stripes[i].iosb.Status == STATUS_VERIFY_REQUIRED) {
+                PDEVICE_OBJECT dev;
+                
+                dev = IoGetDeviceToVerify(Irp->Tail.Overlay.Thread);
+                IoSetDeviceToVerify(Irp->Tail.Overlay.Thread, NULL);
+                
+                if (!dev) {
+                    dev = IoGetDeviceToVerify(PsGetCurrentThread());
+                    IoSetDeviceToVerify(PsGetCurrentThread(), NULL);
                 }
+                
+                dev = Vcb->Vpb ? Vcb->Vpb->RealDevice : NULL;
+                
+                if (dev)
+                    IoVerifyVolume(dev, FALSE);
             }
+//             IoSetHardErrorOrVerifyDevice(context->stripes[i].Irp, devices[i]->devobj);
+            
+            Status = context->stripes[i].iosb.Status;
+            goto exit;
+        }
+    }
+    
+    if (type == BLOCK_FLAG_RAID0) {
+        Status = read_data_raid0(Vcb, buf, addr, length, context, ci, stripestart, stripeend, startoffstripe);
+        if (!NT_SUCCESS(Status)) {
+            ERR("read_data_raid0 returned %08x\n", Status);
+            goto exit;
+        }
+    } else if (type == BLOCK_FLAG_RAID10) {
+        Status = read_data_raid10(Vcb, buf, addr, length, Irp, context, ci, devices, stripestart, stripeend, startoffstripe);
+        if (!NT_SUCCESS(Status)) {
+            ERR("read_data_raid10 returned %08x\n", Status);
+            goto exit;
+        }
+    } else if (type == BLOCK_FLAG_DUPLICATE) {
+        Status = read_data_dup(Vcb, buf, addr, length, Irp, context, ci, devices, stripestart, stripeend);
+        if (!NT_SUCCESS(Status)) {
+            ERR("read_data_dup returned %08x\n", Status);
+            goto exit;
+        }
+    } else if (type == BLOCK_FLAG_RAID5) {
+        Status = read_data_raid5(Vcb, buf, addr, length, Irp, context, ci, devices, stripestart, stripeend, offset, firststripesize, check_nocsum_parity);
+        if (!NT_SUCCESS(Status)) {
+            ERR("read_data_raid5 returned %08x\n", Status);
+            goto exit;
+        }
+    } else if (type == BLOCK_FLAG_RAID6) {
+        Status = read_data_raid6(Vcb, buf, addr, length, Irp, context, ci, devices, stripestart, stripeend, offset, firststripesize, check_nocsum_parity);
+        if (!NT_SUCCESS(Status)) {
+            ERR("read_data_raid6 returned %08x\n", Status);
+            goto exit;
         }
-        
-        Status = STATUS_SUCCESS;
     }
 
 exit:
@@ -2888,155 +2990,7 @@ static NTSTATUS STDCALL read_stream(fcb* fcb, UINT8* data, UINT64 start, ULONG l
     return Status;
 }
 
-static NTSTATUS load_csum_from_disk(device_extension* Vcb, UINT32* csum, UINT64 start, UINT64 length, PIRP Irp) {
-    NTSTATUS Status;
-    KEY searchkey;
-    traverse_ptr tp, next_tp;
-    UINT64 i, j;
-    BOOL b;
-    
-    searchkey.obj_id = EXTENT_CSUM_ID;
-    searchkey.obj_type = TYPE_EXTENT_CSUM;
-    searchkey.offset = start;
-    
-    Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, FALSE, Irp);
-    if (!NT_SUCCESS(Status)) {
-        ERR("error - find_item returned %08x\n", Status);
-        return Status;
-    }
-    
-    i = 0;
-    do {
-        if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
-            ULONG readlen;
-            
-            if (start < tp.item->key.offset)
-                j = 0;
-            else
-                j = ((start - tp.item->key.offset) / Vcb->superblock.sector_size) + i;
-            
-            if (j * sizeof(UINT32) > tp.item->size || tp.item->key.offset > start + (i * Vcb->superblock.sector_size)) {
-                ERR("checksum not found for %llx\n", start + (i * Vcb->superblock.sector_size));
-                return STATUS_INTERNAL_ERROR;
-            }
-            
-            readlen = min((tp.item->size / sizeof(UINT32)) - j, length - i);
-            RtlCopyMemory(&csum[i], tp.item->data + (j * sizeof(UINT32)), readlen * sizeof(UINT32));
-            i += readlen;
-            
-            if (i == length)
-                break;
-        }
-        
-        b = find_next_item(Vcb, &tp, &next_tp, FALSE, Irp);
-        
-        if (b)
-            tp = next_tp;
-    } while (b);
-    
-    if (i < length) {
-        ERR("could not read checksums: offset %llx, length %llx sectors\n", start, length);
-        return STATUS_INTERNAL_ERROR;
-    }
-    
-    return STATUS_SUCCESS;
-}
-
-static NTSTATUS load_csum(device_extension* Vcb, UINT64 start, UINT64 length, UINT32** pcsum, PIRP Irp) {
-    UINT32* csum = NULL;
-    NTSTATUS Status;
-    UINT64 end;
-    RTL_BITMAP bmp;
-    ULONG *bmpbuf = NULL, bmpbuflen, index, runlength;
-    LIST_ENTRY* le;
-    
-    if (length == 0) {
-        *pcsum = NULL;
-        return STATUS_SUCCESS;
-    }
-    
-    bmpbuflen = sector_align(length, sizeof(ULONG) * 8) / 8;
-    bmpbuf = ExAllocatePoolWithTag(PagedPool, bmpbuflen, ALLOC_TAG);
-    if (!bmpbuf) {
-        ERR("out of memory\n");
-        Status = STATUS_INSUFFICIENT_RESOURCES;
-        goto end;
-    }
-    
-    RtlInitializeBitMap(&bmp, bmpbuf, length);
-    RtlClearAllBits(&bmp);
-    
-    csum = ExAllocatePoolWithTag(NonPagedPool, sizeof(UINT32) * length, ALLOC_TAG);
-    if (!csum) {
-        ERR("out of memory\n");
-        Status = STATUS_INSUFFICIENT_RESOURCES;
-        goto end;
-    }
-    
-    ExAcquireResourceSharedLite(&Vcb->checksum_lock, TRUE);
-    
-    end = start + (length * Vcb->superblock.sector_size);
-    
-    le = Vcb->sector_checksums.Flink;
-    while (le != &Vcb->sector_checksums) {
-        changed_sector* cs = (changed_sector*)le;
-        UINT64 cs_end = cs->ol.key + (cs->length * Vcb->superblock.sector_size);
-        
-        if (cs->ol.key <= start && cs_end >= end) { // outer
-            if (cs->deleted) {
-                RtlClearAllBits(&bmp);
-            } else {
-                RtlSetAllBits(&bmp);
-                RtlCopyMemory(csum, &cs->checksums[(start - cs->ol.key) / Vcb->superblock.sector_size], sizeof(UINT32) * length);
-            }
-        } else if (cs->ol.key >= start && cs->ol.key <= end) { // right or inner
-            if (cs->deleted) {
-                RtlClearBits(&bmp, (cs->ol.key - start) / Vcb->superblock.sector_size, (min(end, cs_end) - cs->ol.key) / Vcb->superblock.sector_size);
-            } else {
-                RtlSetBits(&bmp, (cs->ol.key - start) / Vcb->superblock.sector_size, (min(end, cs_end) - cs->ol.key) / Vcb->superblock.sector_size);
-                RtlCopyMemory(&csum[(cs->ol.key - start) / Vcb->superblock.sector_size], cs->checksums, (min(end, cs_end) - cs->ol.key) * sizeof(UINT32) / Vcb->superblock.sector_size);
-            }
-        } else if (cs_end >= start && cs_end <= end) { // left
-            if (cs->deleted) {
-                RtlClearBits(&bmp, 0, (cs_end - start) / Vcb->superblock.sector_size);
-            } else {
-                RtlSetBits(&bmp, 0, (cs_end - start) / Vcb->superblock.sector_size);
-                RtlCopyMemory(csum, &cs->checksums[(start - cs->ol.key) / Vcb->superblock.sector_size], (cs_end - start) * sizeof(UINT32) / Vcb->superblock.sector_size);
-            }
-        }
-        
-        le = le->Flink;
-    }
-    
-    ExReleaseResourceLite(&Vcb->checksum_lock);
-    
-    runlength = RtlFindFirstRunClear(&bmp, &index);
-            
-    while (runlength != 0) {
-        Status = load_csum_from_disk(Vcb, &csum[index], start + (index * Vcb->superblock.sector_size), runlength, Irp);
-        if (!NT_SUCCESS(Status)) {
-            ERR("load_csum_from_disk returned %08x\n", Status);
-            goto end;
-        }
-       
-        runlength = RtlFindNextForwardRunClear(&bmp, index + runlength, &index);
-    }
-    
-    Status = STATUS_SUCCESS;
-    
-end:
-    if (bmpbuf)
-        ExFreePool(bmpbuf);
-    
-    if (NT_SUCCESS(Status))
-        *pcsum = csum;
-    else if (csum)
-        ExFreePool(csum);
-
-    return Status;
-}
-
-NTSTATUS STDCALL read_file(fcb* fcb, UINT8* data, UINT64 start, UINT64 length, ULONG* pbr, PIRP Irp) {
+NTSTATUS STDCALL read_file(fcb* fcb, UINT8* data, UINT64 start, UINT64 length, ULONG* pbr, PIRP Irp, BOOL check_nocsum_parity) {
     NTSTATUS Status;
     EXTENT_DATA* ed;
     UINT64 bytes_read = 0;
@@ -3128,7 +3082,7 @@ NTSTATUS STDCALL read_file(fcb* fcb, UINT8* data, UINT64 start, UINT64 length, U
                     UINT32 to_read, read;
                     UINT8* buf;
                     BOOL buf_free;
-                    UINT32 *csum, bumpoff = 0;
+                    UINT32 bumpoff = 0;
                     UINT64 addr, lockaddr, locklen;
                     chunk* c;
                     
@@ -3164,20 +3118,6 @@ NTSTATUS STDCALL read_file(fcb* fcb, UINT8* data, UINT64 start, UINT64 length, U
                         }
                     }
                     
-                    if (!(fcb->inode_item.flags & BTRFS_INODE_NODATASUM)) {
-                        Status = load_csum(fcb->Vcb, addr, to_read / fcb->Vcb->superblock.sector_size, &csum, Irp);
-                        
-                        if (!NT_SUCCESS(Status)) {
-                            ERR("load_csum returned %08x\n", Status);
-                            
-                            if (buf_free)
-                                ExFreePool(buf);
-                            
-                            goto exit;
-                        }
-                    } else
-                        csum = NULL;
-                    
                     c = get_chunk_from_address(fcb->Vcb, addr);
                     
                     if (!c) {
@@ -3194,8 +3134,8 @@ NTSTATUS STDCALL read_file(fcb* fcb, UINT8* data, UINT64 start, UINT64 length, U
                         chunk_lock_range(fcb->Vcb, c, lockaddr, locklen);
                     }
                     
-                    
-                    Status = read_data(fcb->Vcb, addr, to_read, csum, FALSE, buf, c, NULL, Irp);
+                    Status = read_data(fcb->Vcb, addr, to_read, ext->csum ? &ext->csum[off / fcb->Vcb->superblock.sector_size] : NULL, FALSE,
+                                       buf, c, NULL, Irp, check_nocsum_parity);
                     if (!NT_SUCCESS(Status)) {
                         ERR("read_data returned %08x\n", Status);
                         
@@ -3244,9 +3184,6 @@ NTSTATUS STDCALL read_file(fcb* fcb, UINT8* data, UINT64 start, UINT64 length, U
                     if (buf_free)
                         ExFreePool(buf);
                     
-                    if (csum)
-                        ExFreePool(csum);
-                    
                     bytes_read += read;
                     length -= read;
                     
@@ -3380,25 +3317,34 @@ NTSTATUS do_read(PIRP Irp, BOOL wait, ULONG* bytes_read) {
                 ccfs.FileSize = fcb->Header.FileSize;
                 ccfs.ValidDataLength = fcb->Header.ValidDataLength;
                 
-                TRACE("calling CcInitializeCacheMap (%llx, %llx, %llx)\n",
-                            ccfs.AllocationSize.QuadPart, ccfs.FileSize.QuadPart, ccfs.ValidDataLength.QuadPart);
-                CcInitializeCacheMap(FileObject, &ccfs, FALSE, cache_callbacks, FileObject);
-
-                CcSetReadAheadGranularity(FileObject, READ_AHEAD_GRANULARITY);
+                init_file_cache(FileObject, &ccfs);
             }
             
             if (IrpSp->MinorFunction & IRP_MN_MDL) {
                 CcMdlRead(FileObject,&IrpSp->Parameters.Read.ByteOffset, length, &Irp->MdlAddress, &Irp->IoStatus);
             } else {
-                TRACE("CcCopyRead(%p, %llx, %x, %u, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart, length, wait, data, &Irp->IoStatus);
-                TRACE("sizes = %llx, %llx, %llx\n", fcb->Header.AllocationSize, fcb->Header.FileSize, fcb->Header.ValidDataLength);
-                if (!CcCopyRead(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus)) {
-                    TRACE("CcCopyRead could not wait\n");
-                    
-                    IoMarkIrpPending(Irp);
-                    return STATUS_PENDING;
+                if (CcCopyReadEx) {
+                    TRACE("CcCopyReadEx(%p, %llx, %x, %u, %p, %p, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart,
+                          length, wait, data, &Irp->IoStatus, Irp->Tail.Overlay.Thread);
+                    TRACE("sizes = %llx, %llx, %llx\n", fcb->Header.AllocationSize, fcb->Header.FileSize, fcb->Header.ValidDataLength);
+                    if (!CcCopyReadEx(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus, Irp->Tail.Overlay.Thread)) {
+                        TRACE("CcCopyReadEx could not wait\n");
+                        
+                        IoMarkIrpPending(Irp);
+                        return STATUS_PENDING;
+                    }
+                    TRACE("CcCopyReadEx finished\n");
+                } else {
+                    TRACE("CcCopyRead(%p, %llx, %x, %u, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart, length, wait, data, &Irp->IoStatus);
+                    TRACE("sizes = %llx, %llx, %llx\n", fcb->Header.AllocationSize, fcb->Header.FileSize, fcb->Header.ValidDataLength);
+                    if (!CcCopyRead(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus)) {
+                        TRACE("CcCopyRead could not wait\n");
+                        
+                        IoMarkIrpPending(Irp);
+                        return STATUS_PENDING;
+                    }
+                    TRACE("CcCopyRead finished\n");
                 }
-                TRACE("CcCopyRead finished\n");
             }
         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
             Status = _SEH2_GetExceptionCode();
@@ -3431,20 +3377,30 @@ NTSTATUS do_read(PIRP Irp, BOOL wait, ULONG* bytes_read) {
             }
         }
         
-        ExAcquireResourceSharedLite(&fcb->Vcb->tree_lock, TRUE);
-    
         if (fcb->ads)
             Status = read_stream(fcb, data, start, length, bytes_read);
         else
-            Status = read_file(fcb, data, start, length, bytes_read, Irp);
-        
-        ExReleaseResourceLite(&fcb->Vcb->tree_lock);
+            Status = read_file(fcb, data, start, length, bytes_read, Irp, TRUE);
         
         *bytes_read += addon;
         TRACE("read %u bytes\n", *bytes_read);
         
         Irp->IoStatus.Information = *bytes_read;
         
+        if (diskacc && Status != STATUS_PENDING) {
+            PETHREAD thread = NULL;
+            
+            if (Irp->Tail.Overlay.Thread && !IoIsSystemThread(Irp->Tail.Overlay.Thread))
+                thread = Irp->Tail.Overlay.Thread;
+            else if (!IoIsSystemThread(PsGetCurrentThread()))
+                thread = PsGetCurrentThread();
+            else if (IoIsSystemThread(PsGetCurrentThread()) && IoGetTopLevelIrp() == Irp)
+                thread = PsGetCurrentThread();
+            
+            if (thread)
+                PsUpdateDiskCounters(PsGetThreadProcess(thread), *bytes_read, 0, 1, 0, 0);
+        }
+        
         return Status;
     }
 }
@@ -3458,7 +3414,7 @@ NTSTATUS STDCALL drv_read(PDEVICE_OBJECT DeviceObject, PIRP Irp) {
     BOOL top_level;
     fcb* fcb;
     ccb* ccb;
-    BOOL tree_lock = FALSE, fcb_lock = FALSE, pagefile;
+    BOOL fcb_lock = FALSE, wait;
     
     FsRtlEnterFileSystem();
     
@@ -3491,12 +3447,6 @@ NTSTATUS STDCALL drv_read(PDEVICE_OBJECT DeviceObject, PIRP Irp) {
         goto exit;
     }
     
-    if (fcb == Vcb->volume_fcb) {
-        TRACE("not allowing read of volume FCB\n");
-        Status = STATUS_INVALID_PARAMETER;
-        goto exit;
-    }
-    
     ccb = FileObject->FsContext2;
     
     if (!ccb) {
@@ -3511,22 +3461,25 @@ NTSTATUS STDCALL drv_read(PDEVICE_OBJECT DeviceObject, PIRP Irp) {
         goto exit;
     }
     
-    pagefile = fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE && Irp->Flags & IRP_PAGING_IO;
+    if (fcb == Vcb->volume_fcb) {
+        TRACE("reading volume FCB\n");
+        
+        IoSkipCurrentIrpStackLocation(Irp);
     
-    if (Irp->Flags & IRP_NOCACHE) {
-        if (!pagefile) {
-            if (!ExAcquireResourceSharedLite(&Vcb->tree_lock, IoIsOperationSynchronous(Irp))) {
-                Status = STATUS_PENDING;
-                IoMarkIrpPending(Irp);
-                goto exit;
-            }
-            
-            tree_lock = TRUE;
-        }
+        Status = IoCallDriver(Vcb->Vpb->RealDevice, Irp);
+        
+        goto exit2;
     }
     
+    wait = IoIsOperationSynchronous(Irp);
+    
+    // Don't offload jobs when doing paging IO - otherwise this can lead to
+    // deadlocks in CcCopyRead.
+    if (Irp->Flags & IRP_PAGING_IO)
+        wait = TRUE;
+    
     if (!ExIsResourceAcquiredSharedLite(fcb->Header.Resource)) {
-        if (!ExAcquireResourceSharedLite(fcb->Header.Resource, IoIsOperationSynchronous(Irp))) {
+        if (!ExAcquireResourceSharedLite(fcb->Header.Resource, wait)) {
             Status = STATUS_PENDING;
             IoMarkIrpPending(Irp);
             goto exit;
@@ -3535,14 +3488,11 @@ NTSTATUS STDCALL drv_read(PDEVICE_OBJECT DeviceObject, PIRP Irp) {
         fcb_lock = TRUE;
     }
     
-    Status = do_read(Irp, IoIsOperationSynchronous(Irp), &bytes_read);
+    Status = do_read(Irp, wait, &bytes_read);
     
 exit:
     if (fcb_lock)
         ExReleaseResourceLite(fcb->Header.Resource);
-    
-    if (tree_lock)
-        ExReleaseResourceLite(&Vcb->tree_lock);
 
     Irp->IoStatus.Status = Status;