Merge pull request #12 from chrislusf/master

sync
2024-12-03 18:19:04 +08:00 · 2020-08-20 19:18:23 +08:00 · 2020-08-20 19:18:23 +08:00 · b0d6330cf4
commit b0d6330cf4
parent 6a93e26fc3 f48567c5c6
152 changed files with 7589 additions and 1592 deletions
--- a/k8s/seaweedfs/Chart.yaml
+++ b/k8s/seaweedfs/Chart.yaml
@ -1,4 +1,4 @@
 apiVersion: v1
 description: SeaweedFS
 name: seaweedfs
-version: 1.88
+version: 1.90
--- a/k8s/seaweedfs/values.yaml
+++ b/k8s/seaweedfs/values.yaml
@ -4,7 +4,7 @@ global:
  registry: ""
  repository: ""
  imageName: chrislusf/seaweedfs
-  imageTag: "1.88"
+  imageTag: "1.90"
  imagePullPolicy: IfNotPresent
  imagePullSecrets: imagepullsecret
  restartPolicy: Always
--- a/other/java/client/pom.xml
+++ b/other/java/client/pom.xml
@ -5,7 +5,7 @@

    <groupId>com.github.chrislusf</groupId>
    <artifactId>seaweedfs-client</artifactId>
-    <version>1.4.5</version>
+    <version>1.4.6</version>

    <parent>
        <groupId>org.sonatype.oss</groupId>
--- a/other/java/client/pom.xml.deploy
+++ b/other/java/client/pom.xml.deploy
@ -5,7 +5,7 @@

    <groupId>com.github.chrislusf</groupId>
    <artifactId>seaweedfs-client</artifactId>
-    <version>1.4.5</version>
+    <version>1.4.6</version>

    <parent>
        <groupId>org.sonatype.oss</groupId>
--- a/other/java/client/pom_debug.xml
+++ b/other/java/client/pom_debug.xml
@ -5,7 +5,7 @@

    <groupId>com.github.chrislusf</groupId>
    <artifactId>seaweedfs-client</artifactId>
-    <version>1.4.5</version>
+    <version>1.4.6</version>

    <parent>
        <groupId>org.sonatype.oss</groupId>
--- a/other/java/client/src/main/java/seaweedfs/client/SeaweedRead.java
+++ b/other/java/client/src/main/java/seaweedfs/client/SeaweedRead.java
@ -23,7 +23,7 @@ public class SeaweedRead {
    // returns bytesRead
    public static long read(FilerGrpcClient filerGrpcClient, List<VisibleInterval> visibleIntervals,
                            final long position, final byte[] buffer, final int bufferOffset,
-                            final int bufferLength) throws IOException {
+                            final int bufferLength, final long fileSize) throws IOException {

        List<ChunkView> chunkViews = viewFromVisibles(visibleIntervals, position, bufferLength);

@ -42,6 +42,14 @@ public class SeaweedRead {
        long readCount = 0;
        int startOffset = bufferOffset;
        for (ChunkView chunkView : chunkViews) {
+
+            if (startOffset < chunkView.logicOffset) {
+                long gap = chunkView.logicOffset - startOffset;
+                LOG.debug("zero [{},{})", startOffset, startOffset + gap);
+                readCount += gap;
+                startOffset += gap;
+            }
+
            FilerProto.Locations locations = vid2Locations.get(parseVolumeId(chunkView.fileId));
            if (locations == null || locations.getLocationsCount() == 0) {
                LOG.error("failed to locate {}", chunkView.fileId);
@ -51,11 +59,22 @@ public class SeaweedRead {

            int len = readChunkView(position, buffer, startOffset, chunkView, locations);

+            LOG.debug("read [{},{}) {} size {}", startOffset, startOffset + len, chunkView.fileId, chunkView.size);
+
            readCount += len;
            startOffset += len;

        }

+        long limit = Math.min(bufferLength, fileSize);
+
+        if (startOffset < limit) {
+            long gap = limit - startOffset;
+            LOG.debug("zero2 [{},{})", startOffset, startOffset + gap);
+            readCount += gap;
+            startOffset += gap;
+        }
+
        return readCount;
    }

@ -71,7 +90,7 @@ public class SeaweedRead {
        int len = (int) chunkView.size;
        LOG.debug("readChunkView fid:{} chunkData.length:{} chunkView.offset:{} buffer.length:{} startOffset:{} len:{}",
                chunkView.fileId, chunkData.length, chunkView.offset, buffer.length, startOffset, len);
-        System.arraycopy(chunkData, (int) chunkView.offset, buffer, startOffset, len);
+        System.arraycopy(chunkData, startOffset - (int) (chunkView.logicOffset - chunkView.offset), buffer, startOffset, len);

        return len;
    }
@ -93,7 +112,7 @@ public class SeaweedRead {
            Header contentEncodingHeader = entity.getContentEncoding();

            if (contentEncodingHeader != null) {
-                HeaderElement[] encodings =contentEncodingHeader.getElements();
+                HeaderElement[] encodings = contentEncodingHeader.getElements();
                for (int i = 0; i < encodings.length; i++) {
                    if (encodings[i].getName().equalsIgnoreCase("gzip")) {
                        entity = new GzipDecompressingEntity(entity);
@ -134,18 +153,19 @@ public class SeaweedRead {

        long stop = offset + size;
        for (VisibleInterval chunk : visibleIntervals) {
-            if (chunk.start <= offset && offset < chunk.stop && offset < stop) {
+            long chunkStart = Math.max(offset, chunk.start);
+            long chunkStop = Math.min(stop, chunk.stop);
+            if (chunkStart < chunkStop) {
                boolean isFullChunk = chunk.isFullChunk && chunk.start == offset && chunk.stop <= stop;
                views.add(new ChunkView(
                        chunk.fileId,
-                        offset - chunk.start,
-                        Math.min(chunk.stop, stop) - offset,
-                        offset,
+                        chunkStart - chunk.start + chunk.chunkOffset,
+                        chunkStop - chunkStart,
+                        chunkStart,
                        isFullChunk,
                        chunk.cipherKey,
                        chunk.isCompressed
                ));
-                offset = Math.min(chunk.stop, stop);
            }
        }
        return views;
@ -160,7 +180,13 @@ public class SeaweedRead {
        Arrays.sort(chunks, new Comparator<FilerProto.FileChunk>() {
            @Override
            public int compare(FilerProto.FileChunk a, FilerProto.FileChunk b) {
-                return (int) (a.getMtime() - b.getMtime());
+                // if just a.getMtime() - b.getMtime(), it will overflow!
+                if (a.getMtime() < b.getMtime()) {
+                    return -1;
+                } else if (a.getMtime() > b.getMtime()) {
+                    return 1;
+                }
+                return 0;
            }
        });

@ -181,6 +207,7 @@ public class SeaweedRead {
                chunk.getOffset() + chunk.getSize(),
                chunk.getFileId(),
                chunk.getMtime(),
+                0,
                true,
                chunk.getCipherKey().toByteArray(),
                chunk.getIsCompressed()
@ -203,6 +230,7 @@ public class SeaweedRead {
                        chunk.getOffset(),
                        v.fileId,
                        v.modifiedTime,
+                        v.chunkOffset,
                        false,
                        v.cipherKey,
                        v.isCompressed
@ -215,6 +243,7 @@ public class SeaweedRead {
                        v.stop,
                        v.fileId,
                        v.modifiedTime,
+                        v.chunkOffset + (chunkStop - v.start),
                        false,
                        v.cipherKey,
                        v.isCompressed
@ -247,6 +276,10 @@ public class SeaweedRead {
        return fileId;
    }

+    /**
+     * Returns the larger of the size computed from the entry's chunk list and the
+     * file size recorded in the entry's attributes.
+     */
+    public static long fileSize(FilerProto.Entry entry) {
+        final long sizeFromChunks = totalSize(entry.getChunksList());
+        final long sizeFromAttributes = entry.getAttributes().getFileSize();
+        return Math.max(sizeFromChunks, sizeFromAttributes);
+    }
+
    public static long totalSize(List<FilerProto.FileChunk> chunksList) {
        long size = 0;
        for (FilerProto.FileChunk chunk : chunksList) {
@ -263,15 +296,17 @@ public class SeaweedRead {
        public final long stop;
        public final long modifiedTime;
        public final String fileId;
+        public final long chunkOffset;
        public final boolean isFullChunk;
        public final byte[] cipherKey;
        public final boolean isCompressed;

-        public VisibleInterval(long start, long stop, String fileId, long modifiedTime, boolean isFullChunk, byte[] cipherKey, boolean isCompressed) {
+        public VisibleInterval(long start, long stop, String fileId, long modifiedTime, long chunkOffset, boolean isFullChunk, byte[] cipherKey, boolean isCompressed) {
            this.start = start;
            this.stop = stop;
            this.modifiedTime = modifiedTime;
            this.fileId = fileId;
+            this.chunkOffset = chunkOffset;
            this.isFullChunk = isFullChunk;
            this.cipherKey = cipherKey;
            this.isCompressed = isCompressed;
--- a/other/java/hdfs2/dependency-reduced-pom.xml
+++ b/other/java/hdfs2/dependency-reduced-pom.xml
@ -301,7 +301,7 @@
    </snapshotRepository>
  </distributionManagement>
  <properties>
-    <seaweedfs.client.version>1.4.5</seaweedfs.client.version>
+    <seaweedfs.client.version>1.4.6</seaweedfs.client.version>
    <hadoop.version>2.9.2</hadoop.version>
  </properties>
 </project>
--- a/other/java/hdfs2/pom.xml
+++ b/other/java/hdfs2/pom.xml
@ -5,7 +5,7 @@
    <modelVersion>4.0.0</modelVersion>

    <properties>
-        <seaweedfs.client.version>1.4.5</seaweedfs.client.version>
+        <seaweedfs.client.version>1.4.6</seaweedfs.client.version>
        <hadoop.version>2.9.2</hadoop.version>
    </properties>

--- a/other/java/hdfs2/src/main/java/seaweed/hdfs/SeaweedFileSystemStore.java
+++ b/other/java/hdfs2/src/main/java/seaweed/hdfs/SeaweedFileSystemStore.java
@ -124,7 +124,7 @@ public class SeaweedFileSystemStore {

    private FileStatus doGetFileStatus(Path path, FilerProto.Entry entry) {
        FilerProto.FuseAttributes attributes = entry.getAttributes();
-        long length = SeaweedRead.totalSize(entry.getChunksList());
+        long length = SeaweedRead.fileSize(entry);
        boolean isDir = entry.getIsDirectory();
        int block_replication = 1;
        int blocksize = 512;
@ -185,7 +185,7 @@ public class SeaweedFileSystemStore {
                entry.mergeFrom(existingEntry);
                entry.getAttributesBuilder().setMtime(now);
                LOG.debug("createFile merged entry path:{} entry:{} from:{}", path, entry, existingEntry);
-                writePosition = SeaweedRead.totalSize(existingEntry.getChunksList());
+                writePosition = SeaweedRead.fileSize(existingEntry);
                replication = existingEntry.getAttributes().getReplication();
            }
        }
--- a/other/java/hdfs2/src/main/java/seaweed/hdfs/SeaweedInputStream.java
+++ b/other/java/hdfs2/src/main/java/seaweed/hdfs/SeaweedInputStream.java
@ -41,7 +41,7 @@ public class SeaweedInputStream extends FSInputStream {
        this.statistics = statistics;
        this.path = path;
        this.entry = entry;
-        this.contentLength = SeaweedRead.totalSize(entry.getChunksList());
+        this.contentLength = SeaweedRead.fileSize(entry);
        this.bufferSize = bufferSize;

        this.visibleIntervalList = SeaweedRead.nonOverlappingVisibleIntervals(filerGrpcClient, entry.getChunksList());
@ -87,7 +87,7 @@ public class SeaweedInputStream extends FSInputStream {
            throw new IllegalArgumentException("requested read length is more than will fit after requested offset in buffer");
        }

-        long bytesRead = SeaweedRead.read(this.filerGrpcClient, this.visibleIntervalList, this.position, b, off, len);
+        long bytesRead = SeaweedRead.read(this.filerGrpcClient, this.visibleIntervalList, this.position, b, off, len, SeaweedRead.fileSize(entry));
        if (bytesRead > Integer.MAX_VALUE) {
            throw new IOException("Unexpected Content-Length");
        }
--- a/other/java/hdfs3/dependency-reduced-pom.xml
+++ b/other/java/hdfs3/dependency-reduced-pom.xml
@ -120,6 +120,188 @@
      </plugin>
    </plugins>
  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <version>3.1.1</version>
+      <scope>provided</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>hadoop-hdfs-client</artifactId>
+          <groupId>org.apache.hadoop</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>hadoop-yarn-api</artifactId>
+          <groupId>org.apache.hadoop</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>hadoop-yarn-client</artifactId>
+          <groupId>org.apache.hadoop</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>hadoop-mapreduce-client-core</artifactId>
+          <groupId>org.apache.hadoop</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+          <groupId>org.apache.hadoop</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>hadoop-annotations</artifactId>
+          <groupId>org.apache.hadoop</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <version>3.1.1</version>
+      <scope>provided</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>commons-cli</artifactId>
+          <groupId>commons-cli</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-math3</artifactId>
+          <groupId>org.apache.commons</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-io</artifactId>
+          <groupId>commons-io</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-net</artifactId>
+          <groupId>commons-net</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-collections</artifactId>
+          <groupId>commons-collections</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>javax.servlet-api</artifactId>
+          <groupId>javax.servlet</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jetty-server</artifactId>
+          <groupId>org.eclipse.jetty</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jetty-util</artifactId>
+          <groupId>org.eclipse.jetty</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jetty-servlet</artifactId>
+          <groupId>org.eclipse.jetty</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jetty-webapp</artifactId>
+          <groupId>org.eclipse.jetty</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jsp-api</artifactId>
+          <groupId>javax.servlet.jsp</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jersey-core</artifactId>
+          <groupId>com.sun.jersey</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jersey-servlet</artifactId>
+          <groupId>com.sun.jersey</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jersey-json</artifactId>
+          <groupId>com.sun.jersey</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jersey-server</artifactId>
+          <groupId>com.sun.jersey</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>log4j</artifactId>
+          <groupId>log4j</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-lang</artifactId>
+          <groupId>commons-lang</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-beanutils</artifactId>
+          <groupId>commons-beanutils</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-configuration2</artifactId>
+          <groupId>org.apache.commons</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-lang3</artifactId>
+          <groupId>org.apache.commons</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>slf4j-log4j12</artifactId>
+          <groupId>org.slf4j</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>avro</artifactId>
+          <groupId>org.apache.avro</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>re2j</artifactId>
+          <groupId>com.google.re2j</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>hadoop-auth</artifactId>
+          <groupId>org.apache.hadoop</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jsch</artifactId>
+          <groupId>com.jcraft</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>curator-client</artifactId>
+          <groupId>org.apache.curator</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>curator-recipes</artifactId>
+          <groupId>org.apache.curator</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>htrace-core4</artifactId>
+          <groupId>org.apache.htrace</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>zookeeper</artifactId>
+          <groupId>org.apache.zookeeper</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>commons-compress</artifactId>
+          <groupId>org.apache.commons</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>kerb-simplekdc</artifactId>
+          <groupId>org.apache.kerby</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>jackson-databind</artifactId>
+          <groupId>com.fasterxml.jackson.core</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>stax2-api</artifactId>
+          <groupId>org.codehaus.woodstox</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>woodstox-core</artifactId>
+          <groupId>com.fasterxml.woodstox</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>hadoop-annotations</artifactId>
+          <groupId>org.apache.hadoop</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+  </dependencies>
  <distributionManagement>
    <snapshotRepository>
      <id>ossrh</id>
@ -127,7 +309,7 @@
    </snapshotRepository>
  </distributionManagement>
  <properties>
-    <seaweedfs.client.version>1.4.5</seaweedfs.client.version>
+    <seaweedfs.client.version>1.4.6</seaweedfs.client.version>
    <hadoop.version>3.1.1</hadoop.version>
  </properties>
 </project>
--- a/other/java/hdfs3/pom.xml
+++ b/other/java/hdfs3/pom.xml
@ -5,7 +5,7 @@
    <modelVersion>4.0.0</modelVersion>

    <properties>
-        <seaweedfs.client.version>1.4.5</seaweedfs.client.version>
+        <seaweedfs.client.version>1.4.6</seaweedfs.client.version>
        <hadoop.version>3.1.1</hadoop.version>
    </properties>

--- a/other/java/hdfs3/src/main/java/seaweed/hdfs/SeaweedFileSystemStore.java
+++ b/other/java/hdfs3/src/main/java/seaweed/hdfs/SeaweedFileSystemStore.java
@ -124,7 +124,7 @@ public class SeaweedFileSystemStore {

    private FileStatus doGetFileStatus(Path path, FilerProto.Entry entry) {
        FilerProto.FuseAttributes attributes = entry.getAttributes();
-        long length = SeaweedRead.totalSize(entry.getChunksList());
+        long length = SeaweedRead.fileSize(entry);
        boolean isDir = entry.getIsDirectory();
        int block_replication = 1;
        int blocksize = 512;
@ -185,7 +185,7 @@ public class SeaweedFileSystemStore {
                entry.mergeFrom(existingEntry);
                entry.getAttributesBuilder().setMtime(now);
                LOG.debug("createFile merged entry path:{} entry:{} from:{}", path, entry, existingEntry);
-                writePosition = SeaweedRead.totalSize(existingEntry.getChunksList());
+                writePosition = SeaweedRead.fileSize(existingEntry);
                replication = existingEntry.getAttributes().getReplication();
            }
        }
--- a/other/java/hdfs3/src/main/java/seaweed/hdfs/SeaweedInputStream.java
+++ b/other/java/hdfs3/src/main/java/seaweed/hdfs/SeaweedInputStream.java
@ -41,7 +41,7 @@ public class SeaweedInputStream extends FSInputStream {
        this.statistics = statistics;
        this.path = path;
        this.entry = entry;
-        this.contentLength = SeaweedRead.totalSize(entry.getChunksList());
+        this.contentLength = SeaweedRead.fileSize(entry);
        this.bufferSize = bufferSize;

        this.visibleIntervalList = SeaweedRead.nonOverlappingVisibleIntervals(filerGrpcClient, entry.getChunksList());
@ -87,7 +87,7 @@ public class SeaweedInputStream extends FSInputStream {
            throw new IllegalArgumentException("requested read length is more than will fit after requested offset in buffer");
        }

-        long bytesRead = SeaweedRead.read(this.filerGrpcClient, this.visibleIntervalList, this.position, b, off, len);
+        long bytesRead = SeaweedRead.read(this.filerGrpcClient, this.visibleIntervalList, this.position, b, off, len, SeaweedRead.fileSize(entry));
        if (bytesRead > Integer.MAX_VALUE) {
            throw new IOException("Unexpected Content-Length");
        }
--- a/test/random_access/pom.xml
+++ b/test/random_access/pom.xml
@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>com.seaweedfs.test</groupId>
+    <artifactId>random_access</artifactId>
+    <packaging>jar</packaging>
+    <version>1.0-SNAPSHOT</version>
+
+    <properties>
+        <guava.version>28.0-jre</guava.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.google.guava</groupId>
+            <artifactId>guava</artifactId>
+            <version>${guava.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.25</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.esotericsoftware.kryo</groupId>
+            <artifactId>kryo</artifactId>
+            <version>2.24.0</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <extensions>
+            <extension>
+                <groupId>kr.motd.maven</groupId>
+                <artifactId>os-maven-plugin</artifactId>
+                <version>1.6.2</version>
+            </extension>
+        </extensions>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>8</source>
+                    <target>8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
--- a/test/random_access/src/main/java/seaweedfs/client/btree/BTreePersistentIndexedCache.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/BTreePersistentIndexedCache.java
@ -0,0 +1,753 @@
+/*
+ * Copyright 2010 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import com.google.common.collect.ImmutableSet;
+import seaweedfs.client.btree.serialize.Serializer;
+import seaweedfs.client.btree.serialize.kryo.KryoBackedDecoder;
+import seaweedfs.client.btree.serialize.kryo.KryoBackedEncoder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+// todo - stream serialised value to file
+// todo - handle hash collisions (properly, this time)
+// todo - don't store null links to child blocks in leaf index blocks
+// todo - align block boundaries
+// todo - thread safety control
+// todo - merge small values into a single data block
+// todo - discard when file corrupt
+// todo - include data directly in index entry when serializer can guarantee small fixed sized data
+// todo - free list leaks disk space
+// todo - merge adjacent free blocks
+// todo - use more efficient lookup for free block with nearest size
+@SuppressWarnings("unchecked")
+public class BTreePersistentIndexedCache<K, V> {
+    private static final Logger LOGGER = LoggerFactory.getLogger(BTreePersistentIndexedCache.class);
+    private final File cacheFile;
+    private final KeyHasher<K> keyHasher;
+    private final Serializer<V> serializer;
+    private final short maxChildIndexEntries;
+    private final int minIndexChildNodes;
+    private final StateCheckBlockStore store;
+    private HeaderBlock header;
+
+    /**
+     * Creates a cache backed by {@code cacheFile}, delegating to the five-argument constructor
+     * with 512 for both {@code maxChildIndexEntries} and {@code maxFreeListEntries}.
+     */
+    public BTreePersistentIndexedCache(File cacheFile, Serializer<K> keySerializer, Serializer<V> valueSerializer) {
+        this(cacheFile, keySerializer, valueSerializer, (short) 512, 512);
+    }
+
+    /**
+     * Creates a cache backed by {@code cacheFile} and opens it immediately.
+     *
+     * @param cacheFile            file handed to the underlying {@code FileBackedBlockStore}
+     * @param keySerializer        serializer wrapped in the {@code KeyHasher} used for keys
+     * @param valueSerializer      serializer stored for values
+     * @param maxChildIndexEntries upper bound on entries per index block; half of it is kept
+     *                             as {@code minIndexChildNodes}
+     * @param maxFreeListEntries   passed through to the {@code FreeListBlockStore}
+     * @throws UncheckedIOException if opening the store fails
+     */
+    public BTreePersistentIndexedCache(File cacheFile, Serializer<K> keySerializer, Serializer<V> valueSerializer,
+                                       short maxChildIndexEntries, int maxFreeListEntries) {
+        this.cacheFile = cacheFile;
+        this.keyHasher = new KeyHasher<K>(keySerializer);
+        this.serializer = valueSerializer;
+        this.maxChildIndexEntries = maxChildIndexEntries;
+        this.minIndexChildNodes = maxChildIndexEntries / 2;
+        // Store layering, innermost first: file-backed store -> caching store -> free-list store
+        // -> state-checking store.
+        // NOTE(review): the class set given to CachingBlockStore presumably selects which block
+        // types are cached - confirm in CachingBlockStore.
+        BlockStore cachingStore = new CachingBlockStore(new FileBackedBlockStore(cacheFile), ImmutableSet.of(IndexBlock.class, FreeListBlockStore.FreeListBlock.class));
+        this.store = new StateCheckBlockStore(new FreeListBlockStore(cachingStore, maxFreeListEntries));
+        try {
+            open();
+        } catch (Exception e) {
+            // toString() only reads cacheFile, which is already assigned at this point.
+            throw new UncheckedIOException(String.format("Could not open %s.", this), e);
+        }
+    }
+
+    /** Describes this cache by its file name followed by the backing {@code File} in parentheses. */
+    @Override
+    public String toString() {
+        StringBuilder description = new StringBuilder("cache ");
+        description.append(cacheFile.getName()).append(" (").append(cacheFile).append(")");
+        return description.toString();
+    }
+
+    /**
+     * Opens the cache via {@link #doOpen()}; if that reports a {@code CorruptedCacheException}
+     * the cache is handed to {@code rebuild()} instead of propagating the failure.
+     *
+     * @throws Exception any other failure raised while opening
+     */
+    private void open() throws Exception {
+        LOGGER.debug("Opening {}", this);
+        try {
+            doOpen();
+        } catch (CorruptedCacheException e) {
+            rebuild();
+        }
+    }
+
+    /**
+     * Opens the underlying block store and loads the header block into {@code header}.
+     *
+     * @throws Exception any failure raised by the store while opening or reading
+     */
+    private void doOpen() throws Exception {
+        // Factory the store uses to instantiate payloads: only header, index and data blocks
+        // are supported; any other type is rejected.
+        BlockStore.Factory factory = new BlockStore.Factory() {
+            @Override
+            public Object create(Class<? extends BlockPayload> type) {
+                if (type == HeaderBlock.class) {
+                    return new HeaderBlock();
+                }
+                if (type == IndexBlock.class) {
+                    return new IndexBlock();
+                }
+                if (type == DataBlock.class) {
+                    return new DataBlock();
+                }
+                throw new UnsupportedOperationException();
+            }
+        };
+        // Initialisation callback: writes a fresh header, creates a new index root and flushes.
+        // NOTE(review): presumably the store runs this only when the file is new or empty -
+        // confirm in the BlockStore implementation.
+        Runnable initAction = new Runnable() {
+            @Override
+            public void run() {
+                header = new HeaderBlock();
+                store.write(header);
+                header.index.newRoot();
+                store.flush();
+            }
+        };
+
+        store.open(initAction, factory);
+        // Replace whatever the init action assigned with the header read back from the store.
+        header = store.readFirst(HeaderBlock.class);
+    }
+
+    /**
+     * Looks up the value stored for {@code key}.
+     *
+     * @return the stored value, or {@code null} when the index yields no data block for the key
+     *         or when the lookup reported a corrupt cache (which is rebuilt before returning)
+     * @throws UncheckedIOException on any other failure, including one raised while rebuilding
+     */
+    public V get(K key) {
+        try {
+            // The inner try stays nested so that a failure inside rebuild() is still wrapped
+            // by the outer handler.
+            try {
+                DataBlock found = header.getRoot().get(key);
+                return found == null ? null : found.getValue();
+            } catch (CorruptedCacheException corrupt) {
+                rebuild();
+                return null;
+            }
+        } catch (Exception failure) {
+            throw new UncheckedIOException(String.format("Could not read entry '%s' from %s.", key, this), failure);
+        }
+    }
+
+    /**
+     * Stores {@code value} under {@code key}, updating the existing data block when possible
+     * and otherwise writing a new one, then flushes the store.
+     *
+     * @throws UncheckedIOException if any step fails
+     */
+    public void put(K key, V value) {
+        try {
+            long hashCode = keyHasher.getHashCode(key);
+            Lookup lookup = header.getRoot().find(hashCode);
+            // Stays null when the existing block accepted the new value.
+            DataBlock newBlock = null;
+            if (lookup.entry != null) {
+                DataBlock block = store.read(lookup.entry.dataBlock, DataBlock.class);
+                DataBlockUpdateResult updateResult = block.useNewValue(value);
+                if (updateResult.isFailed()) {
+                    // The old block could not take the value: drop it and build a replacement
+                    // from the serialized form the update attempt already produced.
+                    store.remove(block);
+                    newBlock = new DataBlock(value, updateResult.getSerializedValue());
+                }
+            } else {
+                newBlock = new DataBlock(value);
+            }
+            if (newBlock != null) {
+                // Write first so the block has a position, then record it in the index.
+                store.write(newBlock);
+                lookup.indexBlock.put(hashCode, newBlock.getPos());
+            }
+            store.flush();
+        } catch (Exception e) {
+            throw new UncheckedIOException(String.format("Could not add entry '%s' to %s.", key, this), e);
+        }
+    }
+
+    /**
+     * Removes the entry for {@code key}, if the index has one: the index entry is removed,
+     * its data block is read and removed from the store, and the store is flushed.
+     *
+     * @throws UncheckedIOException if any step fails
+     */
+    public void remove(K key) {
+        try {
+            Lookup found = header.getRoot().find(key);
+            if (found.entry != null) {
+                found.indexBlock.remove(found.entry);
+                DataBlock stale = store.read(found.entry.dataBlock, DataBlock.class);
+                store.remove(stale);
+                store.flush();
+            }
+        } catch (Exception failure) {
+            throw new UncheckedIOException(String.format("Could not remove entry '%s' from %s.", key, this), failure);
+        }
+    }
+
+    /**
+     * Reads the index block at {@code pos} from the store and attaches its tree context.
+     *
+     * @param pos    position of the block to read
+     * @param root   value assigned to the block's {@code root} field
+     * @param parent value assigned to the block's {@code parent} field
+     * @param index  value assigned to the block's {@code parentEntryIndex} field
+     * @return the block that was read, with the three fields set
+     */
+    private IndexBlock load(BlockPointer pos, IndexRoot root, IndexBlock parent, int index) {
+        IndexBlock block = store.read(pos, IndexBlock.class);
+        block.root = root;
+        block.parent = parent;
+        block.parentEntryIndex = index;
+        return block;
+    }
+
+    /**
+     * Closes the cache and opens it again.
+     *
+     * @throws UncheckedIOException if closing or re-opening fails
+     */
+    public void reset() {
+        close();
+        try {
+            open();
+        } catch (Exception e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /**
+     * Closes the underlying block store.
+     *
+     * @throws UncheckedIOException if the store fails to close
+     */
+    public void close() {
+        LOGGER.debug("Closing {}", this);
+        try {
+            store.close();
+        } catch (Exception e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /** Reports whether the underlying block store is open. */
+    public boolean isOpen() {
+        return store.isOpen();
+    }
+
+    /**
+     * Discards the contents of a cache that was found to be corrupt by calling {@code clear()};
+     * if clearing fails, the failure is logged and the cache is closed instead.
+     */
+    private void rebuild() {
+        LOGGER.warn("{} is corrupt. Discarding.", this);
+        try {
+            clear();
+        } catch (Exception e) {
+            // Pass the exception as the trailing argument so SLF4J logs its stack trace;
+            // without it the reason the rebuild failed is lost.
+            LOGGER.warn("{} couldn't be rebuilt. Closing.", this, e);
+            close();
+        }
+    }
+
+    /**
+     * Runs the integrity check implemented by {@code doVerify()}.
+     *
+     * @throws UncheckedIOException wrapping whatever problem the check reported
+     */
+    public void verify() {
+        try {
+            doVerify();
+        } catch (Exception e) {
+            throw new UncheckedIOException(String.format("Some problems were found when checking the integrity of %s.",
+                    this), e);
+        }
+    }
+
+    /**
+     * Verifies the index tree and checks that no two collected blocks overlap on disk.
+     *
+     * @throws IOException if a block extends past the start of the next block by position
+     * @throws Exception   any failure raised while reading blocks or verifying the tree
+     */
+    private void doVerify() throws Exception {
+        List<BlockPayload> blocks = new ArrayList<BlockPayload>();
+
+        // Local header read fresh from the store; it shadows the field of the same name.
+        HeaderBlock header = store.readFirst(HeaderBlock.class);
+        blocks.add(header);
+        verifyTree(header.getRoot(), "", blocks, Long.MAX_VALUE, true);
+
+        // Order the collected blocks by position so overlaps show up between neighbours.
+        Collections.sort(blocks, new Comparator<BlockPayload>() {
+            @Override
+            public int compare(BlockPayload block, BlockPayload block1) {
+                return block.getPos().compareTo(block1.getPos());
+            }
+        });
+
+        for (int i = 0; i < blocks.size() - 1; i++) {
+            Block b1 = blocks.get(i).getBlock();
+            Block b2 = blocks.get(i + 1).getBlock();
+            // A block whose end lies beyond the next block's start overlaps it.
+            if (b1.getPos().getPos() + b1.getSize() > b2.getPos().getPos()) {
+                throw new IOException(String.format("%s overlaps with %s", b1, b2));
+            }
+        }
+    }
+
+    /**
+     * Recursively checks the structure of the sub-tree rooted at {@code current} and records every
+     * block that is visited.
+     *
+     * @param current  block to verify
+     * @param prefix   string that grows by three spaces per level; it is empty only for the initial
+     *                 call, which exempts that block from the minimum-entries check
+     * @param blocks   collection receiving each visited index block and, when requested, data block
+     * @param maxValue exclusive upper bound for the hash codes stored in this sub-tree
+     * @param loadData whether the data block of every entry is read and recorded as well
+     * @throws Exception if an invariant is violated or a block cannot be read
+     */
+    private void verifyTree(IndexBlock current, String prefix, Collection<BlockPayload> blocks, long maxValue,
+                            boolean loadData) throws Exception {
+        blocks.add(current);
+
+        final int entryCount = current.entries.size();
+        final boolean topLevel = prefix.equals("");
+        if (!topLevel && entryCount < maxChildIndexEntries / 2) {
+            throw new IOException(String.format("Too few entries found in %s", current));
+        }
+        if (entryCount > maxChildIndexEntries) {
+            throw new IOException(String.format("Too many entries found in %s", current));
+        }
+
+        // A leaf must not have a tail block; a non-leaf must have one
+        final boolean leaf = entryCount == 0 || current.entries.get(0).childIndexBlock.isNull();
+        final boolean hasTail = !current.tailPos.isNull();
+        if (leaf == hasTail) {
+            throw new IOException(String.format("Mismatched leaf/tail-node in %s", current));
+        }
+
+        long previousHash = Long.MIN_VALUE;
+        for (IndexEntry entry : current.entries) {
+            final boolean hasChild = !entry.childIndexBlock.isNull();
+            if (leaf == hasChild) {
+                throw new IOException(String.format("Mismatched leaf/non-leaf entry in %s", current));
+            }
+            if (entry.hashCode <= previousHash || entry.hashCode >= maxValue) {
+                throw new IOException(String.format("Out-of-order key in %s", current));
+            }
+            previousHash = entry.hashCode;
+            if (hasChild) {
+                // Everything below this entry must hash lower than the entry itself
+                final IndexBlock child = store.read(entry.childIndexBlock, IndexBlock.class);
+                verifyTree(child, "   " + prefix, blocks, entry.hashCode, loadData);
+            }
+            if (loadData) {
+                blocks.add(store.read(entry.dataBlock, DataBlock.class));
+            }
+        }
+        if (hasTail) {
+            final IndexBlock tail = store.read(current.tailPos, IndexBlock.class);
+            verifyTree(tail, "   " + prefix, blocks, maxValue, loadData);
+        }
+    }
+
+    /**
+     * Discards every block in the store, closes this cache and re-opens it through {@code doOpen()}.
+     *
+     * Any exception raised while re-opening is rethrown wrapped in an {@code UncheckedIOException}.
+     */
+    public void clear() {
+        store.clear();
+        close();
+        try {
+            doOpen();
+        } catch (Exception failure) {
+            throw new UncheckedIOException(failure);
+        }
+    }
+
+    /**
+     * Holds the position of the root index block on behalf of the header block that persists it.
+     */
+    private class IndexRoot {
+        private HeaderBlock owner;
+        private BlockPointer rootPos = BlockPointer.start();
+
+        private IndexRoot(HeaderBlock owner) {
+            this.owner = owner;
+        }
+
+        /**
+         * Records a new root position and writes the owning header block to the store.
+         */
+        public void setRootPos(BlockPointer rootPos) {
+            this.rootPos = rootPos;
+            store.write(owner);
+        }
+
+        /**
+         * Loads the block at the current root position; the loaded block has no parent.
+         */
+        public IndexBlock getRoot() {
+            return load(rootPos, this, null, 0);
+        }
+
+        /**
+         * Creates an empty index block, writes it to the store and makes it the root.
+         */
+        public IndexBlock newRoot() {
+            final IndexBlock freshRoot = new IndexBlock();
+            store.write(freshRoot);
+            setRootPos(freshRoot.getPos());
+            return freshRoot;
+        }
+    }
+
+    /**
+     * Block payload holding the root block position followed by the configured maximum number of
+     * entries per index block.
+     */
+    private class HeaderBlock extends BlockPayload {
+        private IndexRoot index;
+
+        private HeaderBlock() {
+            index = new IndexRoot(this);
+        }
+
+        @Override
+        protected byte getType() {
+            return 0x55;
+        }
+
+        @Override
+        protected int getSize() {
+            // One long for the root position plus one short for the entry limit
+            return Block.SHORT_SIZE + Block.LONG_SIZE;
+        }
+
+        /**
+         * Restores the root position and rejects the block as corrupt when the stored entry limit
+         * differs from {@code maxChildIndexEntries}.
+         */
+        @Override
+        protected void read(DataInputStream instr) throws Exception {
+            final long storedRootPos = instr.readLong();
+            index.rootPos = BlockPointer.pos(storedRootPos);
+
+            final short storedMaxEntries = instr.readShort();
+            if (storedMaxEntries == maxChildIndexEntries) {
+                return;
+            }
+            throw blockCorruptedException();
+        }
+
+        @Override
+        protected void write(DataOutputStream outstr) throws Exception {
+            final long rootPosition = index.rootPos.getPos();
+            outstr.writeLong(rootPosition);
+            outstr.writeShort(maxChildIndexEntries);
+        }
+
+        /**
+         * Loads the current root index block.
+         */
+        public IndexBlock getRoot() throws Exception {
+            return index.getRoot();
+        }
+    }
+
+    /**
+     * A node of the B-tree index. Each node stores a sorted list of entries (hash code, data block
+     * pointer, child index block pointer) plus a tail pointer to the child holding the hash codes
+     * greater than every entry of this node.
+     *
+     * NOTE(review): several methods keep mutating a block after calling {@code store.write(...)} on
+     * it (for example {@link #maybeSplit()} and {@link #remove(IndexEntry)}); this presumably relies
+     * on the store deferring serialization until it is flushed - confirm against the store in use.
+     */
+    private class IndexBlock extends BlockPayload {
+        // Entries of this node; put() and find() binary-search this list by hash code
+        private final List<IndexEntry> entries = new ArrayList<IndexEntry>();
+        // Child searched when a hash code is greater than every entry; a null pointer when there is none
+        private BlockPointer tailPos = BlockPointer.start();
+        // Transient fields: assigned by load(), never serialized by read()/write()
+        private IndexBlock parent;
+        private int parentEntryIndex;
+        private IndexRoot root;
+
+        @Override
+        protected byte getType() {
+            return 0x77;
+        }
+
+        /**
+         * Size reserved for a full node: the entry count (int), the tail pointer (long) and three
+         * longs per entry for the maximum number of entries.
+         */
+        @Override
+        protected int getSize() {
+            return Block.INT_SIZE + Block.LONG_SIZE + (3 * Block.LONG_SIZE) * maxChildIndexEntries;
+        }
+
+        /**
+         * Replaces the entries and tail pointer with the values read from {@code instr}: the entry
+         * count, then hash code / data pointer / child pointer for each entry, then the tail pointer.
+         */
+        @Override
+        public void read(DataInputStream instr) throws IOException {
+            int count = instr.readInt();
+            entries.clear();
+            for (int i = 0; i < count; i++) {
+                IndexEntry entry = new IndexEntry();
+                entry.hashCode = instr.readLong();
+                entry.dataBlock = BlockPointer.pos(instr.readLong());
+                entry.childIndexBlock = BlockPointer.pos(instr.readLong());
+                entries.add(entry);
+            }
+            tailPos = BlockPointer.pos(instr.readLong());
+        }
+
+        /**
+         * Writes the entries and tail pointer in the layout consumed by {@link #read(DataInputStream)}.
+         */
+        @Override
+        public void write(DataOutputStream outstr) throws IOException {
+            outstr.writeInt(entries.size());
+            for (IndexEntry entry : entries) {
+                outstr.writeLong(entry.hashCode);
+                outstr.writeLong(entry.dataBlock.getPos());
+                outstr.writeLong(entry.childIndexBlock.getPos());
+            }
+            outstr.writeLong(tailPos.getPos());
+        }
+
+        /**
+         * Points the entry for {@code hashCode} at the data block {@code pos}, inserting a new entry
+         * in sorted position when none exists, then splits this node if it has grown too large.
+         */
+        public void put(long hashCode, BlockPointer pos) throws Exception {
+            int index = Collections.binarySearch(entries, new IndexEntry(hashCode));
+            IndexEntry entry;
+            if (index >= 0) {
+                entry = entries.get(index);
+            } else {
+                // New entries are only expected to be inserted into nodes without a tail child
+                assert tailPos.isNull();
+                entry = new IndexEntry();
+                entry.hashCode = hashCode;
+                entry.childIndexBlock = BlockPointer.start();
+                // binarySearch returns (-(insertion point) - 1) when the key is absent
+                index = -index - 1;
+                entries.add(index, entry);
+            }
+
+            entry.dataBlock = pos;
+            store.write(this);
+
+            maybeSplit();
+        }
+
+        /**
+         * Splits this node when it holds more than {@code maxChildIndexEntries} entries: the middle
+         * entry moves up into the parent (a new root is created when there is no parent) and the
+         * entries after it move into a newly written sibling.
+         */
+        private void maybeSplit() throws Exception {
+            if (entries.size() > maxChildIndexEntries) {
+                int splitPos = entries.size() / 2;
+                IndexEntry splitEntry = entries.remove(splitPos);
+                if (parent == null) {
+                    parent = root.newRoot();
+                }
+                IndexBlock sibling = new IndexBlock();
+                store.write(sibling);
+                // subList is a view: clearing it removes the moved entries from this node
+                List<IndexEntry> siblingEntries = entries.subList(splitPos, entries.size());
+                sibling.entries.addAll(siblingEntries);
+                siblingEntries.clear();
+                // The sibling inherits this node's tail; the split entry's child becomes this node's tail
+                sibling.tailPos = tailPos;
+                tailPos = splitEntry.childIndexBlock;
+                splitEntry.childIndexBlock = BlockPointer.start();
+                parent.add(this, splitEntry, sibling);
+            }
+        }
+
+        /**
+         * Inserts {@code entry} into this node as the separator between the children {@code left} and
+         * {@code right}, then splits this node if it has grown too large.
+         */
+        private void add(IndexBlock left, IndexEntry entry, IndexBlock right) throws Exception {
+            int index = left.parentEntryIndex;
+            if (index < entries.size()) {
+                // The slot that pointed at left now has to point at right
+                IndexEntry parentEntry = entries.get(index);
+                assert parentEntry.childIndexBlock.equals(left.getPos());
+                parentEntry.childIndexBlock = right.getPos();
+            } else {
+                // left was reached through the tail pointer, so right becomes the new tail
+                assert index == entries.size() && (tailPos.isNull() || tailPos.equals(left.getPos()));
+                tailPos = right.getPos();
+            }
+            entries.add(index, entry);
+            entry.childIndexBlock = left.getPos();
+            store.write(this);
+
+            maybeSplit();
+        }
+
+        /**
+         * Returns the data block stored for {@code key}, or {@code null} when no entry matches.
+         */
+        public DataBlock get(K key) throws Exception {
+            Lookup lookup = find(key);
+            if (lookup.entry == null) {
+                return null;
+            }
+
+            return store.read(lookup.entry.dataBlock, DataBlock.class);
+        }
+
+        /**
+         * Hashes {@code key} with {@code keyHasher} and searches this sub-tree for that hash code.
+         */
+        public Lookup find(K key) throws Exception {
+            long checksum = keyHasher.getHashCode(key);
+            return find(checksum);
+        }
+
+        /**
+         * Searches this sub-tree for {@code hashCode}. The result holds the node and matching entry,
+         * or the node where the search ended and a {@code null} entry when nothing matches.
+         */
+        private Lookup find(long hashCode) throws Exception {
+            int index = Collections.binarySearch(entries, new IndexEntry(hashCode));
+            if (index >= 0) {
+                return new Lookup(this, entries.get(index));
+            }
+
+            // Not in this node: descend into the child at the insertion point (the tail when past the end)
+            index = -index - 1;
+            BlockPointer childBlockPos;
+            if (index == entries.size()) {
+                childBlockPos = tailPos;
+            } else {
+                childBlockPos = entries.get(index).childIndexBlock;
+            }
+            if (childBlockPos.isNull()) {
+                return new Lookup(this, null);
+            }
+
+            IndexBlock childBlock = load(childBlockPos, root, this, index);
+            return childBlock.find(hashCode);
+        }
+
+        /**
+         * Removes {@code entry} from this node. When the entry has a child, the highest entry of
+         * that child's sub-tree takes its place; the node that lost an entry is then merged with a
+         * sibling if required.
+         */
+        public void remove(IndexEntry entry) throws Exception {
+            int index = entries.indexOf(entry);
+            assert index >= 0;
+            entries.remove(index);
+            store.write(this);
+
+            if (entry.childIndexBlock.isNull()) {
+                maybeMerge();
+            } else {
+                // Not a leaf node. Move up an entry from a leaf node, then possibly merge the leaf node
+                IndexBlock leafBlock = load(entry.childIndexBlock, root, this, index);
+                leafBlock = leafBlock.findHighestLeaf();
+                IndexEntry highestEntry = leafBlock.entries.remove(leafBlock.entries.size() - 1);
+                highestEntry.childIndexBlock = entry.childIndexBlock;
+                entries.add(index, highestEntry);
+                store.write(leafBlock);
+                leafBlock.maybeMerge();
+            }
+        }
+
+        /**
+         * Restores the minimum-size invariant after a removal: an empty root with a tail is replaced
+         * by that tail, and an under-full non-root node is combined with its left sibling, or with
+         * its right sibling when there is no left one.
+         *
+         * @throws IllegalStateException if an under-full non-root node has no sibling
+         */
+        private void maybeMerge() throws Exception {
+            if (parent == null) {
+                // This is the root block. Can have any number of children <= maxChildIndexEntries
+                if (entries.size() == 0 && !tailPos.isNull()) {
+                    // This is an empty root block, discard it
+                    header.index.setRootPos(tailPos);
+                    store.remove(this);
+                }
+                return;
+            }
+
+            // This is not the root block. Must have children >= minIndexChildNodes
+            if (entries.size() >= minIndexChildNodes) {
+                return;
+            }
+
+            // Attempt to merge with the left sibling
+            IndexBlock left = parent.getPrevious(this);
+            if (left != null) {
+                assert entries.size() + left.entries.size() <= maxChildIndexEntries * 2;
+                if (left.entries.size() > minIndexChildNodes) {
+                    // There are enough entries in this block and the left sibling to make up 2 blocks, so redistribute
+                    // the entries between them: merge everything into the left sibling, then let it split again
+                    left.mergeFrom(this);
+                    left.maybeSplit();
+                    return;
+                } else {
+                    // There are only enough entries to make up 1 block, so move the entries of this block into
+                    // the left sibling and discard this block. Might also need to merge the parent
+                    left.mergeFrom(this);
+                    parent.maybeMerge();
+                    return;
+                }
+            }
+
+            // Attempt to merge with the right sibling
+            IndexBlock right = parent.getNext(this);
+            if (right != null) {
+                assert entries.size() + right.entries.size() <= maxChildIndexEntries * 2;
+                if (right.entries.size() > minIndexChildNodes) {
+                    // There are enough entries in this block and the right sibling to make up 2 blocks, so redistribute
+                    // the entries between them: merge everything into this block, then let it split again
+                    mergeFrom(right);
+                    maybeSplit();
+                    return;
+                } else {
+                    // There are only enough entries to make up 1 block, so move the entries of the right sibling into
+                    // this block and discard the right sibling. Might also need to merge the parent
+                    mergeFrom(right);
+                    parent.maybeMerge();
+                    return;
+                }
+            }
+
+            // Should not happen
+            throw new IllegalStateException(String.format("%s does not have any siblings.", getBlock()));
+        }
+
+        /**
+         * Absorbs the right-hand sibling {@code right} into this node: the separating entry is pulled
+         * down from the parent, followed by all entries of {@code right}, and {@code right} is then
+         * removed from the store.
+         */
+        private void mergeFrom(IndexBlock right) throws Exception {
+            // The parent entry at this node's slot is the separator between this node and right
+            IndexEntry newChildEntry = parent.entries.remove(parentEntryIndex);
+            if (right.getPos().equals(parent.tailPos)) {
+                parent.tailPos = getPos();
+            } else {
+                // After the removal, the entry now at this slot is the one that pointed at right
+                IndexEntry newParentEntry = parent.entries.get(parentEntryIndex);
+                assert newParentEntry.childIndexBlock.equals(right.getPos());
+                newParentEntry.childIndexBlock = getPos();
+            }
+            entries.add(newChildEntry);
+            entries.addAll(right.entries);
+            // The pulled-down separator takes over this node's old tail; right's tail becomes the new tail
+            newChildEntry.childIndexBlock = tailPos;
+            tailPos = right.tailPos;
+            store.write(parent);
+            store.write(this);
+            store.remove(right);
+        }
+
+        /**
+         * Loads the child of this node that follows {@code indexBlock}, or returns {@code null} when
+         * {@code indexBlock} was reached through the tail pointer.
+         */
+        private IndexBlock getNext(IndexBlock indexBlock) throws Exception {
+            int index = indexBlock.parentEntryIndex + 1;
+            if (index > entries.size()) {
+                return null;
+            }
+            if (index == entries.size()) {
+                return load(tailPos, root, this, index);
+            }
+            return load(entries.get(index).childIndexBlock, root, this, index);
+        }
+
+        /**
+         * Loads the child of this node that precedes {@code indexBlock}, or returns {@code null} when
+         * {@code indexBlock} is the first child.
+         */
+        private IndexBlock getPrevious(IndexBlock indexBlock) throws Exception {
+            int index = indexBlock.parentEntryIndex - 1;
+            if (index < 0) {
+                return null;
+            }
+            return load(entries.get(index).childIndexBlock, root, this, index);
+        }
+
+        /**
+         * Follows the tail pointers down from this node and returns the first node without a tail.
+         */
+        private IndexBlock findHighestLeaf() throws Exception {
+            if (tailPos.isNull()) {
+                return this;
+            }
+            return load(tailPos, root, this, entries.size()).findHighestLeaf();
+        }
+    }
+
+    /**
+     * One slot of an index block: a hash code together with the pointers to its data block and to
+     * its child index block. Entries are ordered by hash code only.
+     */
+    private static class IndexEntry implements Comparable<IndexEntry> {
+        long hashCode;
+        BlockPointer dataBlock;
+        BlockPointer childIndexBlock;
+
+        private IndexEntry() {
+        }
+
+        private IndexEntry(long hashCode) {
+            this.hashCode = hashCode;
+        }
+
+        /**
+         * Orders entries by ascending hash code, returning -1, 0 or 1.
+         */
+        @Override
+        public int compareTo(IndexEntry indexEntry) {
+            final long other = indexEntry.hashCode;
+            if (hashCode == other) {
+                return 0;
+            }
+            return hashCode > other ? 1 : -1;
+        }
+    }
+
+    /**
+     * Result of an index search: the index block where the search ended and the matching entry,
+     * which is {@code null} when no entry matched.
+     */
+    private class Lookup {
+        final IndexBlock indexBlock;
+        final IndexEntry entry;
+
+        private Lookup(IndexBlock block, IndexEntry match) {
+            indexBlock = block;
+            entry = match;
+        }
+    }
+
+    /**
+     * Block payload holding one serialized value. {@code size} is the capacity recorded for the
+     * value bytes; the value itself is deserialized lazily from {@code buffer}.
+     */
+    private class DataBlock extends BlockPayload {
+        private int size;
+        private StreamByteBuffer buffer;
+        private V value;
+
+        private DataBlock() {
+        }
+
+        /**
+         * Serializes {@code value} and records the serialized length as the block's capacity.
+         */
+        public DataBlock(V value) throws Exception {
+            this.value = value;
+            setValue(value);
+            size = buffer.totalBytesUnread();
+        }
+
+        /**
+         * Uses an already serialized {@code buffer} for {@code value} and records its unread length
+         * as the block's capacity.
+         */
+        public DataBlock(V value, StreamByteBuffer buffer) throws Exception {
+            this.value = value;
+            this.buffer = buffer;
+            size = buffer.totalBytesUnread();
+        }
+
+        /**
+         * Serializes {@code value} into a fresh buffer; the cached value and the capacity are left
+         * untouched.
+         */
+        public void setValue(V value) throws Exception {
+            buffer = StreamByteBuffer.createWithChunkSizeInDefaultRange(size);
+            final KryoBackedEncoder kryoEncoder = new KryoBackedEncoder(buffer.getOutputStream());
+            serializer.write(kryoEncoder, value);
+            kryoEncoder.flush();
+        }
+
+        /**
+         * Returns the cached value, deserializing it from the buffer (and dropping the buffer) first
+         * when no value is cached.
+         */
+        public V getValue() throws Exception {
+            if (value != null) {
+                return value;
+            }
+            value = serializer.read(new KryoBackedDecoder(buffer.getInputStream()));
+            buffer = null;
+            return value;
+        }
+
+        @Override
+        protected byte getType() {
+            return 0x33;
+        }
+
+        @Override
+        protected int getSize() {
+            // Two ints (capacity and actual byte count) precede the value bytes
+            return size + 2 * Block.INT_SIZE;
+        }
+
+        /**
+         * Reads the capacity, the actual byte count and then that many value bytes.
+         */
+        @Override
+        public void read(DataInputStream instr) throws Exception {
+            size = instr.readInt();
+            final int byteCount = instr.readInt();
+            buffer = StreamByteBuffer.of(instr, byteCount);
+        }
+
+        /**
+         * Writes the capacity, the number of unread buffer bytes and the buffer contents, then drops
+         * the buffer.
+         */
+        @Override
+        public void write(DataOutputStream outstr) throws Exception {
+            outstr.writeInt(size);
+            final int pendingBytes = buffer.totalBytesUnread();
+            outstr.writeInt(pendingBytes);
+            buffer.writeTo(outstr);
+            buffer = null;
+        }
+
+        /**
+         * Serializes {@code value} and, when it fits into the existing capacity, stores it in this
+         * block. Otherwise the serialized bytes are handed back in a failed result and this block is
+         * not written.
+         */
+        public DataBlockUpdateResult useNewValue(V value) throws Exception {
+            setValue(value);
+            if (buffer.totalBytesUnread() > size) {
+                return DataBlockUpdateResult.failed(buffer);
+            }
+            this.value = value;
+            store.write(this);
+            return DataBlockUpdateResult.success();
+        }
+    }
+
+    /**
+     * Outcome of an in-place data block update. A failed result carries the serialized value that
+     * did not fit; the shared successful result carries none.
+     */
+    private static class DataBlockUpdateResult {
+        private static final DataBlockUpdateResult SUCCESS = new DataBlockUpdateResult(true, null);
+        private final boolean success;
+        private final StreamByteBuffer serializedValue;
+
+        private DataBlockUpdateResult(boolean succeeded, StreamByteBuffer serialized) {
+            success = succeeded;
+            serializedValue = serialized;
+        }
+
+        /** Returns the shared successful result. */
+        static DataBlockUpdateResult success() {
+            return SUCCESS;
+        }
+
+        /** Creates a failed result that keeps hold of {@code serializedValue}. */
+        static DataBlockUpdateResult failed(StreamByteBuffer serializedValue) {
+            return new DataBlockUpdateResult(false, serializedValue);
+        }
+
+        public boolean isFailed() {
+            return !success;
+        }
+
+        public StreamByteBuffer getSerializedValue() {
+            return serializedValue;
+        }
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/Block.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/Block.java
@ -0,0 +1,59 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+/**
+ * Base class for a positioned region of a block store that carries a {@link BlockPayload}.
+ * Constructing a block links the payload back to it; {@link #detach()} breaks that link.
+ */
+public abstract class Block {
+    static final int LONG_SIZE = 8;
+    static final int INT_SIZE = 4;
+    static final int SHORT_SIZE = 2;
+
+    private BlockPayload payload;
+
+    protected Block(BlockPayload payload) {
+        this.payload = payload;
+        payload.setBlock(this);
+    }
+
+    public BlockPayload getPayload() {
+        return payload;
+    }
+
+    /**
+     * Unlinks this block and its payload from each other.
+     */
+    protected void detach() {
+        payload.setBlock(null);
+        payload = null;
+    }
+
+    public abstract BlockPointer getPos();
+
+    public abstract int getSize();
+
+    public abstract RuntimeException blockCorruptedException();
+
+    /**
+     * Returns the simple class name of the payload followed by this block's position.
+     */
+    @Override
+    public String toString() {
+        final String payloadName = payload.getClass().getSimpleName();
+        return payloadName + " " + getPos();
+    }
+
+    /**
+     * Returns the position immediately after this block, i.e. its position plus its size.
+     */
+    public BlockPointer getNextPos() {
+        final long nextOffset = getPos().getPos() + getSize();
+        return BlockPointer.pos(nextOffset);
+    }
+
+    public abstract boolean hasPos();
+
+    public abstract void setPos(BlockPointer pos);
+
+    public abstract void setSize(int size);
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/BlockPayload.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/BlockPayload.java
@ -0,0 +1,51 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+
+/**
+ * Content of a {@link Block}. Position queries and corruption reporting are delegated to the block
+ * this payload is currently attached to.
+ */
+public abstract class BlockPayload {
+    private Block block;
+
+    public Block getBlock() {
+        return block;
+    }
+
+    public void setBlock(Block block) {
+        this.block = block;
+    }
+
+    /** Returns the position of the attached block. */
+    public BlockPointer getPos() {
+        final Block attached = getBlock();
+        return attached.getPos();
+    }
+
+    /** Returns the position directly after the attached block. */
+    public BlockPointer getNextPos() {
+        final Block attached = getBlock();
+        return attached.getNextPos();
+    }
+
+    protected abstract int getSize();
+
+    protected abstract byte getType();
+
+    protected abstract void read(DataInputStream inputStream) throws Exception;
+
+    protected abstract void write(DataOutputStream outputStream) throws Exception;
+
+    /** Asks the attached block for the exception that reports it as corrupt. */
+    protected RuntimeException blockCorruptedException() {
+        final Block attached = getBlock();
+        return attached.blockCorruptedException();
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/BlockPointer.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/BlockPointer.java
@ -0,0 +1,75 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import com.google.common.primitives.Longs;
+
+/**
+ * Immutable position of a block. The position -1 is the null pointer and is represented by a
+ * single shared instance.
+ */
+public class BlockPointer implements Comparable<BlockPointer> {
+
+    private static final BlockPointer NULL = new BlockPointer(-1);
+
+    /**
+     * Returns the shared null pointer.
+     */
+    public static BlockPointer start() {
+        return NULL;
+    }
+
+    /**
+     * Returns a pointer for {@code pos}.
+     *
+     * @param pos the position; -1 yields the shared null pointer
+     * @return the pointer for {@code pos}
+     * @throws CorruptedCacheException if {@code pos} is less than -1
+     */
+    public static BlockPointer pos(long pos) {
+        if (pos < -1) {
+            // Keep a space before the value so the message reads "but was -5" rather than "but was-5"
+            throw new CorruptedCacheException("block pointer must be >= -1, but was " + pos);
+        }
+        if (pos == -1) {
+            return NULL;
+        }
+        return new BlockPointer(pos);
+    }
+
+    private final long pos;
+
+    private BlockPointer(long pos) {
+        this.pos = pos;
+    }
+
+    /**
+     * Returns {@code true} when this pointer does not refer to a block (its position is negative).
+     */
+    public boolean isNull() {
+        return pos < 0;
+    }
+
+    public long getPos() {
+        return pos;
+    }
+
+    @Override
+    public String toString() {
+        return String.valueOf(pos);
+    }
+
+    /**
+     * Two pointers are equal when they are of the same class and hold the same position.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (obj == null || obj.getClass() != getClass()) {
+            return false;
+        }
+        BlockPointer other = (BlockPointer) obj;
+        return pos == other.pos;
+    }
+
+    @Override
+    public int hashCode() {
+        return Longs.hashCode(pos);
+    }
+
+    /**
+     * Orders pointers by ascending position.
+     */
+    @Override
+    public int compareTo(BlockPointer o) {
+        return Longs.compare(pos, o.pos);
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/BlockStore.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/BlockStore.java
@ -0,0 +1,68 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+/**
+ * A store of {@link BlockPayload} blocks that are addressed by {@link BlockPointer}.
+ */
+public interface BlockStore {
+    /**
+     * Opens this store, calling the given action if the store is empty.
+     *
+     * @param initAction action to run when the store is empty
+     * @param factory factory handed to the store; presumably used to create payload instances
+     *                when blocks are read - confirm against the implementations
+     */
+    void open(Runnable initAction, Factory factory);
+
+    /**
+     * Closes this store.
+     */
+    void close();
+
+    /**
+     * Discards all blocks from this store.
+     */
+    void clear();
+
+    /**
+     * Removes the given block from this store.
+     *
+     * @param block the payload of the block to remove
+     */
+    void remove(BlockPayload block);
+
+    /**
+     * Reads the first block from this store.
+     *
+     * @param payloadType the payload class expected for the first block
+     * @return the payload of the first block
+     */
+    <T extends BlockPayload> T readFirst(Class<T> payloadType);
+    
+    /**
+     * Reads a block from this store.
+     *
+     * @param pos the position of the block to read
+     * @param payloadType the payload class expected at that position
+     * @return the payload of the block
+     */
+    <T extends BlockPayload> T read(BlockPointer pos, Class<T> payloadType);
+
+    /**
+     * Writes a block to this store, adding the block if required.
+     *
+     * @param block the payload to write
+     */
+    void write(BlockPayload block);
+
+    /**
+     * Adds a new block to this store. Allocates space for the block, but does not write the contents of the block
+     * until {@link #write(BlockPayload)} is called.
+     *
+     * @param block the payload to allocate space for
+     */
+    void attach(BlockPayload block);
+
+    /**
+     * Flushes any pending updates for this store.
+     */
+    void flush();
+
+    /**
+     * Creates objects for a given payload type.
+     */
+    interface Factory {
+        /**
+         * Creates an object for the given payload type.
+         *
+         * @param type the payload class to create an object for
+         * @return the created object
+         */
+        Object create(Class<? extends BlockPayload> type);
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/BufferCaster.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/BufferCaster.java
@ -0,0 +1,30 @@
+/*
+ * Copyright 2018 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree;
+
+import java.nio.Buffer;
+
+/**
+ * Helper that widens any NIO buffer to the base {@link Buffer} type.
+ */
+public class BufferCaster {
+    /**
+     * Returns the given buffer typed as {@link Buffer}.
+     *
+     * Without this cast, when the code compiled by Java 9+ is executed on Java 8, it will throw
+     * java.lang.NoSuchMethodError: Method flip()Ljava/nio/ByteBuffer; does not exist in class java.nio.ByteBuffer
+     *
+     * @param byteBuffer the buffer to widen
+     * @return the same instance, typed as {@code Buffer}
+     */
+    @SuppressWarnings("RedundantCast")
+    public static <T extends Buffer> Buffer cast(T byteBuffer) {
+        final Buffer widened = (Buffer) byteBuffer;
+        return widened;
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/ByteInput.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/ByteInput.java
@ -0,0 +1,74 @@
+/*
+ * Copyright 2014 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree;
+
+import com.google.common.io.CountingInputStream;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+
+/**
+ * Allows a stream of bytes to be read from a particular location of some backing byte stream.
+ */
+/**
+ * Reads bytes from a chosen offset of a {@link RandomAccessFile} through a reusable buffered
+ * stream, counting the bytes consumed since the read started.
+ */
+class ByteInput {
+    private final RandomAccessFile file;
+    private final ResettableBufferedInputStream bufferedInputStream;
+    private CountingInputStream countingInputStream;
+
+    public ByteInput(RandomAccessFile file) {
+        this.file = file;
+        final InputStream rawInput = new RandomAccessFileInputStream(file);
+        bufferedInputStream = new ResettableBufferedInputStream(rawInput);
+    }
+
+    /**
+     * Seeks the file to {@code offset}, empties the read buffer and returns a stream that reads from
+     * there while counting the bytes consumed.
+     */
+    public DataInputStream start(long offset) throws IOException {
+        file.seek(offset);
+        bufferedInputStream.clear();
+        final CountingInputStream counter = new CountingInputStream(bufferedInputStream);
+        countingInputStream = counter;
+        return new DataInputStream(counter);
+    }
+
+    /**
+     * Returns the number of bytes read since {@link #start(long)} was called.
+     */
+    public long getBytesRead() {
+        return countingInputStream.getCount();
+    }
+
+    /**
+     * Finishes reading by dropping the counting stream created by {@link #start(long)}.
+     */
+    public void done() {
+        countingInputStream = null;
+    }
+
+    /**
+     * Buffered stream whose buffer can be emptied, so that it can be reused after the file has been
+     * repositioned.
+     */
+    private static class ResettableBufferedInputStream extends BufferedInputStream {
+        ResettableBufferedInputStream(InputStream input) {
+            super(input);
+        }
+
+        // Resets the inherited buffer indices so the buffer is treated as holding no bytes
+        void clear() {
+            pos = 0;
+            count = 0;
+        }
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/ByteOutput.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/ByteOutput.java
@ -0,0 +1,74 @@
+/*
+ * Copyright 2014 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree;
+
+import com.google.common.io.CountingOutputStream;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.RandomAccessFile;
+
+/**
+ * Writes a stream of bytes beginning at an arbitrary offset of a backing {@link RandomAccessFile}.
+ *
+ * <p>Call {@link #start(long)}, write to the stream it returns, then call {@link #done()} to flush.
+ */
+class ByteOutput {
+    private final RandomAccessFile target;
+    private final ResettableBufferedOutputStream buffer;
+    private CountingOutputStream counter;
+
+    public ByteOutput(RandomAccessFile file) {
+        target = file;
+        buffer = new ResettableBufferedOutputStream(new RandomAccessFileOutputStream(file));
+    }
+
+    /**
+     * Moves the file to {@code offset} (which may lie past the current end of the file), discards any
+     * bytes still sitting in the buffer and returns a stream that writes from that position.
+     */
+    public DataOutputStream start(long offset) throws IOException {
+        target.seek(offset);
+        buffer.clear();
+        CountingOutputStream fresh = new CountingOutputStream(buffer);
+        counter = fresh;
+        return new DataOutputStream(fresh);
+    }
+
+    /**
+     * Returns the number of bytes written through the stream handed out by the latest
+     * {@link #start(long)} call.
+     */
+    public long getBytesWritten() {
+        return counter.getCount();
+    }
+
+    /**
+     * Ends the current write: flushes buffered bytes, then drops the counting stream.
+     */
+    public void done() throws IOException {
+        counter.flush();
+        counter = null;
+    }
+
+    /**
+     * A {@link BufferedOutputStream} whose pending, unflushed content can be thrown away.
+     */
+    private static class ResettableBufferedOutputStream extends BufferedOutputStream {
+        ResettableBufferedOutputStream(OutputStream output) {
+            super(output);
+        }
+
+        // Marks the buffer as empty without writing its content anywhere.
+        void clear() {
+            count = 0;
+        }
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/CachingBlockStore.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/CachingBlockStore.java
@ -0,0 +1,129 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.collect.ImmutableSet;
+
+import javax.annotation.Nullable;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * A {@link BlockStore} decorator that holds written blocks in memory until {@link #flush()} and keeps a
+ * bounded cache of blocks whose class is one of the configured cacheable types.
+ */
+public class CachingBlockStore implements BlockStore {
+    private final BlockStore delegate;
+    // Blocks accepted by write() that have not been handed to the delegate yet.
+    private final Map<BlockPointer, BlockPayload> pendingWrites = new LinkedHashMap<BlockPointer, BlockPayload>();
+    private final Cache<BlockPointer, BlockPayload> blockCache = CacheBuilder.newBuilder().maximumSize(100).concurrencyLevel(1).build();
+    private final ImmutableSet<Class<? extends BlockPayload>> cacheableBlockTypes;
+
+    public CachingBlockStore(BlockStore store, Collection<Class<? extends BlockPayload>> cacheableBlockTypes) {
+        this.delegate = store;
+        this.cacheableBlockTypes = ImmutableSet.copyOf(cacheableBlockTypes);
+    }
+
+    @Override
+    public void open(Runnable initAction, Factory factory) {
+        delegate.open(initAction, factory);
+    }
+
+    /**
+     * Flushes pending writes, empties the cache and closes the wrapped store.
+     */
+    @Override
+    public void close() {
+        flush();
+        blockCache.invalidateAll();
+        delegate.close();
+    }
+
+    /**
+     * Drops pending writes and cached blocks without writing them, then clears the wrapped store.
+     */
+    @Override
+    public void clear() {
+        pendingWrites.clear();
+        blockCache.invalidateAll();
+        delegate.clear();
+    }
+
+    /**
+     * Hands every pending block to the wrapped store and flushes it. A block leaves the pending set
+     * just before it is written.
+     */
+    @Override
+    public void flush() {
+        for (Iterator<BlockPayload> pending = pendingWrites.values().iterator(); pending.hasNext(); ) {
+            BlockPayload next = pending.next();
+            pending.remove();
+            delegate.write(next);
+        }
+        delegate.flush();
+    }
+
+    @Override
+    public void attach(BlockPayload block) {
+        delegate.attach(block);
+    }
+
+    /**
+     * Forgets any pending write and cached copy of {@code block}, then removes it from the wrapped store.
+     */
+    @Override
+    public void remove(BlockPayload block) {
+        pendingWrites.remove(block.getPos());
+        if (isCacheable(block)) {
+            blockCache.invalidate(block.getPos());
+        }
+        delegate.remove(block);
+    }
+
+    @Override
+    public <T extends BlockPayload> T readFirst(Class<T> payloadType) {
+        T first = delegate.readFirst(payloadType);
+        cacheIfEligible(first);
+        return first;
+    }
+
+    /**
+     * Looks the block up in the pending writes first, then in the cache, and only then reads it from the
+     * wrapped store (caching the result when its type is cacheable).
+     */
+    @Override
+    public <T extends BlockPayload> T read(BlockPointer pos, Class<T> payloadType) {
+        T found = payloadType.cast(pendingWrites.get(pos));
+        if (found == null) {
+            found = maybeGetFromCache(pos, payloadType);
+        }
+        if (found == null) {
+            found = delegate.read(pos, payloadType);
+            cacheIfEligible(found);
+        }
+        return found;
+    }
+
+    // Returns the cached block at pos, or null when the type is not cacheable or nothing is cached there.
+    @Nullable
+    private <T extends BlockPayload> T maybeGetFromCache(BlockPointer pos, Class<T> payloadType) {
+        return cacheableBlockTypes.contains(payloadType)
+                ? payloadType.cast(blockCache.getIfPresent(pos))
+                : null;
+    }
+
+    /**
+     * Attaches the block to the wrapped store and records it as pending; nothing reaches the wrapped
+     * store's {@code write} until {@link #flush()}.
+     */
+    @Override
+    public void write(BlockPayload block) {
+        delegate.attach(block);
+        cacheIfEligible(block);
+        pendingWrites.put(block.getPos(), block);
+    }
+
+    // Puts the block into the cache when its class is one of the cacheable types.
+    private <T extends BlockPayload> void cacheIfEligible(T block) {
+        if (isCacheable(block)) {
+            blockCache.put(block.getPos(), block);
+        }
+    }
+
+    private <T extends BlockPayload> boolean isCacheable(T block) {
+        Class<?> blockType = block.getClass();
+        return cacheableBlockTypes.contains(blockType);
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/CorruptedCacheException.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/CorruptedCacheException.java
@ -0,0 +1,22 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+/**
+ * Unchecked exception carrying a description of corrupted cache content.
+ */
+class CorruptedCacheException extends RuntimeException {
+    CorruptedCacheException(String description) {
+        super(description);
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/FileBackedBlockStore.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/FileBackedBlockStore.java
@ -0,0 +1,274 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+/**
+ * A {@link BlockStore} that keeps its blocks in a single {@link RandomAccessFile}.
+ *
+ * <p>On disk each block is: a 1-byte payload type, a 4-byte payload size, the payload, and a 4-byte count
+ * of the bytes written before that count (header plus payload). The count is verified when the block is
+ * read back. Blocks without a position are placed at {@code nextBlock}, which only ever grows until
+ * {@link #clear()} resets it.
+ */
+public class FileBackedBlockStore implements BlockStore {
+    private final File cacheFile;
+    private RandomAccessFile file;
+    private ByteOutput output;
+    private ByteInput input;
+    // Offset at which the next newly allocated block will be placed.
+    private long nextBlock;
+    private Factory factory;
+    // Length of the file as tracked by this store (updated on open, clear and when a write extends it).
+    private long currentFileSize;
+
+    public FileBackedBlockStore(File cacheFile) {
+        this.cacheFile = cacheFile;
+    }
+
+    @Override
+    public String toString() {
+        return "cache '" + cacheFile + "'";
+    }
+
+    /**
+     * Opens (creating if necessary) the backing file and, when the file is empty, runs {@code runnable}
+     * so the caller can write its initial blocks.
+     *
+     * @param runnable init action, run only when the file has length zero
+     * @param factory  used to create payload instances when blocks are read
+     * @throws UncheckedIOException if the file cannot be opened or its length cannot be read
+     */
+    @Override
+    public void open(Runnable runnable, Factory factory) {
+        this.factory = factory;
+        try {
+            // getParentFile() is null for a path without a parent component (e.g. "cache.bin");
+            // there is no directory to create in that case.
+            File parentDir = cacheFile.getParentFile();
+            if (parentDir != null) {
+                parentDir.mkdirs();
+            }
+            file = openRandomAccessFile();
+            output = new ByteOutput(file);
+            input = new ByteInput(file);
+            currentFileSize = file.length();
+            nextBlock = currentFileSize;
+            if (currentFileSize == 0) {
+                runnable.run();
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    // Opens the file read-write, falling back to read-only when that fails with FileNotFoundException.
+    private RandomAccessFile openRandomAccessFile() throws FileNotFoundException {
+        try {
+            return randomAccessFile("rw");
+        } catch (FileNotFoundException e) {
+            return randomAccessFile("r");
+        }
+    }
+
+    private RandomAccessFile randomAccessFile(String mode) throws FileNotFoundException {
+        return new RandomAccessFile(cacheFile, mode);
+    }
+
+    @Override
+    public void close() {
+        try {
+            file.close();
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /**
+     * Truncates the backing file to zero length and restarts allocation at offset 0.
+     */
+    @Override
+    public void clear() {
+        try {
+            file.setLength(0);
+            currentFileSize = 0;
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+        nextBlock = 0;
+    }
+
+    /**
+     * Gives {@code block} a container of this store's block type if it does not have one yet.
+     */
+    @Override
+    public void attach(BlockPayload block) {
+        if (block.getBlock() == null) {
+            block.setBlock(new BlockImpl(block));
+        }
+    }
+
+    /**
+     * Detaches the block's container; nothing is written to or erased from the file here.
+     */
+    @Override
+    public void remove(BlockPayload block) {
+        BlockImpl blockImpl = (BlockImpl) block.getBlock();
+        blockImpl.detach();
+    }
+
+    @Override
+    public void flush() {
+    }
+
+    /**
+     * Reads the block stored at offset 0.
+     */
+    @Override
+    public <T extends BlockPayload> T readFirst(Class<T> payloadType) {
+        return read(BlockPointer.pos(0), payloadType);
+    }
+
+    /**
+     * Creates a payload of {@code payloadType} through the factory and fills it from the block at
+     * {@code pos}.
+     *
+     * @throws CorruptedCacheException if the stored block fails validation
+     * @throws UncheckedIOException    wrapping any other failure
+     */
+    @Override
+    public <T extends BlockPayload> T read(BlockPointer pos, Class<T> payloadType) {
+        assert !pos.isNull();
+        try {
+            T payload = payloadType.cast(factory.create(payloadType));
+            BlockImpl block = new BlockImpl(payload, pos);
+            block.read();
+            return payload;
+        } catch (CorruptedCacheException e) {
+            throw e;
+        } catch (Exception e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /**
+     * Writes the block to the file at its position, allocating a position first if it has none.
+     *
+     * @throws UncheckedIOException wrapping any failure other than {@link CorruptedCacheException}
+     */
+    @Override
+    public void write(BlockPayload block) {
+        BlockImpl blockImpl = (BlockImpl) block.getBlock();
+        try {
+            blockImpl.write();
+        } catch (CorruptedCacheException e) {
+            throw e;
+        } catch (Exception e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    // Reserves length bytes at the current allocation offset and returns where they start.
+    private long alloc(long length) {
+        long pos = nextBlock;
+        nextBlock += length;
+        return pos;
+    }
+
+    /**
+     * Container that knows where its payload lives in the file and how to serialize it.
+     */
+    private final class BlockImpl extends Block {
+        private static final int HEADER_SIZE = 1 + INT_SIZE; // type, payload size
+        private static final int TAIL_SIZE = INT_SIZE;
+
+        // Null until a position is assigned (setPos) or lazily allocated (getPos).
+        private BlockPointer pos;
+        // -1 until computed from the payload, read from the file, or set through setSize.
+        private int payloadSize;
+
+        private BlockImpl(BlockPayload payload, BlockPointer pos) {
+            this(payload);
+            setPos(pos);
+        }
+
+        public BlockImpl(BlockPayload payload) {
+            super(payload);
+            pos = null;
+            payloadSize = -1;
+        }
+
+        @Override
+        public boolean hasPos() {
+            return pos != null;
+        }
+
+        /**
+         * Returns the block's position, allocating space for it in the file on first use.
+         */
+        @Override
+        public BlockPointer getPos() {
+            if (pos == null) {
+                pos = BlockPointer.pos(alloc(getSize()));
+            }
+            return pos;
+        }
+
+        @Override
+        public void setPos(BlockPointer pos) {
+            assert this.pos == null && !pos.isNull();
+            this.pos = pos;
+        }
+
+        /**
+         * Returns the total on-disk size: header, payload and trailing count.
+         */
+        @Override
+        public int getSize() {
+            if (payloadSize < 0) {
+                payloadSize = getPayload().getSize();
+            }
+            return payloadSize + HEADER_SIZE + TAIL_SIZE;
+        }
+
+        /**
+         * Sets the total on-disk size; the resulting payload size must not be smaller than the current one.
+         */
+        @Override
+        public void setSize(int size) {
+            int newPayloadSize = size - HEADER_SIZE - TAIL_SIZE;
+            assert newPayloadSize >= payloadSize;
+            payloadSize = newPayloadSize;
+        }
+
+        /**
+         * Serializes header, payload and trailing byte count at this block's position, then extends the
+         * file so that it covers the block's full reserved size.
+         */
+        public void write() throws Exception {
+            long pos = getPos().getPos();
+
+            DataOutputStream outputStream = output.start(pos);
+
+            BlockPayload payload = getPayload();
+
+            // Write header
+            outputStream.writeByte(payload.getType());
+            outputStream.writeInt(payloadSize);
+            long finalSize = pos + HEADER_SIZE + TAIL_SIZE + payloadSize;
+
+            // Write body
+            payload.write(outputStream);
+
+            // Write count
+            long bytesWritten = output.getBytesWritten();
+            if (bytesWritten > Integer.MAX_VALUE) {
+                throw new IllegalArgumentException("Block payload exceeds maximum size");
+            }
+            outputStream.writeInt((int) bytesWritten);
+            output.done();
+
+            // System.out.println(String.format("wrote [%d,%d)", pos, pos + bytesWritten + 4));
+
+            // Pad
+            if (currentFileSize < finalSize) {
+                // System.out.println(String.format("pad length %d => %d", currentFileSize, finalSize));
+                file.setLength(finalSize);
+                currentFileSize = finalSize;
+            }
+        }
+
+        /**
+         * Loads the payload from this block's position, validating the type byte, the stored payload size
+         * and the trailing byte count.
+         *
+         * @throws CorruptedCacheException (via {@link #blockCorruptedException()}) when validation fails
+         */
+        public void read() throws Exception {
+            long pos = getPos().getPos();
+            assert pos >= 0;
+            if (pos + HEADER_SIZE >= currentFileSize) {
+                throw blockCorruptedException();
+            }
+
+            DataInputStream inputStream = input.start(pos);
+
+            BlockPayload payload = getPayload();
+
+            // Read header
+            byte type = inputStream.readByte();
+            if (type != payload.getType()) {
+                throw blockCorruptedException();
+            }
+
+            // Read body
+            payloadSize = inputStream.readInt();
+            // A negative size can only come from a damaged header; without this check it would slip
+            // past the upper-bound test below.
+            if (payloadSize < 0 || pos + HEADER_SIZE + TAIL_SIZE + payloadSize > currentFileSize) {
+                throw blockCorruptedException();
+            }
+            payload.read(inputStream);
+
+            // Read and verify count
+            long actualCount = input.getBytesRead();
+            long count = inputStream.readInt();
+            if (actualCount != count) {
+                System.out.println(String.format("read expected %d actual %d, pos %d payloadSize %d currentFileSize %d", count, actualCount, pos, payloadSize, currentFileSize));
+                throw blockCorruptedException();
+            }
+            input.done();
+        }
+
+        @Override
+        public RuntimeException blockCorruptedException() {
+            return new CorruptedCacheException(String.format("Corrupted %s found in %s.", this,
+                    FileBackedBlockStore.this));
+        }
+    }
+
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/FreeListBlockStore.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/FreeListBlockStore.java
@ -0,0 +1,283 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * A {@link BlockStore} decorator that records the position and size of removed blocks in a chain of
+ * {@link FreeListBlock}s and offers those positions to blocks that are attached without a position.
+ */
+public class FreeListBlockStore implements BlockStore {
+    private final BlockStore store;
+    // Store through which free-list blocks themselves are read, written and removed; always this instance.
+    private final BlockStore freeListStore;
+    // Capacity of a single FreeListBlock; also determines its serialized size (see FreeListBlock#getSize()).
+    private final int maxBlockEntries;
+    // Head of the free-list chain; assigned in open() and set to null in close().
+    private FreeListBlock freeListBlock;
+
+    public FreeListBlockStore(BlockStore store, int maxBlockEntries) {
+        this.store = store;
+        freeListStore = this;
+        this.maxBlockEntries = maxBlockEntries;
+    }
+
+    /**
+     * Opens the wrapped store and loads the head free-list block from it.
+     *
+     * <p>The init action handed to the wrapped store first writes an empty head free-list block and then
+     * runs the caller's {@code initAction}. The factory handed to the wrapped store creates
+     * {@link FreeListBlock}s itself and defers every other type to the caller's {@code factory}.
+     */
+    @Override
+    public void open(final Runnable initAction, final Factory factory) {
+        Runnable freeListInitAction = new Runnable() {
+            @Override
+            public void run() {
+                freeListBlock = new FreeListBlock();
+                store.write(freeListBlock);
+                store.flush();
+                initAction.run();
+            }
+        };
+        Factory freeListFactory = new Factory() {
+            @Override
+            public Object create(Class<? extends BlockPayload> type) {
+                if (type == FreeListBlock.class) {
+                    return new FreeListBlock();
+                }
+                return factory.create(type);
+            }
+        };
+
+        store.open(freeListInitAction, freeListFactory);
+        freeListBlock = store.readFirst(FreeListBlock.class);
+    }
+
+    @Override
+    public void close() {
+        freeListBlock = null;
+        store.close();
+    }
+
+    @Override
+    public void clear() {
+        store.clear();
+    }
+
+    /**
+     * Removes the block from the wrapped store and records its position and size as free.
+     */
+    @Override
+    public void remove(BlockPayload block) {
+        Block container = block.getBlock();
+        store.remove(block);
+        freeListBlock.add(container.getPos(), container.getSize());
+    }
+
+    /**
+     * Reads the block located at the head free-list block's {@code getNextPos()}.
+     * NOTE(review): presumably that is the position immediately after the head free-list block,
+     * confirm against BlockPayload#getNextPos().
+     */
+    @Override
+    public <T extends BlockPayload> T readFirst(Class<T> payloadType) {
+        return store.read(freeListBlock.getNextPos(), payloadType);
+    }
+
+    @Override
+    public <T extends BlockPayload> T read(BlockPointer pos, Class<T> payloadType) {
+        return store.read(pos, payloadType);
+    }
+
+    /**
+     * Attaches the block (possibly assigning it a recycled position) and writes it to the wrapped store.
+     */
+    @Override
+    public void write(BlockPayload block) {
+        attach(block);
+        store.write(block);
+    }
+
+    /**
+     * Attaches the block to the wrapped store, then lets the free list assign it a position if the block
+     * has none and a suitable free entry exists.
+     */
+    @Override
+    public void attach(BlockPayload block) {
+        store.attach(block);
+        freeListBlock.alloc(block.getBlock());
+    }
+
+    @Override
+    public void flush() {
+        store.flush();
+    }
+
+    // Consistency check of the whole free-list chain; not called from anywhere in this class.
+    private void verify() {
+        FreeListBlock block = store.readFirst(FreeListBlock.class);
+        verify(block, Integer.MAX_VALUE);
+    }
+
+    // Checks one free-list block and, recursively, the blocks after it:
+    //  - no entry (nor largestInNextBlock) may exceed maxValue,
+    //  - no entry may be smaller than largestInNextBlock,
+    //  - entries must be in non-decreasing size order.
+    private void verify(FreeListBlock block, int maxValue) {
+        if (block.largestInNextBlock > maxValue) {
+            throw new RuntimeException("corrupt free list");
+        }
+        int current = 0;
+        for (FreeListEntry entry : block.entries) {
+            if (entry.size > maxValue) {
+                throw new RuntimeException("corrupt free list");
+            }
+            if (entry.size < block.largestInNextBlock) {
+                throw new RuntimeException("corrupt free list");
+            }
+            if (entry.size < current) {
+                throw new RuntimeException("corrupt free list");
+            }
+            current = entry.size;
+        }
+        if (!block.nextBlock.isNull()) {
+            verify(store.read(block.nextBlock, FreeListBlock.class), block.largestInNextBlock);
+        }
+    }
+
+    /**
+     * One node of the free-list chain: a size-ordered list of free regions plus a pointer to the next
+     * node, which holds entries no larger than {@code largestInNextBlock}.
+     */
+    public class FreeListBlock extends BlockPayload {
+        // Free regions held by this node, kept sorted by ascending size (see add()).
+        private List<FreeListEntry> entries = new ArrayList<FreeListEntry>();
+        // Size of the largest entry in the next node at the time this node was last updated.
+        private int largestInNextBlock;
+        private BlockPointer nextBlock = BlockPointer.start();
+        // Transient fields
+        private FreeListBlock prev;
+        private FreeListBlock next;
+
+        // Fixed size: next pointer + largestInNextBlock + entry count + room for maxBlockEntries entries,
+        // regardless of how many entries are currently present.
+        @Override
+        protected int getSize() {
+            return Block.LONG_SIZE + Block.INT_SIZE + Block.INT_SIZE + maxBlockEntries * (Block.LONG_SIZE
+                    + Block.INT_SIZE);
+        }
+
+        @Override
+        protected byte getType() {
+            return 0x44;
+        }
+
+        // Reads the fields in the order write() emits them; entries are appended to the existing list.
+        @Override
+        protected void read(DataInputStream inputStream) throws Exception {
+            nextBlock = BlockPointer.pos(inputStream.readLong());
+            largestInNextBlock = inputStream.readInt();
+            int count = inputStream.readInt();
+            for (int i = 0; i < count; i++) {
+                BlockPointer pos = BlockPointer.pos(inputStream.readLong());
+                int size = inputStream.readInt();
+                entries.add(new FreeListEntry(pos, size));
+            }
+        }
+
+        // Serialized form: next pointer, largestInNextBlock, entry count, then (position, size) per entry.
+        @Override
+        protected void write(DataOutputStream outputStream) throws Exception {
+            outputStream.writeLong(nextBlock.getPos());
+            outputStream.writeInt(largestInNextBlock);
+            outputStream.writeInt(entries.size());
+            for (FreeListEntry entry : entries) {
+                outputStream.writeLong(entry.pos.getPos());
+                outputStream.writeInt(entry.size);
+            }
+        }
+
+        /**
+         * Records the region at {@code pos} of {@code size} bytes as free. Zero-sized regions are ignored.
+         */
+        public void add(BlockPointer pos, int size) {
+            assert !pos.isNull() && size >= 0;
+            if (size == 0) {
+                return;
+            }
+
+            // Regions smaller than the largest one in the next node belong further down the chain.
+            if (size < largestInNextBlock) {
+                FreeListBlock next = getNextBlock();
+                next.add(pos, size);
+                return;
+            }
+
+            // Insert at the position that keeps entries sorted by size.
+            FreeListEntry entry = new FreeListEntry(pos, size);
+            int index = Collections.binarySearch(entries, entry);
+            if (index < 0) {
+                index = -index - 1;
+            }
+            entries.add(index, entry);
+
+            if (entries.size() > maxBlockEntries) {
+                // Overflow: split by moving the smaller half into a new node linked directly after this one.
+                FreeListBlock newBlock = new FreeListBlock();
+                newBlock.largestInNextBlock = largestInNextBlock;
+                newBlock.nextBlock = nextBlock;
+                newBlock.prev = this;
+                newBlock.next = next;
+                next = newBlock;
+
+                // subList is a view: clearing it below removes the moved entries from this node.
+                List<FreeListEntry> newBlockEntries = entries.subList(0, entries.size() / 2);
+                newBlock.entries.addAll(newBlockEntries);
+                newBlockEntries.clear();
+                largestInNextBlock = newBlock.entries.get(newBlock.entries.size() - 1).size;
+                // Writing through freeListStore attaches the new node first, which gives it a position.
+                freeListStore.write(newBlock);
+                nextBlock = newBlock.getPos();
+            }
+
+            freeListStore.write(this);
+        }
+
+        // Returns the next node, reading it from the store on first use and linking it back to this node.
+        private FreeListBlock getNextBlock() {
+            if (next == null) {
+                next = freeListStore.read(nextBlock, FreeListBlock.class);
+                next.prev = this;
+            }
+            return next;
+        }
+
+        /**
+         * Assigns {@code block} a free region that is at least as large as the block, if one is known.
+         * Does nothing when the block already has a position or no suitable region exists.
+         */
+        public void alloc(Block block) {
+            if (block.hasPos()) {
+                return;
+            }
+
+            int requiredSize = block.getSize();
+
+            // If this node has nothing, or the next node holds a region that is large enough,
+            // look further down the chain (where entries are smaller) when there is a next node.
+            if (entries.isEmpty() || requiredSize <= largestInNextBlock) {
+                if (nextBlock.isNull()) {
+                    return;
+                }
+                getNextBlock().alloc(block);
+                return;
+            }
+
+            // Find an entry whose size is >= requiredSize (only the size takes part in the comparison).
+            int index = Collections.binarySearch(entries, new FreeListEntry(null, requiredSize));
+            if (index < 0) {
+                index = -index - 1;
+            }
+            if (index == entries.size()) {
+                // Largest free block is too small
+                return;
+            }
+
+            // Hand the whole region to the block; the block takes over the region's size.
+            FreeListEntry entry = entries.remove(index);
+            block.setPos(entry.pos);
+            block.setSize(entry.size);
+            freeListStore.write(this);
+
+            // A non-head node that became empty is unlinked from the chain and removed from the store.
+            if (entries.size() == 0 && prev != null) {
+                prev.nextBlock = nextBlock;
+                prev.largestInNextBlock = largestInNextBlock;
+                prev.next = next;
+                if (next != null) {
+                    next.prev = prev;
+                }
+                freeListStore.write(prev);
+                freeListStore.remove(this);
+            }
+        }
+    }
+
+    /**
+     * A free region (position and size). Ordering considers the size only.
+     */
+    private static class FreeListEntry implements Comparable<FreeListEntry> {
+        final BlockPointer pos;
+        final int size;
+
+        private FreeListEntry(BlockPointer pos, int size) {
+            this.pos = pos;
+            this.size = size;
+        }
+
+        @Override
+        public int compareTo(FreeListEntry o) {
+            if (size > o.size) {
+                return 1;
+            }
+            if (size < o.size) {
+                return -1;
+            }
+            return 0;
+        }
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/KeyHasher.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/KeyHasher.java
@ -0,0 +1,75 @@
+/*
+ * Copyright 2014 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree;
+
+import seaweedfs.client.btree.serialize.Serializer;
+import seaweedfs.client.btree.serialize.kryo.KryoBackedEncoder;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.math.BigInteger;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+/**
+ * Produces a 64-bit hash for a key by serializing it and taking an MD5 digest of the serialized bytes.
+ */
+class KeyHasher<K> {
+    private final Serializer<K> serializer;
+    private final MessageDigestStream digestStream = new MessageDigestStream();
+    private final KryoBackedEncoder encoder = new KryoBackedEncoder(digestStream);
+
+    public KeyHasher(Serializer<K> serializer) {
+        this.serializer = serializer;
+    }
+
+    /**
+     * Serializes {@code key} into the digest stream, flushes the encoder and returns the resulting checksum.
+     */
+    long getHashCode(K key) throws Exception {
+        serializer.write(encoder, key);
+        encoder.flush();
+        long checksum = digestStream.getChecksum();
+        return checksum;
+    }
+
+    /**
+     * An {@link OutputStream} that feeds everything written to it into an MD5 {@link MessageDigest}.
+     */
+    private static class MessageDigestStream extends OutputStream {
+        MessageDigest messageDigest;
+
+        private MessageDigestStream() {
+            try {
+                messageDigest = MessageDigest.getInstance("MD5");
+            } catch (NoSuchAlgorithmException e) {
+                throw UncheckedException.throwAsUncheckedException(e);
+            }
+        }
+
+        @Override
+        public void write(int b) throws IOException {
+            byte single = (byte) b;
+            messageDigest.update(single);
+        }
+
+        @Override
+        public void write(byte[] b) throws IOException {
+            messageDigest.update(b);
+        }
+
+        @Override
+        public void write(byte[] b, int off, int len) throws IOException {
+            messageDigest.update(b, off, len);
+        }
+
+        // Completes the digest (which also resets it for the next key) and narrows it to a long.
+        long getChecksum() {
+            byte[] md5 = messageDigest.digest();
+            assert md5.length == 16;
+            BigInteger asNumber = new BigInteger(md5);
+            return asNumber.longValue();
+        }
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/RandomAccessFileInputStream.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/RandomAccessFileInputStream.java
@ -0,0 +1,54 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+
+/**
+ * Reads from a {@link RandomAccessFile}. Each operation reads from and advances the current position of the file.
+ *
+ * <p>Closing this stream does not close the underlying file.
+ */
+public class RandomAccessFileInputStream extends InputStream {
+    private final RandomAccessFile file;
+
+    public RandomAccessFileInputStream(RandomAccessFile file) {
+        this.file = file;
+    }
+
+    /**
+     * Advances the file position by up to {@code n} bytes without going past the end of the file.
+     *
+     * @param n number of bytes to skip; zero or negative values skip nothing
+     * @return the number of bytes actually skipped, which is smaller than {@code n} when fewer bytes remain
+     * @throws IOException if the file position or length cannot be read, or the seek fails
+     */
+    @Override
+    public long skip(long n) throws IOException {
+        if (n <= 0) {
+            return 0;
+        }
+        long position = file.getFilePointer();
+        // The position may already be beyond the end (RandomAccessFile allows that), so this can be negative.
+        long remaining = file.length() - position;
+        if (remaining <= 0) {
+            return 0;
+        }
+        long skipped = Math.min(n, remaining);
+        file.seek(position + skipped);
+        return skipped;
+    }
+
+    @Override
+    public int read(byte[] bytes) throws IOException {
+        return file.read(bytes);
+    }
+
+    @Override
+    public int read() throws IOException {
+        return file.read();
+    }
+
+    @Override
+    public int read(byte[] bytes, int offset, int length) throws IOException {
+        return file.read(bytes, offset, length);
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/RandomAccessFileOutputStream.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/RandomAccessFileOutputStream.java
@ -0,0 +1,48 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.RandomAccessFile;
+
+/**
+ * An {@link OutputStream} view of a {@link RandomAccessFile}: every write lands at the file's current
+ * position and moves that position forward.
+ *
+ * <p>Neither {@code close()} nor {@code flush()} is overridden, so closing this stream leaves the
+ * underlying file open and flushing it does nothing.
+ */
+public class RandomAccessFileOutputStream extends OutputStream {
+    private final RandomAccessFile target;
+
+    public RandomAccessFileOutputStream(RandomAccessFile file) {
+        target = file;
+    }
+
+    @Override
+    public void write(int value) throws IOException {
+        target.write(value);
+    }
+
+    @Override
+    public void write(byte[] data) throws IOException {
+        target.write(data, 0, data.length);
+    }
+
+    @Override
+    public void write(byte[] data, int start, int count) throws IOException {
+        target.write(data, start, count);
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/StateCheckBlockStore.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/StateCheckBlockStore.java
@ -0,0 +1,87 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+/**
+ * A {@link BlockStore} wrapper that tracks whether the store has been opened and checks that state
+ * before forwarding each call to the wrapped store. The checks are {@code assert} statements, so they
+ * only fire when assertions are enabled.
+ */
+public class StateCheckBlockStore implements BlockStore {
+    private final BlockStore delegate;
+    private boolean open;
+
+    /**
+     * @param blockStore the store every operation is forwarded to
+     */
+    public StateCheckBlockStore(BlockStore blockStore) {
+        delegate = blockStore;
+    }
+
+    /** Marks this store as open (asserting it was not open already) and opens the wrapped store. */
+    @Override
+    public void open(Runnable initAction, Factory factory) {
+        assert !open;
+        // The flag is set before delegating, exactly as the wrapped call order requires.
+        open = true;
+        delegate.open(initAction, factory);
+    }
+
+    /** @return {@code true} between a call to {@code open} and the next call to {@code close} */
+    public boolean isOpen() {
+        return open;
+    }
+
+    /** Closes the wrapped store if this store is open; calling it on a closed store is a no-op. */
+    @Override
+    public void close() {
+        if (open) {
+            open = false;
+            delegate.close();
+        }
+    }
+
+    @Override
+    public void clear() {
+        assert open;
+        delegate.clear();
+    }
+
+    @Override
+    public void remove(BlockPayload block) {
+        assert open;
+        delegate.remove(block);
+    }
+
+    @Override
+    public <T extends BlockPayload> T readFirst(Class<T> payloadType) {
+        assert open;
+        return delegate.readFirst(payloadType);
+    }
+
+    @Override
+    public <T extends BlockPayload> T read(BlockPointer pos, Class<T> payloadType) {
+        assert open;
+        return delegate.read(pos, payloadType);
+    }
+
+    @Override
+    public void write(BlockPayload block) {
+        assert open;
+        delegate.write(block);
+    }
+
+    @Override
+    public void attach(BlockPayload block) {
+        assert open;
+        delegate.attach(block);
+    }
+
+    @Override
+    public void flush() {
+        assert open;
+        delegate.flush();
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/StreamByteBuffer.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/StreamByteBuffer.java
@ -0,0 +1,526 @@
+/*
+ * Copyright 2016 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+
+/**
+ * An in-memory buffer that provides OutputStream and InputStream interfaces.
+ *
+ * <p>Written bytes are stored in a sequence of chunks: once the current write chunk is full it is
+ * queued in {@code chunks} and a new chunk is allocated, with the size of later chunks doubling up
+ * to {@code maxChunkSize}.
+ *
+ * <p>This is more efficient than using ByteArrayOutputStream/ByteArrayInputStream
+ *
+ * <p>Reading the buffer will clear the buffer.
+ * This is not thread-safe, it is intended to be used by a single Thread.
+ */
+public class StreamByteBuffer {
+    private static final int DEFAULT_CHUNK_SIZE = 4096;
+    private static final int MAX_CHUNK_SIZE = 1024 * 1024;
+    // Chunks that were filled by the writer and have not yet been picked up by the reader.
+    private LinkedList<StreamByteBufferChunk> chunks = new LinkedList<StreamByteBufferChunk>();
+    // Chunk that currently receives writes; never null after construction.
+    private StreamByteBufferChunk currentWriteChunk;
+    // Chunk that is currently being read; null until the first read and after clear().
+    private StreamByteBufferChunk currentReadChunk;
+    // Initial chunk size passed to the constructor; only assigned, not read, within this class.
+    private int chunkSize;
+    // Size used for the next chunk allocated by allocateSpace().
+    private int nextChunkSize;
+    // Upper bound for nextChunkSize.
+    private int maxChunkSize;
+    private StreamByteBufferOutputStream output;
+    private StreamByteBufferInputStream input;
+    // Sum of bytesUnread() over the chunks currently held in the chunks list.
+    private int totalBytesUnreadInList;
+
+    /** Creates a buffer whose first chunk has the default size of 4096 bytes. */
+    public StreamByteBuffer() {
+        this(DEFAULT_CHUNK_SIZE);
+    }
+
+    /**
+     * Creates a buffer whose first chunk holds {@code chunkSize} bytes.
+     *
+     * @param chunkSize size of the first chunk; it is not validated here
+     */
+    public StreamByteBuffer(int chunkSize) {
+        this.chunkSize = chunkSize;
+        this.nextChunkSize = chunkSize;
+        // A chunk size above MAX_CHUNK_SIZE is honoured; in that case chunks simply never grow.
+        this.maxChunkSize = Math.max(chunkSize, MAX_CHUNK_SIZE);
+        currentWriteChunk = new StreamByteBufferChunk(nextChunkSize);
+        output = new StreamByteBufferOutputStream();
+        input = new StreamByteBufferInputStream();
+    }
+
+    /**
+     * Creates a buffer and fills it with everything that can be read from {@code inputStream}.
+     * The first chunk is sized from {@code inputStream.available()}, clamped to the default range.
+     */
+    public static StreamByteBuffer of(InputStream inputStream) throws IOException {
+        StreamByteBuffer buffer = new StreamByteBuffer(chunkSizeInDefaultRange(inputStream.available()));
+        buffer.readFully(inputStream);
+        return buffer;
+    }
+
+    /**
+     * Creates a buffer and fills it with exactly {@code len} bytes read from {@code inputStream}.
+     *
+     * @throws EOFException if the stream ends before {@code len} bytes were read
+     */
+    public static StreamByteBuffer of(InputStream inputStream, int len) throws IOException {
+        StreamByteBuffer buffer = new StreamByteBuffer(chunkSizeInDefaultRange(len));
+        buffer.readFrom(inputStream, len);
+        return buffer;
+    }
+
+    /** Creates an empty buffer whose first chunk size is {@code value} clamped to the default range. */
+    public static StreamByteBuffer createWithChunkSizeInDefaultRange(int value) {
+        return new StreamByteBuffer(chunkSizeInDefaultRange(value));
+    }
+
+    /** Clamps {@code value} to the range [DEFAULT_CHUNK_SIZE, MAX_CHUNK_SIZE]. */
+    static int chunkSizeInDefaultRange(int value) {
+        return valueInRange(value, DEFAULT_CHUNK_SIZE, MAX_CHUNK_SIZE);
+    }
+
+    private static int valueInRange(int value, int min, int max) {
+        return Math.min(Math.max(value, min), max);
+    }
+
+    /** Returns the stream that appends to this buffer; the same instance is returned on every call. */
+    public OutputStream getOutputStream() {
+        return output;
+    }
+
+    /** Returns the stream that consumes this buffer; the same instance is returned on every call. */
+    public InputStream getInputStream() {
+        return input;
+    }
+
+    /** Writes all unread bytes to {@code target}, marking them as read. */
+    public void writeTo(OutputStream target) throws IOException {
+        while (prepareRead() != -1) {
+            currentReadChunk.writeTo(target);
+        }
+    }
+
+    /**
+     * Appends exactly {@code len} bytes from {@code inputStream} to this buffer.
+     *
+     * @throws EOFException if the stream reports end of input before {@code len} bytes were read
+     */
+    public void readFrom(InputStream inputStream, int len) throws IOException {
+        int bytesLeft = len;
+        while (bytesLeft > 0) {
+            int spaceLeft = allocateSpace();
+            // Never ask for more than fits into the current write chunk.
+            int limit = Math.min(spaceLeft, bytesLeft);
+            int readBytes = currentWriteChunk.readFrom(inputStream, limit);
+            if (readBytes == -1) {
+                throw new EOFException("Unexpected EOF");
+            }
+            bytesLeft -= readBytes;
+        }
+    }
+
+    /** Appends bytes from {@code inputStream} to this buffer until the stream reports end of input. */
+    public void readFully(InputStream inputStream) throws IOException {
+        while (true) {
+            int len = allocateSpace();
+            int readBytes = currentWriteChunk.readFrom(inputStream, len);
+            if (readBytes == -1) {
+                break;
+            }
+        }
+    }
+
+    /** Returns all unread bytes as a single new array, marking them as read. */
+    public byte[] readAsByteArray() {
+        byte[] buf = new byte[totalBytesUnread()];
+        input.readImpl(buf, 0, buf.length);
+        return buf;
+    }
+
+    /**
+     * Returns the unread content as a list of arrays, one per non-empty chunk, marking it as read.
+     * An element may be a chunk's backing array rather than a copy (see the chunk's readBuffer()).
+     */
+    public List<byte[]> readAsListOfByteArrays() {
+        List<byte[]> listOfByteArrays = new ArrayList<byte[]>(chunks.size() + 1);
+        byte[] buf;
+        while ((buf = input.readNextBuffer()) != null) {
+            if (buf.length > 0) {
+                listOfByteArrays.add(buf);
+            }
+        }
+        return listOfByteArrays;
+    }
+
+    /** Decodes the unread content using the charset with the given name. */
+    public String readAsString(String encoding) {
+        Charset charset = Charset.forName(encoding);
+        return readAsString(charset);
+    }
+
+    /** Decodes the unread content using the platform default charset. */
+    public String readAsString() {
+        return readAsString(Charset.defaultCharset());
+    }
+
+    /**
+     * Decodes the unread content using {@code charset}. A {@link CharacterCodingException} raised
+     * while decoding is rethrown wrapped in an {@code UncheckedIOException}.
+     */
+    public String readAsString(Charset charset) {
+        try {
+            return doReadAsString(charset);
+        } catch (CharacterCodingException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    // Returns "" without touching the buffer when nothing is unread.
+    private String doReadAsString(Charset charset) throws CharacterCodingException {
+        int unreadSize = totalBytesUnread();
+        if (unreadSize > 0) {
+            return readAsCharBuffer(charset).toString();
+        }
+        return "";
+    }
+
+    /**
+     * Decodes the unread bytes chunk by chunk into a CharBuffer. Malformed and unmappable input is
+     * replaced rather than reported. The buffer is cleared afterwards; bytes that were left undecoded
+     * in the last processed chunk are written back to this buffer.
+     */
+    private CharBuffer readAsCharBuffer(Charset charset) throws CharacterCodingException {
+        CharsetDecoder decoder = charset.newDecoder().onMalformedInput(
+                CodingErrorAction.REPLACE).onUnmappableCharacter(
+                CodingErrorAction.REPLACE);
+        // Capacity of one char per unread byte.
+        CharBuffer charbuffer = CharBuffer.allocate(totalBytesUnread());
+        // buf: bytes currently offered to the decoder; nextBuf: the following chunk, fetched early
+        // when bytes left over in buf have to be completed with bytes from the next chunk.
+        ByteBuffer buf = null;
+        boolean wasUnderflow = false;
+        ByteBuffer nextBuf = null;
+        boolean needsFlush = false;
+        // NOTE: prepareRead() has side effects (it may advance currentReadChunk), so the order of
+        // the conditions below matters.
+        while (hasRemaining(nextBuf) || hasRemaining(buf) || prepareRead() != -1) {
+            if (hasRemaining(buf)) {
+                // handle decoding underflow, multi-byte unicode character at buffer chunk boundary
+                if (!wasUnderflow) {
+                    throw new IllegalStateException("Unexpected state. Buffer has remaining bytes without underflow in decoding.");
+                }
+                if (!hasRemaining(nextBuf) && prepareRead() != -1) {
+                    nextBuf = currentReadChunk.readToNioBuffer();
+                }
+                // copy one by one until the underflow has been resolved
+                buf = ByteBuffer.allocate(buf.remaining() + 1).put(buf);
+                buf.put(nextBuf.get());
+                // BufferCaster is defined outside this file; presumably it only adapts the static
+                // type so that flip() can be called - confirm against its source.
+                BufferCaster.cast(buf).flip();
+            } else {
+                if (hasRemaining(nextBuf)) {
+                    // continue with what is left of the chunk that was fetched early
+                    buf = nextBuf;
+                } else if (prepareRead() != -1) {
+                    buf = currentReadChunk.readToNioBuffer();
+                    if (!hasRemaining(buf)) {
+                        throw new IllegalStateException("Unexpected state. Buffer is empty.");
+                    }
+                }
+                nextBuf = null;
+            }
+            boolean endOfInput = !hasRemaining(nextBuf) && prepareRead() == -1;
+            int bufRemainingBefore = buf.remaining();
+            CoderResult result = decoder.decode(buf, charbuffer, false);
+            // Remember that the decoder consumed something, so it is flushed after the loop.
+            if (bufRemainingBefore > buf.remaining()) {
+                needsFlush = true;
+            }
+            if (endOfInput) {
+                // Signal end of input to the decoder with an empty buffer.
+                result = decoder.decode(ByteBuffer.allocate(0), charbuffer, true);
+                if (!result.isUnderflow()) {
+                    result.throwException();
+                }
+                break;
+            }
+            wasUnderflow = result.isUnderflow();
+        }
+        if (needsFlush) {
+            CoderResult result = decoder.flush(charbuffer);
+            if (!result.isUnderflow()) {
+                result.throwException();
+            }
+        }
+        // Drop all chunk data; buf is a separate ByteBuffer and keeps its undecoded remainder.
+        clear();
+        // push back remaining bytes of multi-byte unicode character
+        while (hasRemaining(buf)) {
+            byte b = buf.get();
+            try {
+                getOutputStream().write(b);
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+        }
+        BufferCaster.cast(charbuffer).flip();
+        return charbuffer;
+    }
+
+    // Null-safe variant of ByteBuffer.hasRemaining().
+    private boolean hasRemaining(ByteBuffer nextBuf) {
+        return nextBuf != null && nextBuf.hasRemaining();
+    }
+
+    /** Returns the number of bytes that have been written but not yet read. */
+    public int totalBytesUnread() {
+        int total = totalBytesUnreadInList;
+        if (currentReadChunk != null) {
+            total += currentReadChunk.bytesUnread();
+        }
+        // Avoid counting the same chunk twice when the reader has caught up with the writer.
+        if (currentWriteChunk != currentReadChunk && currentWriteChunk != null) {
+            total += currentWriteChunk.bytesUnread();
+        }
+        return total;
+    }
+
+    /**
+     * Makes sure the current write chunk has free space, queueing a full chunk and allocating a
+     * new one if necessary.
+     *
+     * @return the number of bytes that can be written to the current write chunk
+     */
+    protected int allocateSpace() {
+        int spaceLeft = currentWriteChunk.spaceLeft();
+        if (spaceLeft == 0) {
+            addChunk(currentWriteChunk);
+            currentWriteChunk = new StreamByteBufferChunk(nextChunkSize);
+            // Double the size for the chunk after this one, capped at maxChunkSize.
+            if (nextChunkSize < maxChunkSize) {
+                nextChunkSize = Math.min(nextChunkSize * 2, maxChunkSize);
+            }
+            spaceLeft = currentWriteChunk.spaceLeft();
+        }
+        return spaceLeft;
+    }
+
+    /**
+     * Selects the chunk to read from next. When the current read chunk is exhausted, the next
+     * queued chunk is taken, or the write chunk if the queue is empty.
+     *
+     * @return the number of unread bytes in the current read chunk (this can be 0 when the write
+     *         chunk has just been selected and holds no unread data), or -1 when the reader is
+     *         already on the write chunk and it has no unread bytes
+     */
+    protected int prepareRead() {
+        int bytesUnread = (currentReadChunk != null) ? currentReadChunk.bytesUnread() : 0;
+        if (bytesUnread == 0) {
+            if (!chunks.isEmpty()) {
+                currentReadChunk = chunks.removeFirst();
+                bytesUnread = currentReadChunk.bytesUnread();
+                // The chunk left the list, so its bytes are now accounted for via currentReadChunk.
+                totalBytesUnreadInList -= bytesUnread;
+            } else if (currentReadChunk != currentWriteChunk) {
+                currentReadChunk = currentWriteChunk;
+                bytesUnread = currentReadChunk.bytesUnread();
+            } else {
+                bytesUnread = -1;
+            }
+        }
+        return bytesUnread;
+    }
+
+    /**
+     * Creates a buffer whose content is the given arrays, in order. The arrays are wrapped as
+     * chunks without being copied.
+     */
+    public static StreamByteBuffer of(List<byte[]> listOfByteArrays) {
+        StreamByteBuffer buffer = new StreamByteBuffer();
+        buffer.addChunks(listOfByteArrays);
+        return buffer;
+    }
+
+    private void addChunks(List<byte[]> listOfByteArrays) {
+        for (byte[] buf : listOfByteArrays) {
+            addChunk(new StreamByteBufferChunk(buf));
+        }
+    }
+
+    // Queues a chunk for reading and adds its unread bytes to the list total.
+    private void addChunk(StreamByteBufferChunk chunk) {
+        chunks.add(chunk);
+        totalBytesUnreadInList += chunk.bytesUnread();
+    }
+
+    /**
+     * A fixed-size byte array with a write position ({@code used}) and a read position
+     * ({@code pointer}); bytes between the two positions are unread.
+     */
+    static class StreamByteBufferChunk {
+        // Index of the next byte to read.
+        private int pointer;
+        private byte[] buffer;
+        // Capacity of the chunk.
+        private int size;
+        // Number of bytes written so far; also the index of the next byte to write.
+        private int used;
+
+        /** Creates an empty chunk with the given capacity. */
+        public StreamByteBufferChunk(int size) {
+            this.size = size;
+            buffer = new byte[size];
+        }
+
+        /** Wraps {@code buf} (without copying) as a completely filled, completely unread chunk. */
+        public StreamByteBufferChunk(byte[] buf) {
+            this.size = buf.length;
+            this.buffer = buf;
+            this.used = buf.length;
+        }
+
+        /**
+         * Returns a ByteBuffer wrapping the unread part of the backing array and marks the chunk as
+         * fully read, or returns null when nothing is unread.
+         */
+        public ByteBuffer readToNioBuffer() {
+            if (pointer < used) {
+                ByteBuffer result;
+                if (pointer > 0 || used < size) {
+                    result = ByteBuffer.wrap(buffer, pointer, used - pointer);
+                } else {
+                    result = ByteBuffer.wrap(buffer);
+                }
+                pointer = used;
+                return result;
+            }
+
+            return null;
+        }
+
+        /** Appends one byte; returns false (writing nothing) when the chunk is full. */
+        public boolean write(byte b) {
+            if (used < size) {
+                buffer[used++] = b;
+                return true;
+            }
+
+            return false;
+        }
+
+        /** Appends {@code len} bytes; capacity is not checked here, the caller must ensure they fit. */
+        public void write(byte[] b, int off, int len) {
+            System.arraycopy(b, off, buffer, used, len);
+            used = used + len;
+        }
+
+        /** Copies {@code len} bytes to {@code b}; the caller must not request more than bytesUnread(). */
+        public void read(byte[] b, int off, int len) {
+            System.arraycopy(buffer, pointer, b, off, len);
+            pointer = pointer + len;
+        }
+
+        /** Writes the unread bytes, if any, to {@code target} and marks them as read. */
+        public void writeTo(OutputStream target) throws IOException {
+            if (pointer < used) {
+                target.write(buffer, pointer, used - pointer);
+                pointer = used;
+            }
+        }
+
+        /** Moves the read position back to the start, making all written bytes unread again. */
+        public void reset() {
+            pointer = 0;
+        }
+
+        public int bytesUsed() {
+            return used;
+        }
+
+        public int bytesUnread() {
+            return used - pointer;
+        }
+
+        /** Returns the next unread byte as a value in 0..255, or -1 when nothing is unread. */
+        public int read() {
+            if (pointer < used) {
+                return buffer[pointer++] & 0xff;
+            }
+
+            return -1;
+        }
+
+        public int spaceLeft() {
+            return size - used;
+        }
+
+        /**
+         * Reads up to {@code len} bytes from {@code inputStream} into the free part of the chunk.
+         *
+         * @return the value returned by {@code inputStream.read}, i.e. -1 at end of input
+         */
+        public int readFrom(InputStream inputStream, int len) throws IOException {
+            int readBytes = inputStream.read(buffer, used, len);
+            if(readBytes > 0) {
+                used += readBytes;
+            }
+            return readBytes;
+        }
+
+        /** Empties the chunk by resetting both positions; the backing array is kept. */
+        public void clear() {
+            used = pointer = 0;
+        }
+
+        /**
+         * Returns the unread bytes and marks them as read. When the backing array is completely
+         * filled and nothing has been read yet, the backing array itself is returned (no copy);
+         * otherwise a new array is returned, which is empty when nothing is unread.
+         */
+        public byte[] readBuffer() {
+            if (used == buffer.length && pointer == 0) {
+                pointer = used;
+                return buffer;
+            } else if (pointer < used) {
+                byte[] buf = new byte[used - pointer];
+                read(buf, 0, used - pointer);
+                return buf;
+            } else {
+                return new byte[0];
+            }
+        }
+    }
+
+    /** OutputStream that appends to the enclosing buffer. */
+    class StreamByteBufferOutputStream extends OutputStream {
+        // Set by close(); the write methods in this class do not check it.
+        private boolean closed;
+
+        @Override
+        public void write(byte[] b, int off, int len) throws IOException {
+            if (b == null) {
+                throw new NullPointerException();
+            }
+
+            // The last condition catches int overflow of off + len.
+            if ((off < 0) || (off > b.length) || (len < 0)
+                    || ((off + len) > b.length) || ((off + len) < 0)) {
+                throw new IndexOutOfBoundsException();
+            }
+
+            if (len == 0) {
+                return;
+            }
+
+            int bytesLeft = len;
+            int currentOffset = off;
+            // Fill the current chunk, then continue in freshly allocated chunks until done.
+            while (bytesLeft > 0) {
+                int spaceLeft = allocateSpace();
+                int writeBytes = Math.min(spaceLeft, bytesLeft);
+                currentWriteChunk.write(b, currentOffset, writeBytes);
+                bytesLeft -= writeBytes;
+                currentOffset += writeBytes;
+            }
+        }
+
+        @Override
+        public void close() throws IOException {
+            closed = true;
+        }
+
+        public boolean isClosed() {
+            return closed;
+        }
+
+        @Override
+        public void write(int b) throws IOException {
+            // allocateSpace() guarantees room for at least one byte in currentWriteChunk.
+            allocateSpace();
+            currentWriteChunk.write((byte) b);
+        }
+
+        public StreamByteBuffer getBuffer() {
+            return StreamByteBuffer.this;
+        }
+    }
+
+    /** InputStream that consumes the enclosing buffer. */
+    class StreamByteBufferInputStream extends InputStream {
+        @Override
+        public int read() throws IOException {
+            // After prepareRead() currentReadChunk is non-null; its read() yields -1 when empty.
+            prepareRead();
+            return currentReadChunk.read();
+        }
+
+        @Override
+        public int read(byte[] b, int off, int len) throws IOException {
+            return readImpl(b, off, len);
+        }
+
+        /**
+         * Copies up to {@code len} unread bytes into {@code b}, crossing chunk boundaries as needed.
+         *
+         * @return the number of bytes copied, 0 when {@code len} is 0, or -1 when nothing was unread
+         */
+        int readImpl(byte[] b, int off, int len) {
+            if (b == null) {
+                throw new NullPointerException();
+            }
+
+            // The last condition catches int overflow of off + len.
+            if ((off < 0) || (off > b.length) || (len < 0)
+                    || ((off + len) > b.length) || ((off + len) < 0)) {
+                throw new IndexOutOfBoundsException();
+            }
+
+            if (len == 0) {
+                return 0;
+            }
+
+            int bytesLeft = len;
+            int currentOffset = off;
+            int bytesUnread = prepareRead();
+            int totalBytesRead = 0;
+            while (bytesLeft > 0 && bytesUnread != -1) {
+                int readBytes = Math.min(bytesUnread, bytesLeft);
+                currentReadChunk.read(b, currentOffset, readBytes);
+                bytesLeft -= readBytes;
+                currentOffset += readBytes;
+                totalBytesRead += readBytes;
+                bytesUnread = prepareRead();
+            }
+            if (totalBytesRead > 0) {
+                return totalBytesRead;
+            }
+
+            return -1;
+        }
+
+        @Override
+        public int available() throws IOException {
+            return totalBytesUnread();
+        }
+
+        public StreamByteBuffer getBuffer() {
+            return StreamByteBuffer.this;
+        }
+
+        /** Returns the unread bytes of the next chunk (possibly an empty array), or null when no chunk is left. */
+        public byte[] readNextBuffer() {
+            if (prepareRead() != -1) {
+                return currentReadChunk.readBuffer();
+            }
+            return null;
+        }
+    }
+
+    /** Discards all content: drops the queued chunks and empties the current write chunk for reuse. */
+    public void clear() {
+        chunks.clear();
+        currentReadChunk = null;
+        totalBytesUnreadInList = 0;
+        currentWriteChunk.clear();
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/UncheckedException.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/UncheckedException.java
@ -0,0 +1,88 @@
+/*
+ * Copyright 2010 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree;
+
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.util.concurrent.Callable;
+
+/**
+ * Unchecked wrapper around a checked exception. It adds no context of its own beyond the cause
+ * (and, optionally, the cause's message).
+ */
+public final class UncheckedException extends RuntimeException {
+    private UncheckedException(Throwable cause) {
+        super(cause);
+    }
+
+    private UncheckedException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    /**
+     * Rethrows {@code t} as an unchecked exception without preserving its message on the wrapper.
+     *
+     * <p>Note: this method never returns normally. The declared return type only exists so callers
+     * can write {@code throw throwAsUncheckedException(t)} to keep the compiler happy.
+     */
+    public static RuntimeException throwAsUncheckedException(Throwable t) {
+        return throwAsUncheckedException(t, false);
+    }
+
+    /**
+     * Rethrows {@code t} in unchecked form: runtime exceptions and errors are rethrown as they are,
+     * an {@link IOException} is wrapped in an {@code UncheckedIOException}, anything else in an
+     * {@code UncheckedException}. If {@code t} is an {@link InterruptedException}, the current
+     * thread's interrupt flag is set again before wrapping.
+     *
+     * <p>Note: this method never returns normally. The declared return type only exists so callers
+     * can write {@code throw throwAsUncheckedException(t, ...)} to keep the compiler happy.
+     *
+     * @param preserveMessage when {@code true}, the wrapper's message is {@code t.getMessage()}
+     */
+    public static RuntimeException throwAsUncheckedException(Throwable t, boolean preserveMessage) {
+        if (t instanceof InterruptedException) {
+            Thread.currentThread().interrupt();
+        }
+
+        if (t instanceof RuntimeException) {
+            throw (RuntimeException) t;
+        } else if (t instanceof Error) {
+            throw (Error) t;
+        } else if (t instanceof IOException) {
+            throw preserveMessage
+                    ? new UncheckedIOException(t.getMessage(), t)
+                    : new UncheckedIOException(t);
+        }
+
+        throw preserveMessage
+                ? new UncheckedException(t.getMessage(), t)
+                : new UncheckedException(t);
+    }
+
+    /**
+     * Invokes {@code callable} and returns its result; any exception it throws is rethrown through
+     * {@link #throwAsUncheckedException(Throwable)}.
+     */
+    public static <T> T callUnchecked(Callable<T> callable) {
+        T result;
+        try {
+            result = callable.call();
+        } catch (Exception failure) {
+            throw throwAsUncheckedException(failure);
+        }
+        return result;
+    }
+
+    /**
+     * Rethrows the target exception of the given {@link InvocationTargetException} in unchecked
+     * form, which keeps the resulting exception chain shorter without losing information.
+     *
+     * <p>Note: this method never returns normally. The declared return type only exists to keep
+     * the compiler happy.
+     *
+     * @param e to be unwrapped
+     * @return an instance of RuntimeException based on the target exception of the parameter.
+     */
+    public static RuntimeException unwrapAndRethrow(InvocationTargetException e) {
+        Throwable target = e.getTargetException();
+        return throwAsUncheckedException(target);
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/UncheckedIOException.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/UncheckedIOException.java
@ -0,0 +1,36 @@
+/*
+ * Copyright 2012 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+/**
+ * Unchecked counterpart of {@link java.io.IOException}: lets an I/O failure propagate through code
+ * that does not declare checked exceptions. Any {@link Throwable} is accepted as the cause.
+ */
+public class UncheckedIOException extends RuntimeException {
+    /** Creates an exception that wraps {@code cause}. */
+    public UncheckedIOException(Throwable cause) {
+        super(cause);
+    }
+
+    /** Creates an exception with the given message that wraps {@code cause}. */
+    public UncheckedIOException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    /** Creates an exception with the given message and no cause. */
+    public UncheckedIOException(String message) {
+        super(message);
+    }
+
+    /** Creates an exception with neither message nor cause. */
+    public UncheckedIOException() {
+        super();
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/AbstractDecoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/AbstractDecoder.java
@ -0,0 +1,133 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+import javax.annotation.Nullable;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Base class for {@link Decoder} implementations. It supplies the operations that can be expressed
+ * through other decoder methods and an {@link InputStream} view backed by
+ * {@link #maybeReadBytes(byte[], int, int)} and {@link #maybeSkip(long)}. The "small" integer reads
+ * default to the plain int/long reads, and chunked decoding is unsupported unless overridden.
+ */
+public abstract class AbstractDecoder implements Decoder {
+    // Created on first use by getInputStream().
+    private DecoderStream stream;
+
+    @Override
+    public InputStream getInputStream() {
+        DecoderStream current = stream;
+        if (current == null) {
+            current = new DecoderStream();
+            stream = current;
+        }
+        return current;
+    }
+
+    @Override
+    public void readBytes(byte[] buffer) throws IOException {
+        readBytes(buffer, 0, buffer.length);
+    }
+
+    /** Reads a length (as a small int) followed by that many bytes. */
+    @Override
+    public byte[] readBinary() throws EOFException, IOException {
+        byte[] payload = new byte[readSmallInt()];
+        readBytes(payload);
+        return payload;
+    }
+
+    @Override
+    public int readSmallInt() throws EOFException, IOException {
+        return readInt();
+    }
+
+    @Override
+    public long readSmallLong() throws EOFException, IOException {
+        return readLong();
+    }
+
+    /** Reads a presence flag and, only if it is set, the small int that follows it. */
+    @Nullable
+    @Override
+    public Integer readNullableSmallInt() throws IOException {
+        if (!readBoolean()) {
+            return null;
+        }
+        return readSmallInt();
+    }
+
+    /** Reads a presence flag and, only if it is set, the string that follows it. */
+    @Override
+    public String readNullableString() throws EOFException, IOException {
+        if (!readBoolean()) {
+            return null;
+        }
+        return readString();
+    }
+
+    /**
+     * Skips exactly {@code count} bytes by calling {@link #maybeSkip(long)} repeatedly.
+     *
+     * @throws EOFException if {@code maybeSkip} stops making progress before {@code count} bytes were skipped
+     */
+    @Override
+    public void skipBytes(long count) throws EOFException, IOException {
+        long outstanding = count;
+        while (outstanding > 0) {
+            long skipped = maybeSkip(outstanding);
+            if (skipped <= 0) {
+                // No progress while bytes are still outstanding.
+                throw new EOFException();
+            }
+            outstanding -= skipped;
+        }
+    }
+
+    @Override
+    public <T> T decodeChunked(DecodeAction<Decoder, T> decodeAction) throws EOFException, Exception {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void skipChunked() throws EOFException, IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    protected abstract int maybeReadBytes(byte[] buffer, int offset, int count) throws IOException;
+
+    protected abstract long maybeSkip(long count) throws IOException;
+
+    /** InputStream view that forwards every read and skip to the enclosing decoder. */
+    private class DecoderStream extends InputStream {
+        // Scratch array for single-byte reads.
+        byte[] single = new byte[1];
+
+        @Override
+        public int read() throws IOException {
+            int count = maybeReadBytes(single, 0, 1);
+            // A non-positive result from maybeReadBytes is passed through unchanged.
+            return count > 0 ? (single[0] & 0xff) : count;
+        }
+
+        @Override
+        public int read(byte[] buffer) throws IOException {
+            return maybeReadBytes(buffer, 0, buffer.length);
+        }
+
+        @Override
+        public int read(byte[] buffer, int offset, int count) throws IOException {
+            return maybeReadBytes(buffer, offset, count);
+        }
+
+        @Override
+        public long skip(long n) throws IOException {
+            return maybeSkip(n);
+        }
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/AbstractEncoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/AbstractEncoder.java
@ -0,0 +1,101 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * Base class for {@link Encoder} implementations. It supplies the operations that can be expressed
+ * through other encoder methods and an {@link OutputStream} view that forwards to the encoder's
+ * byte-writing methods. The "small" integer writes default to the plain int/long writes, and
+ * chunked encoding is unsupported unless overridden.
+ */
+public abstract class AbstractEncoder implements Encoder {
+    // Created on first use by getOutputStream().
+    private EncoderStream stream;
+
+    @Override
+    public OutputStream getOutputStream() {
+        EncoderStream current = stream;
+        if (current == null) {
+            current = new EncoderStream();
+            stream = current;
+        }
+        return current;
+    }
+
+    @Override
+    public void writeBytes(byte[] bytes) throws IOException {
+        writeBytes(bytes, 0, bytes.length);
+    }
+
+    @Override
+    public void writeBinary(byte[] bytes) throws IOException {
+        writeBinary(bytes, 0, bytes.length);
+    }
+
+    /** Writes {@code count} as a small int, followed by that many bytes of {@code bytes} from {@code offset}. */
+    @Override
+    public void writeBinary(byte[] bytes, int offset, int count) throws IOException {
+        writeSmallInt(count);
+        writeBytes(bytes, offset, count);
+    }
+
+    @Override
+    public void encodeChunked(EncodeAction<Encoder> writeAction) throws Exception {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void writeSmallInt(int value) throws IOException {
+        writeInt(value);
+    }
+
+    @Override
+    public void writeSmallLong(long value) throws IOException {
+        writeLong(value);
+    }
+
+    /** Writes a presence flag and, only when {@code value} is non-null, the value as a small int. */
+    @Override
+    public void writeNullableSmallInt(@Nullable Integer value) throws IOException {
+        boolean present = value != null;
+        writeBoolean(present);
+        if (present) {
+            writeSmallInt(value);
+        }
+    }
+
+    /** Writes a presence flag and, only when {@code value} is non-null, its string form. */
+    @Override
+    public void writeNullableString(@Nullable CharSequence value) throws IOException {
+        boolean present = value != null;
+        writeBoolean(present);
+        if (present) {
+            writeString(value.toString());
+        }
+    }
+
+    /** OutputStream view that forwards every write to the enclosing encoder. */
+    private class EncoderStream extends OutputStream {
+        @Override
+        public void write(int b) throws IOException {
+            writeByte((byte) b);
+        }
+
+        @Override
+        public void write(byte[] buffer) throws IOException {
+            writeBytes(buffer);
+        }
+
+        @Override
+        public void write(byte[] buffer, int offset, int length) throws IOException {
+            writeBytes(buffer, offset, length);
+        }
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/AbstractSerializer.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/AbstractSerializer.java
@ -0,0 +1,40 @@
+/*
+ * Copyright 2016 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+import com.google.common.base.Objects;
+
+/**
+ * This abstract class provides a sensible default implementation of {@code Serializer} equality: two
+ * serializers are equal when they have exactly the same runtime class. This equality implementation is
+ * required to enable cache instance reuse within the same Gradle runtime, because serializers are used as
+ * cache parameters which need to be compared to determine whether a cache is compatible.
+ */
+public abstract class AbstractSerializer<T> implements Serializer<T> {
+    /**
+     * Returns {@code true} only when {@code obj} is non-null and has exactly this serializer's runtime class.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        return obj != null && Objects.equal(obj.getClass(), getClass());
+    }
+
+    /**
+     * Hash code derived solely from the runtime class, matching {@link #equals(Object)}.
+     */
+    @Override
+    public int hashCode() {
+        return Objects.hashCode(getClass());
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/Cast.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/Cast.java
@ -0,0 +1,79 @@
+/*
+ * Copyright 2012 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+import javax.annotation.Nullable;
+
+/**
+ * Static helpers for casting objects, either checked with a descriptive failure message or unchecked.
+ */
+public abstract class Cast {
+
+    /**
+     * Casts the given object to the given type, providing a better error message than the default.
+     *
+     * The standard {@link Class#cast(Object)} method produces unsatisfactory error messages on some platforms
+     * when it fails. All this method does is provide a better, consistent, error message.
+     *
+     * This should be used whenever there is a chance the cast could fail. If in doubt, use this.
+     *
+     * @param outputType The type to cast the input to
+     * @param object The object to be cast (must not be {@code null})
+     * @param <O> The type to be cast to
+     * @param <I> The type of the object to be cast
+     * @return The input object, cast to the output type
+     * @throws ClassCastException if the object is not an instance of the output type
+     */
+    public static <O, I> O cast(Class<O> outputType, I object) {
+        try {
+            return outputType.cast(object);
+        } catch (ClassCastException e) {
+            // Rethrow with a message naming the object, its actual type and the requested type.
+            throw new ClassCastException(String.format(
+                    "Failed to cast object %s of type %s to target type %s", object, object.getClass().getName(), outputType.getName()
+            ));
+        }
+    }
+
+    /**
+     * Casts the given object to the given type, providing a better error message than the default.
+     *
+     * The standard {@link Class#cast(Object)} method produces unsatisfactory error messages on some platforms
+     * when it fails. All this method does is provide a better, consistent, error message.
+     *
+     * This should be used whenever there is a chance the cast could fail. If in doubt, use this.
+     *
+     * @param outputType The type to cast the input to
+     * @param object The object to be cast
+     * @param <O> The type to be cast to
+     * @param <I> The type of the object to be cast
+     * @return The input object, cast to the output type, or {@code null} if the input was {@code null}
+     */
+    @Nullable
+    public static <O, I> O castNullable(Class<O> outputType, @Nullable I object) {
+        if (object == null) {
+            return null;
+        }
+        return cast(outputType, object);
+    }
+
+    /**
+     * Casts the given, possibly {@code null}, object to the type expected at the call site using an
+     * unchecked cast. Nothing in this method verifies that the object really has that type.
+     */
+    @SuppressWarnings("unchecked")
+    @Nullable
+    public static <T> T uncheckedCast(@Nullable Object object) {
+        return (T) object;
+    }
+
+    /**
+     * Same unchecked cast as {@link #uncheckedCast(Object)}, but declared without {@code @Nullable} on the
+     * parameter and return value. The method itself performs no null check.
+     */
+    @SuppressWarnings("unchecked")
+    public static <T> T uncheckedNonnullCast(Object object) {
+        return (T) object;
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/ClassLoaderObjectInputStream.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/ClassLoaderObjectInputStream.java
@ -0,0 +1,43 @@
+/*
+ * Copyright 2010 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree.serialize;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectStreamClass;
+
+/**
+ * An {@link ObjectInputStream} that resolves deserialized classes through a caller-supplied
+ * {@link ClassLoader} first, and falls back to the default resolution when that loader cannot find the class.
+ */
+public class ClassLoaderObjectInputStream extends ObjectInputStream {
+    private final ClassLoader loader;
+
+    /**
+     * @param in the stream to read serialized objects from
+     * @param loader the class loader tried first when resolving classes
+     * @throws IOException if the {@link ObjectInputStream} constructor fails
+     */
+    public ClassLoaderObjectInputStream(InputStream in, ClassLoader loader) throws IOException {
+        super(in);
+        this.loader = loader;
+    }
+
+    /**
+     * Returns the class loader passed to the constructor.
+     */
+    public ClassLoader getClassLoader() {
+        return loader;
+    }
+
+    @Override
+    protected Class<?> resolveClass(ObjectStreamClass desc) throws IOException, ClassNotFoundException {
+        try {
+            // Look the class up by name in the supplied loader, without initializing it.
+            return Class.forName(desc.getName(), false, loader);
+        } catch (ClassNotFoundException e) {
+            // Not visible to the supplied loader: defer to ObjectInputStream's own resolution.
+            return super.resolveClass(desc);
+        }
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/Decoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/Decoder.java
@ -0,0 +1,140 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+import javax.annotation.Nullable;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Provides a way to decode structured data from a backing byte stream. Implementations may buffer incoming bytes read
+ * from the backing stream prior to decoding.
+ */
+public interface Decoder {
+    /**
+     * Returns an InputStream which can be used to read raw bytes.
+     */
+    InputStream getInputStream();
+
+    /**
+     * Reads a signed 64 bit long value. Can read any value that was written using {@link Encoder#writeLong(long)}.
+     *
+     * @throws EOFException when the end of the byte stream is reached before the long value can be fully read.
+     */
+    long readLong() throws EOFException, IOException;
+
+    /**
+     * Reads a signed 64 bit long value. Can read any value that was written using {@link Encoder#writeSmallLong(long)}.
+     *
+     * @throws EOFException when the end of the byte stream is reached before the long value can be fully read.
+     */
+    long readSmallLong() throws EOFException, IOException;
+
+    /**
+     * Reads a signed 32 bit int value. Can read any value that was written using {@link Encoder#writeInt(int)}.
+     *
+     * @throws EOFException when the end of the byte stream is reached before the int value can be fully read.
+     */
+    int readInt() throws EOFException, IOException;
+
+    /**
+     * Reads a signed 32 bit int value. Can read any value that was written using {@link Encoder#writeSmallInt(int)}.
+     *
+     * @throws EOFException when the end of the byte stream is reached before the int value can be fully read.
+     */
+    int readSmallInt() throws EOFException, IOException;
+
+    /**
+     * Reads a nullable signed 32 bit int value.
+     *
+     * @see #readSmallInt()
+     */
+    @Nullable
+    Integer readNullableSmallInt() throws EOFException, IOException;
+
+    /**
+     * Reads a boolean value. Can read any value that was written using {@link Encoder#writeBoolean(boolean)}.
+     *
+     * @throws EOFException when the end of the byte stream is reached before the boolean value can be fully read.
+     */
+    boolean readBoolean() throws EOFException, IOException;
+
+    /**
+     * Reads a non-null string value. Can read any value that was written using {@link Encoder#writeString(CharSequence)}.
+     *
+     * @throws EOFException when the end of the byte stream is reached before the string can be fully read.
+     */
+    String readString() throws EOFException, IOException;
+
+    /**
+     * Reads a nullable string value. Can read any value that was written using {@link Encoder#writeNullableString(CharSequence)}.
+     *
+     * @throws EOFException when the end of the byte stream is reached before the string can be fully read.
+     */
+    @Nullable
+    String readNullableString() throws EOFException, IOException;
+
+    /**
+     * Reads a byte value. Can read any byte value that was written using one of the raw byte methods on {@link Encoder}, such as {@link Encoder#writeByte(byte)} or {@link Encoder#getOutputStream()}
+     *
+     * @throws EOFException when the end of the byte stream is reached.
+     */
+    byte readByte() throws EOFException, IOException;
+
+    /**
+     * Reads bytes into the given buffer, filling the buffer. Can read any byte values that were written using one of the raw byte methods on {@link Encoder}, such as {@link
+     * Encoder#writeBytes(byte[])} or {@link Encoder#getOutputStream()}
+     *
+     * @throws EOFException when the end of the byte stream is reached before the buffer is full.
+     */
+    void readBytes(byte[] buffer) throws EOFException, IOException;
+
+    /**
+     * Reads the specified number of bytes into the given buffer. Can read any byte values that were written using one of the raw byte methods on {@link Encoder}, such as {@link
+     * Encoder#writeBytes(byte[])} or {@link Encoder#getOutputStream()}
+     *
+     * @throws EOFException when the end of the byte stream is reached before the specified number of bytes were read.
+     */
+    void readBytes(byte[] buffer, int offset, int count) throws EOFException, IOException;
+
+    /**
+     * Reads a byte array. Can read any byte array written using {@link Encoder#writeBinary(byte[])} or {@link Encoder#writeBinary(byte[], int, int)}.
+     *
+     * @throws EOFException when the end of the byte stream is reached before the byte array was fully read.
+     */
+    byte[] readBinary() throws EOFException, IOException;
+
+    /**
+     * Skips the given number of bytes. Can skip over any byte values that were written using one of the raw byte methods on {@link Encoder}.
+     */
+    void skipBytes(long count) throws EOFException, IOException;
+
+    /**
+     * Reads a byte stream written using {@link Encoder#encodeChunked(Encoder.EncodeAction)}.
+     */
+    <T> T decodeChunked(DecodeAction<Decoder, T> decodeAction) throws EOFException, Exception;
+
+    /**
+     * Skips over a byte stream written using {@link Encoder#encodeChunked(Encoder.EncodeAction)}, discarding its content.
+     */
+    void skipChunked() throws EOFException, IOException;
+
+    /**
+     * Callback that reads a value of type {@code OUT} from a source of type {@code IN}.
+     */
+    interface DecodeAction<IN, OUT> {
+        OUT read(IN source) throws Exception;
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/DefaultSerializer.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/DefaultSerializer.java
@ -0,0 +1,73 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree.serialize;
+
+import com.google.common.base.Objects;
+
+import java.io.IOException;
+import java.io.ObjectOutputStream;
+import java.io.StreamCorruptedException;
+
+/**
+ * Serializer that uses standard Java object serialization, resolving classes on read through a
+ * configurable {@link ClassLoader}.
+ */
+public class DefaultSerializer<T> extends AbstractSerializer<T> {
+    private ClassLoader classLoader;
+
+    /**
+     * Creates a serializer that resolves classes with the loader of this serializer's own class.
+     */
+    public DefaultSerializer() {
+        this(null);
+    }
+
+    /**
+     * Creates a serializer using the given loader, or the loader of this serializer's own class when
+     * {@code classLoader} is {@code null}.
+     */
+    public DefaultSerializer(ClassLoader classLoader) {
+        if (classLoader == null) {
+            this.classLoader = getClass().getClassLoader();
+        } else {
+            this.classLoader = classLoader;
+        }
+    }
+
+    public ClassLoader getClassLoader() {
+        return classLoader;
+    }
+
+    /**
+     * Replaces the loader used to resolve classes on read. Unlike the constructor, the value is stored
+     * as given, including {@code null}.
+     */
+    public void setClassLoader(ClassLoader classLoader) {
+        this.classLoader = classLoader;
+    }
+
+    /**
+     * Reads one serialized object from the decoder's input stream.
+     *
+     * @return the deserialized object, or {@code null} when the stream is reported as corrupted
+     */
+    @Override
+    public T read(Decoder decoder) throws Exception {
+        try {
+            ClassLoaderObjectInputStream objectIn = new ClassLoaderObjectInputStream(decoder.getInputStream(), classLoader);
+            return Cast.uncheckedNonnullCast(objectIn.readObject());
+        } catch (StreamCorruptedException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Serializes {@code value} onto the encoder's output stream and flushes the object stream. The
+     * stream is not closed here.
+     */
+    @Override
+    public void write(Encoder encoder, T value) throws IOException {
+        ObjectOutputStream objectOut = new ObjectOutputStream(encoder.getOutputStream());
+        objectOut.writeObject(value);
+        objectOut.flush();
+    }
+
+    /**
+     * Equal when the superclass considers the objects equal and both use an equal class loader.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        // super.equals() only succeeds for a non-null object of this exact class, so the cast is safe.
+        return super.equals(obj)
+                && Objects.equal(classLoader, ((DefaultSerializer<?>) obj).classLoader);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hashCode(super.hashCode(), classLoader);
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/Encoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/Encoder.java
@ -0,0 +1,110 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * Provides a way to encode structured data to a backing byte stream. Implementations may buffer outgoing encoded bytes prior
+ * to writing to the backing byte stream.
+ */
+public interface Encoder {
+    /**
+     * Returns an {@link OutputStream} that can be used to write raw bytes to the stream.
+     */
+    OutputStream getOutputStream();
+
+    /**
+     * Writes a raw byte value to the stream.
+     */
+    void writeByte(byte value) throws IOException;
+
+    /**
+     * Writes the given raw bytes to the stream. Does not encode any length information.
+     */
+    void writeBytes(byte[] bytes) throws IOException;
+
+    /**
+     * Writes the given raw bytes to the stream. Does not encode any length information.
+     */
+    void writeBytes(byte[] bytes, int offset, int count) throws IOException;
+
+    /**
+     * Writes the given byte array to the stream. Encodes the bytes and length information.
+     */
+    void writeBinary(byte[] bytes) throws IOException;
+
+    /**
+     * Writes the given byte array to the stream. Encodes the bytes and length information.
+     */
+    void writeBinary(byte[] bytes, int offset, int count) throws IOException;
+
+    /**
+     * Appends an encoded stream to this stream. Encodes the stream as a series of chunks with length information.
+     */
+    void encodeChunked(EncodeAction<Encoder> writeAction) throws Exception;
+
+    /**
+     * Writes a signed 64 bit long value. The implementation may encode the value as a variable number of bytes, not necessarily as 8 bytes.
+     */
+    void writeLong(long value) throws IOException;
+
+    /**
+     * Writes a signed 64 bit long value whose value is likely to be small and positive but may not be. The implementation may encode the value in a way that is more efficient for small positive
+     * values.
+     */
+    void writeSmallLong(long value) throws IOException;
+
+    /**
+     * Writes a signed 32 bit int value. The implementation may encode the value as a variable number of bytes, not necessarily as 4 bytes.
+     */
+    void writeInt(int value) throws IOException;
+
+    /**
+     * Writes a signed 32 bit int value whose value is likely to be small and positive but may not be. The implementation may encode the value in a way that
+     * is more efficient for small positive values.
+     */
+    void writeSmallInt(int value) throws IOException;
+
+    /**
+     * Writes a nullable signed 32 bit int value whose value is likely to be small and positive but may not be.
+     *
+     * @see #writeSmallInt(int)
+     */
+    void writeNullableSmallInt(@Nullable Integer value) throws IOException;
+
+    /**
+     * Writes a boolean value.
+     */
+    void writeBoolean(boolean value) throws IOException;
+
+    /**
+     * Writes a non-null string value.
+     */
+    void writeString(CharSequence value) throws IOException;
+
+    /**
+     * Writes a nullable string value.
+     */
+    void writeNullableString(@Nullable CharSequence value) throws IOException;
+
+    /**
+     * Callback that writes content to a target of type {@code T}.
+     */
+    interface EncodeAction<T> {
+        void write(T target) throws Exception;
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/FlushableEncoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/FlushableEncoder.java
@ -0,0 +1,31 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+import java.io.Flushable;
+import java.io.IOException;
+
+/**
+ * Represents an {@link Encoder} that buffers encoded data prior to writing to the backing stream.
+ */
+public interface FlushableEncoder extends Encoder, Flushable {
+    /**
+     * Ensures that all buffered data has been written to the backing stream. Does not flush the backing stream.
+     *
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    void flush() throws IOException;
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/ObjectReader.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/ObjectReader.java
@ -0,0 +1,28 @@
+/*
+ * Copyright 2012 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+import java.io.EOFException;
+
+/**
+ * Reads a sequence of objects of type {@code T} from a stream, one object per call.
+ */
+public interface ObjectReader<T> {
+    /**
+     * Reads the next object from the stream.
+     *
+     * @throws EOFException When the next object cannot be fully read due to reaching the end of stream.
+     */
+    T read() throws EOFException, Exception;
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/ObjectWriter.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/ObjectWriter.java
@ -0,0 +1,21 @@
+/*
+ * Copyright 2012 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+/**
+ * Writes objects of type {@code T}, one object per call; the counterpart of {@code ObjectReader}.
+ */
+public interface ObjectWriter<T> {
+    /**
+     * Writes the given object.
+     */
+    void write(T value) throws Exception;
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/Serializer.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/Serializer.java
@ -0,0 +1,33 @@
+/*
+ * Copyright 2009 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree.serialize;
+
+import java.io.EOFException;
+
+/**
+ * Reads and writes objects of type {@code T} using a {@link Decoder} and an {@link Encoder}.
+ */
+public interface Serializer<T> {
+    /**
+     * Reads the next object from the given stream. The implementation must not perform any buffering, so that it reads only those bytes from the input stream that are
+     * required to deserialize the next object.
+     *
+     * @throws EOFException When the next object cannot be fully read due to reaching the end of stream.
+     */
+    T read(Decoder decoder) throws EOFException, Exception;
+
+    /**
+     * Writes the given object to the given stream. The implementation must not perform any buffering.
+     */
+    void write(Encoder encoder, T value) throws Exception;
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/StatefulSerializer.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/StatefulSerializer.java
@ -0,0 +1,33 @@
+/*
+ * Copyright 2012 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize;
+
+/**
+ * Factory for {@link ObjectReader} and {@link ObjectWriter} instances bound to a decoder or an encoder.
+ *
+ * Implementations must allow concurrent reading and writing, so that a thread can read and a thread can write at the same time.
+ * Implementations do not need to support multiple read threads or multiple write threads.
+ */
+public interface StatefulSerializer<T> {
+    /**
+     * Creates a reader for the given decoder. Should not perform any buffering.
+     */
+    ObjectReader<T> newReader(Decoder decoder);
+
+    /**
+     * Creates a writer for the given encoder. Should not perform any buffering.
+     */
+    ObjectWriter<T> newWriter(Encoder encoder);
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/KryoBackedDecoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/KryoBackedDecoder.java
@ -0,0 +1,210 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize.kryo;
+
+import com.esotericsoftware.kryo.KryoException;
+import com.esotericsoftware.kryo.io.Input;
+import seaweedfs.client.btree.serialize.AbstractDecoder;
+import seaweedfs.client.btree.serialize.Decoder;
+
+import java.io.Closeable;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Note that this decoder uses buffering, so will attempt to read beyond the end of the encoded data. This means you should use this type only when this decoder will be used to decode the entire
+ * stream.
+ */
+public class KryoBackedDecoder extends AbstractDecoder implements Decoder, Closeable {
+    private final Input input;
+    private final InputStream inputStream;
+    // Bytes skipped directly on the underlying stream; added to input.total() in getReadPosition().
+    private long extraSkipped;
+    // Lazily created by decodeChunked() and reused for subsequent chunked reads.
+    private KryoBackedDecoder nested;
+
+    /**
+     * Creates a decoder over the given stream with a buffer size of 4096.
+     */
+    public KryoBackedDecoder(InputStream inputStream) {
+        this(inputStream, 4096);
+    }
+
+    /**
+     * Creates a decoder over the given stream, passing {@code bufferSize} to the Kryo {@link Input}.
+     */
+    public KryoBackedDecoder(InputStream inputStream, int bufferSize) {
+        this.inputStream = inputStream;
+        input = new Input(this.inputStream, bufferSize);
+    }
+
+    @Override
+    protected int maybeReadBytes(byte[] buffer, int offset, int count) {
+        return input.read(buffer, offset, count);
+    }
+
+    /**
+     * Skips up to {@code count} bytes and returns the number actually skipped. Buffered bytes are skipped
+     * by moving the buffer position; when nothing is buffered the underlying stream is skipped directly.
+     */
+    @Override
+    protected long maybeSkip(long count) throws IOException {
+        // Work around some bugs in Input.skip()
+        int remaining = input.limit() - input.position();
+        if (remaining == 0) {
+            long skipped = inputStream.skip(count);
+            if (skipped > 0) {
+                extraSkipped += skipped;
+            }
+            return skipped;
+        } else if (count <= remaining) {
+            // count fits in the buffered range, so the narrowing cast to int cannot overflow.
+            input.setPosition(input.position() + (int) count);
+            return count;
+        } else {
+            input.setPosition(input.limit());
+            return remaining;
+        }
+    }
+
+    /**
+     * Translates a Kryo "Buffer underflow." failure into an {@link EOFException} whose cause is the
+     * original exception; any other {@link KryoException} is rethrown unchanged. This method never
+     * returns normally; the declared return type only lets callers write {@code throw maybeEndOfStream(e)}.
+     */
+    private RuntimeException maybeEndOfStream(KryoException e) throws EOFException {
+        // Constant-first comparison: a KryoException may carry no message, and calling equals() on a
+        // null message would replace the real failure with a NullPointerException.
+        if ("Buffer underflow.".equals(e.getMessage())) {
+            throw (EOFException) (new EOFException().initCause(e));
+        }
+        throw e;
+    }
+
+    @Override
+    public byte readByte() throws EOFException {
+        try {
+            return input.readByte();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public void readBytes(byte[] buffer, int offset, int count) throws EOFException {
+        try {
+            input.readBytes(buffer, offset, count);
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public long readLong() throws EOFException {
+        try {
+            return input.readLong();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public long readSmallLong() throws EOFException, IOException {
+        try {
+            return input.readLong(true);
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public int readInt() throws EOFException {
+        try {
+            return input.readInt();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public int readSmallInt() throws EOFException {
+        try {
+            return input.readInt(true);
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public boolean readBoolean() throws EOFException {
+        try {
+            return input.readBoolean();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public String readString() throws EOFException {
+        // Strings and nullable strings are read the same way.
+        return readNullableString();
+    }
+
+    @Override
+    public String readNullableString() throws EOFException {
+        try {
+            return input.readString();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    /**
+     * Skips a chunked stream: repeatedly reads a chunk length and skips that many bytes, stopping at
+     * the first zero length.
+     */
+    @Override
+    public void skipChunked() throws EOFException, IOException {
+        while (true) {
+            int count = readSmallInt();
+            if (count == 0) {
+                break;
+            }
+            skipBytes(count);
+        }
+    }
+
+    /**
+     * Runs {@code decodeAction} against a nested decoder whose input is the sequence of length-prefixed
+     * chunks read from this decoder, then requires a zero length marking the end of the nested stream.
+     *
+     * @throws IllegalStateException if the value read after the action completes is not zero
+     */
+    @Override
+    public <T> T decodeChunked(DecodeAction<Decoder, T> decodeAction) throws EOFException, Exception {
+        if (nested == null) {
+            nested = new KryoBackedDecoder(new InputStream() {
+                @Override
+                public int read() throws IOException {
+                    throw new UnsupportedOperationException();
+                }
+
+                @Override
+                public int read(byte[] buffer, int offset, int length) throws IOException {
+                    int count = readSmallInt();
+                    if (count == 0) {
+                        // End of stream has been reached
+                        return -1;
+                    }
+                    if (count > length) {
+                        // For now, assume same size buffers used to read and write
+                        throw new UnsupportedOperationException();
+                    }
+                    readBytes(buffer, offset, count);
+                    return count;
+                }
+            });
+        }
+        T value = decodeAction.read(nested);
+        if (readSmallInt() != 0) {
+            throw new IllegalStateException("Expecting the end of nested stream.");
+        }
+        return value;
+    }
+
+    /**
+     * Returns the total number of bytes consumed by this decoder. Some additional bytes may also be buffered by this decoder but have not been consumed.
+     */
+    public long getReadPosition() {
+        return input.total() + extraSkipped;
+    }
+
+    /**
+     * Closes the Kryo {@link Input}.
+     */
+    @Override
+    public void close() throws IOException {
+        input.close();
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/KryoBackedEncoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/KryoBackedEncoder.java
@ -0,0 +1,134 @@
+/*
+ * Copyright 2013 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize.kryo;
+
+import com.esotericsoftware.kryo.io.Output;
+import seaweedfs.client.btree.serialize.AbstractEncoder;
+import seaweedfs.client.btree.serialize.Encoder;
+import seaweedfs.client.btree.serialize.FlushableEncoder;
+
+import javax.annotation.Nullable;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * An encoder that writes primitive values, strings and chunked sections through a buffering Kryo {@link Output}.
+ *
+ * <p>Writes are buffered, so call {@link #flush()} or {@link #close()} to push them to the wrapped stream.
+ */
+public class KryoBackedEncoder extends AbstractEncoder implements FlushableEncoder, Closeable {
+    private final Output output;
+    // Lazily created by encodeChunked() and reused by later calls.
+    private KryoBackedEncoder nested;
+
+    /**
+     * Creates an encoder over {@code outputStream} with a 4096 byte buffer.
+     */
+    public KryoBackedEncoder(OutputStream outputStream) {
+        this(outputStream, 4096);
+    }
+
+    /**
+     * Creates an encoder over {@code outputStream} with a buffer of {@code bufferSize} bytes.
+     */
+    public KryoBackedEncoder(OutputStream outputStream, int bufferSize) {
+        output = new Output(outputStream, bufferSize);
+    }
+
+    @Override
+    public void writeByte(byte value) {
+        output.writeByte(value);
+    }
+
+    @Override
+    public void writeBytes(byte[] bytes, int offset, int count) {
+        output.writeBytes(bytes, offset, count);
+    }
+
+    @Override
+    public void writeLong(long value) {
+        output.writeLong(value);
+    }
+
+    @Override
+    public void writeSmallLong(long value) {
+        // Second argument is Kryo's optimizePositive flag (variable-length form); see the Kryo Output docs.
+        output.writeLong(value, true);
+    }
+
+    @Override
+    public void writeInt(int value) {
+        output.writeInt(value);
+    }
+
+    @Override
+    public void writeSmallInt(int value) {
+        // Second argument is Kryo's optimizePositive flag (variable-length form); see the Kryo Output docs.
+        output.writeInt(value, true);
+    }
+
+    @Override
+    public void writeBoolean(boolean value) {
+        output.writeBoolean(value);
+    }
+
+    /**
+     * Writes a non-null string.
+     *
+     * @throws IllegalArgumentException if {@code value} is null
+     */
+    @Override
+    public void writeString(CharSequence value) {
+        if (value == null) {
+            throw new IllegalArgumentException("Cannot encode a null string.");
+        }
+        output.writeString(value);
+    }
+
+    @Override
+    public void writeNullableString(@Nullable CharSequence value) {
+        output.writeString(value);
+    }
+
+    /**
+     * Writes a chunked section: whatever {@code writeAction} writes to the nested encoder is emitted on this
+     * encoder as chunks (small-int length followed by the bytes), then terminated with a zero length.
+     *
+     * @param writeAction the action that writes the section's content to the nested encoder
+     */
+    @Override
+    public void encodeChunked(EncodeAction<Encoder> writeAction) throws Exception {
+        if (nested == null) {
+            nested = new KryoBackedEncoder(new OutputStream() {
+                @Override
+                public void write(byte[] buffer, int offset, int length) {
+                    if (length == 0) {
+                        // A zero length is reserved as the end-of-section marker, so never emit an empty chunk.
+                        return;
+                    }
+                    writeSmallInt(length);
+                    writeBytes(buffer, offset, length);
+                }
+
+                @Override
+                public void write(byte[] buffer) throws IOException {
+                    write(buffer, 0, buffer.length);
+                }
+
+                @Override
+                public void write(int b) {
+                    // Single-byte writes are not supported; only the bulk writes above are implemented.
+                    throw new UnsupportedOperationException();
+                }
+            });
+        }
+        writeAction.write(nested);
+        // Flush the nested buffer so all chunks are emitted before the terminator.
+        nested.flush();
+        writeSmallInt(0);
+    }
+
+    /**
+     * Returns the total number of bytes written by this encoder, some of which may still be buffered.
+     */
+    public long getWritePosition() {
+        return output.total();
+    }
+
+    @Override
+    public void flush() {
+        output.flush();
+    }
+
+    @Override
+    public void close() {
+        output.close();
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/StringDeduplicatingKryoBackedDecoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/StringDeduplicatingKryoBackedDecoder.java
@ -0,0 +1,188 @@
+/*
+ * Copyright 2018 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize.kryo;
+
+import com.esotericsoftware.kryo.KryoException;
+import com.esotericsoftware.kryo.io.Input;
+import seaweedfs.client.btree.serialize.AbstractDecoder;
+import seaweedfs.client.btree.serialize.Decoder;
+
+import java.io.Closeable;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Note that this decoder uses buffering, so will attempt to read beyond the end of the encoded data. This means you should use this type only when this decoder will be used to decode the entire
+ * stream.
+ */
+public class StringDeduplicatingKryoBackedDecoder extends AbstractDecoder implements Decoder, Closeable {
+    public static final int INITIAL_CAPACITY = 32;
+    // Message this class treats as "ran out of input" when it appears on a KryoException.
+    private static final String BUFFER_UNDERFLOW_MESSAGE = "Buffer underflow.";
+    private final Input input;
+    private final InputStream inputStream;
+    // Table of strings already read, indexed by the int that precedes each string in the stream.
+    private String[] strings;
+    // Bytes skipped directly on the underlying stream, which input.total() does not account for.
+    private long extraSkipped;
+
+    /**
+     * Creates a decoder over {@code inputStream} with a 4096 byte buffer.
+     */
+    public StringDeduplicatingKryoBackedDecoder(InputStream inputStream) {
+        this(inputStream, 4096);
+    }
+
+    /**
+     * Creates a decoder over {@code inputStream} with a buffer of {@code bufferSize} bytes.
+     */
+    public StringDeduplicatingKryoBackedDecoder(InputStream inputStream, int bufferSize) {
+        this.inputStream = inputStream;
+        input = new Input(this.inputStream, bufferSize);
+    }
+
+    @Override
+    protected int maybeReadBytes(byte[] buffer, int offset, int count) {
+        return input.read(buffer, offset, count);
+    }
+
+    @Override
+    protected long maybeSkip(long count) throws IOException {
+        // Work around some bugs in Input.skip()
+        int remaining = input.limit() - input.position();
+        if (remaining == 0) {
+            // Nothing buffered: skip on the underlying stream and track it separately.
+            long skipped = inputStream.skip(count);
+            if (skipped > 0) {
+                extraSkipped += skipped;
+            }
+            return skipped;
+        } else if (count <= remaining) {
+            input.setPosition(input.position() + (int) count);
+            return count;
+        } else {
+            // Only part of the request is buffered: consume the buffer and report the partial skip.
+            input.setPosition(input.limit());
+            return remaining;
+        }
+    }
+
+    /**
+     * Translates a Kryo buffer underflow into an {@link EOFException}; any other failure is rethrown as is.
+     *
+     * <p>This method never returns normally. The return type only lets callers write {@code throw maybeEndOfStream(e)}.
+     */
+    private RuntimeException maybeEndOfStream(KryoException e) throws EOFException {
+        // Constant-first comparison: a KryoException without a message must be rethrown, not turned into an NPE.
+        if (BUFFER_UNDERFLOW_MESSAGE.equals(e.getMessage())) {
+            throw (EOFException) (new EOFException().initCause(e));
+        }
+        throw e;
+    }
+
+    @Override
+    public byte readByte() throws EOFException {
+        try {
+            return input.readByte();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public void readBytes(byte[] buffer, int offset, int count) throws EOFException {
+        try {
+            input.readBytes(buffer, offset, count);
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public long readLong() throws EOFException {
+        try {
+            return input.readLong();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public long readSmallLong() throws EOFException, IOException {
+        try {
+            return input.readLong(true);
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public int readInt() throws EOFException {
+        try {
+            return input.readInt();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public int readSmallInt() throws EOFException {
+        try {
+            return input.readInt(true);
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public boolean readBoolean() throws EOFException {
+        try {
+            return input.readBoolean();
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    @Override
+    public String readString() throws EOFException {
+        return readNullableString();
+    }
+
+    /**
+     * Reads a possibly-null, de-duplicated string.
+     *
+     * <p>The stream holds an int index first; {@code -1} means null. If the index has no string recorded yet,
+     * the string's text follows in the stream and is remembered under that index. Otherwise the remembered
+     * string is returned and nothing more is read.
+     *
+     * @return the decoded string, or {@code null} if a null was encoded
+     * @throws EOFException if the input ends before the value is complete
+     */
+    @Override
+    public String readNullableString() throws EOFException {
+        try {
+            int idx = readInt();
+            if (idx == -1) {
+                return null;
+            }
+            if (strings == null) {
+                strings = new String[INITIAL_CAPACITY];
+            }
+            String string = null;
+            if (idx >= strings.length) {
+                String[] grow = new String[strings.length * 3 / 2];
+                System.arraycopy(strings, 0, grow, 0, strings.length);
+                strings = grow;
+            } else {
+                string = strings[idx];
+            }
+            if (string == null) {
+                string = input.readString();
+                strings[idx] = string;
+            }
+            return string;
+        } catch (KryoException e) {
+            throw maybeEndOfStream(e);
+        }
+    }
+
+    /**
+     * Returns the total number of bytes consumed by this decoder. Some additional bytes may also be buffered by this decoder but have not been consumed.
+     */
+    public long getReadPosition() {
+        return input.total() + extraSkipped;
+    }
+
+    /**
+     * Drops the string table and closes the Kryo input.
+     */
+    @Override
+    public void close() throws IOException {
+        strings = null;
+        input.close();
+    }
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/StringDeduplicatingKryoBackedEncoder.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/StringDeduplicatingKryoBackedEncoder.java
@ -0,0 +1,128 @@
+/*
+ * Copyright 2018 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize.kryo;
+
+import com.esotericsoftware.kryo.io.Output;
+import com.google.common.collect.Maps;
+import seaweedfs.client.btree.serialize.AbstractEncoder;
+import seaweedfs.client.btree.serialize.FlushableEncoder;
+
+import javax.annotation.Nullable;
+import java.io.Closeable;
+import java.io.OutputStream;
+import java.util.Map;
+
+/**
+ * An encoder backed by a Kryo {@link Output} that writes each distinct string only once.
+ *
+ * <p>Every string is written as an int index. The first time a string is seen, its text follows the index;
+ * later occurrences write the index alone. A null string is written as index {@code -1}.
+ */
+public class StringDeduplicatingKryoBackedEncoder extends AbstractEncoder implements FlushableEncoder, Closeable {
+    // Index assigned to each string written so far; created on first use and dropped by done().
+    private Map<String, Integer> strings;
+
+    private final Output output;
+
+    /**
+     * Creates an encoder over {@code outputStream} with a 4096 byte buffer.
+     */
+    public StringDeduplicatingKryoBackedEncoder(OutputStream outputStream) {
+        this(outputStream, 4096);
+    }
+
+    /**
+     * Creates an encoder over {@code outputStream} with a buffer of {@code bufferSize} bytes.
+     */
+    public StringDeduplicatingKryoBackedEncoder(OutputStream outputStream, int bufferSize) {
+        output = new Output(outputStream, bufferSize);
+    }
+
+    @Override
+    public void writeByte(byte value) {
+        output.writeByte(value);
+    }
+
+    @Override
+    public void writeBytes(byte[] bytes, int offset, int count) {
+        output.writeBytes(bytes, offset, count);
+    }
+
+    @Override
+    public void writeLong(long value) {
+        output.writeLong(value);
+    }
+
+    @Override
+    public void writeSmallLong(long value) {
+        output.writeLong(value, true);
+    }
+
+    @Override
+    public void writeInt(int value) {
+        output.writeInt(value);
+    }
+
+    @Override
+    public void writeSmallInt(int value) {
+        output.writeInt(value, true);
+    }
+
+    @Override
+    public void writeBoolean(boolean value) {
+        output.writeBoolean(value);
+    }
+
+    /**
+     * Writes a non-null string, de-duplicated like any other string.
+     *
+     * @throws IllegalArgumentException if {@code value} is null
+     */
+    @Override
+    public void writeString(CharSequence value) {
+        if (value == null) {
+            throw new IllegalArgumentException("Cannot encode a null string.");
+        }
+        writeNullableString(value);
+    }
+
+    /**
+     * Writes a string that may be null.
+     *
+     * <p>Null is written as {@code -1}. A string not seen before gets the next free index and is written as
+     * that index followed by its text. A string seen before is written as its index only.
+     */
+    @Override
+    public void writeNullableString(@Nullable CharSequence value) {
+        if (value == null) {
+            output.writeInt(-1);
+            return;
+        }
+        if (strings == null) {
+            strings = Maps.newHashMapWithExpectedSize(1024);
+        }
+
+        final String text = value.toString();
+        final Integer known = strings.get(text);
+        if (known != null) {
+            output.writeInt(known);
+            return;
+        }
+
+        // First occurrence: the next index is the number of strings recorded so far.
+        final Integer assigned = strings.size();
+        output.writeInt(assigned);
+        strings.put(text, assigned);
+        output.writeString(text);
+    }
+
+    /**
+     * Reports the number of bytes written through this encoder; part of them may still be in the buffer.
+     */
+    public long getWritePosition() {
+        return output.total();
+    }
+
+    @Override
+    public void flush() {
+        output.flush();
+    }
+
+    @Override
+    public void close() {
+        output.close();
+    }
+
+    /**
+     * Discards the table of strings written so far.
+     */
+    public void done() {
+        strings = null;
+    }
+
+}
--- a/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/TypeSafeSerializer.java
+++ b/test/random_access/src/main/java/seaweedfs/client/btree/serialize/kryo/TypeSafeSerializer.java
@ -0,0 +1,51 @@
+/*
+ * Copyright 2012 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package seaweedfs.client.btree.serialize.kryo;
+
+import seaweedfs.client.btree.serialize.*;
+
+/**
+ * Adapts a {@code StatefulSerializer<T>} to a {@code StatefulSerializer<Object>}.
+ *
+ * <p>Values handed to the writer are cast to {@code T} with {@link Class#cast(Object)} before they reach the
+ * wrapped serializer, so a value of the wrong type fails with a {@link ClassCastException}.
+ */
+public class TypeSafeSerializer<T> implements StatefulSerializer<Object> {
+    private final Class<T> type;
+    private final StatefulSerializer<T> serializer;
+
+    /**
+     * @param type the class that written values are cast to
+     * @param serializer the typed serializer that does the actual reading and writing
+     */
+    public TypeSafeSerializer(Class<T> type, StatefulSerializer<T> serializer) {
+        this.type = type;
+        this.serializer = serializer;
+    }
+
+    /**
+     * Returns a reader that hands back whatever the wrapped serializer's reader produces.
+     */
+    @Override
+    public ObjectReader<Object> newReader(Decoder decoder) {
+        final ObjectReader<T> typedReader = serializer.newReader(decoder);
+        return new ObjectReader<Object>() {
+            @Override
+            public Object read() throws Exception {
+                return typedReader.read();
+            }
+        };
+    }
+
+    /**
+     * Returns a writer that casts each value to {@code T} and passes it to the wrapped serializer's writer.
+     */
+    @Override
+    public ObjectWriter<Object> newWriter(Encoder encoder) {
+        final ObjectWriter<T> typedWriter = serializer.newWriter(encoder);
+        return new ObjectWriter<Object>() {
+            @Override
+            public void write(Object value) throws Exception {
+                final T typedValue = type.cast(value);
+                typedWriter.write(typedValue);
+            }
+        };
+    }
+}
--- a/test/random_access/src/test/java/seaewedfs/mmap/MmapFileTest.java
+++ b/test/random_access/src/test/java/seaewedfs/mmap/MmapFileTest.java
@ -0,0 +1,143 @@
+package seaewedfs.mmap;
+
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+
+/**
+ * Manual tests that memory-map files in a working directory.
+ *
+ * <p>The directory defaults to the original developer location and can be overridden with
+ * {@code -Dseaweedfs.test.mmap.dir=...}. Tests are skipped when the directory does not exist.
+ * NOTE(review): the log excerpts below suggest the directory is a SeaweedFS mount; confirm.
+ */
+public class MmapFileTest {
+
+    File dir = new File(System.getProperty("seaweedfs.test.mmap.dir", "/Users/chris/tmp/mm/dev"));
+
+    /**
+     * Skips the calling test (rather than failing it) when the working directory is not available.
+     */
+    private void assumeDirAvailable() {
+        org.junit.Assume.assumeTrue(dir.isDirectory());
+    }
+
+    /**
+     * Maps a file read-only, closes the channel and the file, then overwrites the file through a stream.
+     *
+     * <p>Failures now propagate so the test can actually fail; previously every exception was caught and printed.
+     */
+    @Test
+    public void testMmap() throws IOException {
+        assumeDirAvailable();
+        System.out.println("starting ...");
+
+        File f = new File(dir, "mmap_file.txt");
+        // Kept in a local, as in the original, so the mapping is still referenced during the write below.
+        MappedByteBuffer mbf;
+        try (RandomAccessFile raf = new RandomAccessFile(f, "rw");
+             FileChannel fc = raf.getChannel()) {
+            mbf = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
+        }
+
+        try (FileOutputStream fos = new FileOutputStream(f)) {
+            fos.write("abcdefg".getBytes());
+        }
+        System.out.println("completed!");
+    }
+
+    /**
+     * Maps 4096 * 8 * 8 bytes of a file read-write and writes a short string at the start of the mapping.
+     */
+    @Test
+    public void testBigMmap() throws IOException {
+        assumeDirAvailable();
+        /*
+
+// new file
+I0817 09:48:02 25175 dir.go:147] create /dev/mmap_big.txt: OpenReadWrite+OpenCreate
+I0817 09:48:02 25175 wfs.go:116] AcquireHandle /dev/mmap_big.txt uid=502 gid=20
+I0817 09:48:02 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 0
+I0817 09:48:02 25175 meta_cache_subscribe.go:32] creating /dev/mmap_big.txt
+
+//get channel
+I0817 09:48:26 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 0
+
+I0817 09:48:32 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 0
+I0817 09:48:32 25175 wfs.go:116] AcquireHandle /dev/mmap_big.txt uid=0 gid=0
+I0817 09:48:32 25175 filehandle.go:160] Release /dev/mmap_big.txt fh 14968871991130164560
+
+//fileChannel.map
+I0817 09:49:18 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 0
+I0817 09:49:18 25175 file.go:112] /dev/mmap_big.txt file setattr set size=262144 chunks=0
+I0817 09:49:18 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 262144
+I0817 09:49:18 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 262144
+I0817 09:49:18 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 262144
+
+// buffer.put
+I0817 09:49:49 25175 filehandle.go:57] /dev/mmap_big.txt read fh 14968871991130164560: [0,32768) size 32768 resp.Data len=0 cap=32768
+I0817 09:49:49 25175 reader_at.go:113] zero2 [0,32768)
+I0817 09:49:50 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 262144
+
+I0817 09:49:53 25175 file.go:233] /dev/mmap_big.txt fsync file Fsync [ID=0x4 Node=0xe Uid=0 Gid=0 Pid=0] Handle 0x2 Flags 1
+
+//close
+I0817 09:50:14 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 262144
+I0817 09:50:14 25175 dirty_page.go:130] saveToStorage /dev/mmap_big.txt 1,315b69812039e5 [0,4096) of 262144 bytes
+I0817 09:50:14 25175 file.go:274] /dev/mmap_big.txt existing 0 chunks adds 1 more
+I0817 09:50:14 25175 filehandle.go:218] /dev/mmap_big.txt set chunks: 1
+I0817 09:50:14 25175 filehandle.go:220] /dev/mmap_big.txt chunks 0: 1,315b69812039e5 [0,4096)
+I0817 09:50:14 25175 meta_cache_subscribe.go:23] deleting /dev/mmap_big.txt
+I0817 09:50:14 25175 meta_cache_subscribe.go:32] creating /dev/mmap_big.txt
+
+// end of test
+I0817 09:50:41 25175 file.go:62] file Attr /dev/mmap_big.txt, open:1, size: 262144
+I0817 09:50:41 25175 filehandle.go:160] Release /dev/mmap_big.txt fh 14968871991130164560
+
+         */
+        // Create file object
+        File file = new File(dir, "mmap_big.txt");
+
+        try (RandomAccessFile randomAccessFile = new RandomAccessFile(file, "rw")) {
+            // Get file channel in read-write mode
+            FileChannel fileChannel = randomAccessFile.getChannel();
+
+            // Get direct byte buffer access using channel.map() operation
+            MappedByteBuffer buffer = fileChannel.map(FileChannel.MapMode.READ_WRITE, 0, 4096 * 8 * 8);
+
+            //Write the content using put methods
+            buffer.put("howtodoinjava.com".getBytes());
+        }
+
+/*
+> meta.cat /dev/mmap_big.txt
+{
+  "name": "mmap_big.txt",
+  "isDirectory": false,
+  "chunks": [
+    {
+      "fileId": "1,315b69812039e5",
+      "offset": "0",
+      "size": "4096",
+      "mtime": "1597683014026365000",
+      "eTag": "985ab0ac",
+      "sourceFileId": "",
+      "fid": {
+        "volumeId": 1,
+        "fileKey": "3234665",
+        "cookie": 2166372837
+      },
+      "sourceFid": null,
+      "cipherKey": null,
+      "isCompressed": true,
+      "isChunkManifest": false
+    }
+  ],
+  "attributes": {
+    "fileSize": "262144",
+    "mtime": "1597683014",
+    "fileMode": 420,
+    "uid": 502,
+    "gid": 20,
+    "crtime": "1597682882",
+    "mime": "application/octet-stream",
+    "replication": "",
+    "collection": "",
+    "ttlSec": 0,
+    "userName": "",
+    "groupName": [
+    ],
+    "symlinkTarget": "",
+    "md5": null
+  },
+  "extended": {
+  }
+}
+ */
+
+    }
+}
--- a/test/random_access/src/test/java/seaweedfs/client/btree/BTreePersistentIndexedCacheTest.java
+++ b/test/random_access/src/test/java/seaweedfs/client/btree/BTreePersistentIndexedCacheTest.java
@ -0,0 +1,476 @@
+/*
+ * Copyright 2010 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package seaweedfs.client.btree;
+
+import seaweedfs.client.btree.serialize.DefaultSerializer;
+import seaweedfs.client.btree.serialize.Serializer;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.hamcrest.CoreMatchers.*;
+import static org.junit.Assert.assertNull;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+public class BTreePersistentIndexedCacheTest {
+    private final Serializer<String> stringSerializer = new DefaultSerializer<String>();
+    private final Serializer<Integer> integerSerializer = new DefaultSerializer<Integer>();
+    private BTreePersistentIndexedCache<String, Integer> cache;
+    private File cacheFile;
+
+    /** Points {@code cacheFile} at {@code cache.bin} in the test directory before each test. */
+    @Before
+    public void setup() {
+        cacheFile = tmpDirFile("cache.bin");
+    }
+
+    /**
+     * Resolves {@code filename} inside the test working directory, creating the directory if it is missing.
+     *
+     * <p>The directory defaults to the original developer location. Override it with
+     * {@code -Dseaweedfs.test.btree.dir=...} (for example {@code /tmp/btree_test}) to run elsewhere.
+     *
+     * @param filename name of the file within the working directory
+     * @return the file handle; the file itself is not created here
+     */
+    public File tmpDirFile(String filename) {
+        File f = new File(System.getProperty("seaweedfs.test.btree.dir", "/Users/chris/tmp/mm/dev/btree_test"));
+        // File f = new File("/tmp/btree_test");
+        f.mkdirs();
+        return new File(f, filename);
+    }
+
+    /**
+     * Opens the shared String-to-Integer cache on {@code cacheFile}.
+     * NOTE(review): the meaning of the trailing {@code (short) 4, 100} arguments is defined by the
+     * BTreePersistentIndexedCache constructor, which is not visible here.
+     */
+    private void createCache() {
+        cache = new BTreePersistentIndexedCache<String, Integer>(cacheFile, stringSerializer, integerSerializer, (short) 4, 100);
+    }
+
+    /** Runs the cache's own {@code verify()} check and then closes it. */
+    private void verifyAndCloseCache() {
+        cache.verify();
+        cache.close();
+    }
+
+    /** Looking up a key that was never put returns null. */
+    @Test
+    public void getReturnsNullWhenEntryDoesNotExist() {
+        createCache();
+        assertNull(cache.get("unknown"));
+        verifyAndCloseCache();
+    }
+
+    /** Adds the values 1 to 5 in ascending order through {@code checkAdds}, then verifies the cache. */
+    @Test
+    public void persistsAddedEntries() {
+        createCache();
+        checkAdds(1, 2, 3, 4, 5);
+        verifyAndCloseCache();
+    }
+
+    /** Adds the values 5 down to 1 through {@code checkAdds}, then verifies the cache. */
+    @Test
+    public void persistsAddedEntriesInReverseOrder() {
+        createCache();
+        checkAdds(5, 4, 3, 2, 1);
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Adds twelve values in shuffled order through {@code checkAdds}, then verifies the cache.
+     * Per the test name, twelve entries are expected to span more than one index block.
+     */
+    @Test
+    public void persistsAddedEntriesOverMultipleIndexBlocks() {
+        createCache();
+        checkAdds(3, 2, 11, 5, 7, 1, 10, 8, 9, 4, 6, 0);
+        verifyAndCloseCache();
+    }
+
+    /** Runs {@code checkUpdates} over twelve shuffled values, then verifies the cache. */
+    @Test
+    public void persistsUpdates() {
+        createCache();
+        checkUpdates(3, 2, 11, 5, 7, 1, 10, 8, 9, 4, 6, 0);
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Repeatedly overwrites twelve keys with list values that get shorter on every round (20 elements
+     * down to 2), checking all entries after each round and again after {@code reset()}.
+     */
+    @Test
+    public void handlesUpdatesWhenBlockSizeDecreases() {
+        BTreePersistentIndexedCache<String, List<Integer>> cache =
+                new BTreePersistentIndexedCache<String, List<Integer>>(
+                        tmpDirFile("listcache.bin"), stringSerializer,
+                        new DefaultSerializer<List<Integer>>(), (short) 4, 100);
+
+        List<Integer> values = Arrays.asList(3, 2, 11, 5, 7, 1, 10, 8, 9, 4, 6, 0);
+        Map<Integer, List<Integer>> updated = new LinkedHashMap<Integer, List<Integer>>();
+
+        // i counts down, so each round stores a shorter list (i * 2 elements) under every key.
+        for (int i = 10; i > 0; i--) {
+            for (Integer value : values) {
+                String key = String.format("key_%d", value);
+                List<Integer> newValue = new ArrayList<Integer>(i);
+                for (int j = 0; j < i * 2; j++) {
+                    newValue.add(j);
+                }
+                cache.put(key, newValue);
+                updated.put(value, newValue);
+            }
+
+            checkListEntries(cache, updated);
+        }
+
+        cache.reset();
+
+        // The latest values must still be readable after the reset.
+        checkListEntries(cache, updated);
+
+        cache.verify();
+        cache.close();
+    }
+
+    /**
+     * Asserts that, for every id in {@code updated}, the cache holds the expected list under {@code key_<id>}.
+     */
+    private void checkListEntries(BTreePersistentIndexedCache<String, List<Integer>> cache, Map<Integer, List<Integer>> updated) {
+        for (Integer id : updated.keySet()) {
+            List<Integer> expected = updated.get(id);
+            List<Integer> actual = cache.get(String.format("key_%d", id));
+            assertThat(actual, equalTo(expected));
+        }
+    }
+
+    /**
+     * Repeatedly overwrites twelve keys with list values that get longer on every round (2 elements
+     * up to 18), checking all entries after each round and again after {@code reset()}.
+     */
+    @Test
+    public void handlesUpdatesWhenBlockSizeIncreases() {
+        BTreePersistentIndexedCache<String, List<Integer>> cache =
+                new BTreePersistentIndexedCache<String, List<Integer>>(
+                        tmpDirFile("listcache.bin"), stringSerializer,
+                        new DefaultSerializer<List<Integer>>(), (short) 4, 100);
+
+        List<Integer> values = Arrays.asList(3, 2, 11, 5, 7, 1, 10, 8, 9, 4, 6, 0);
+        Map<Integer, List<Integer>> updated = new LinkedHashMap<Integer, List<Integer>>();
+
+        // i counts up, so each round stores a longer list (i * 2 elements) under every key.
+        for (int i = 1; i < 10; i++) {
+            for (Integer value : values) {
+                String key = String.format("key_%d", value);
+                List<Integer> newValue = new ArrayList<Integer>(i);
+                for (int j = 0; j < i * 2; j++) {
+                    newValue.add(j);
+                }
+                cache.put(key, newValue);
+                updated.put(value, newValue);
+            }
+
+            checkListEntries(cache, updated);
+        }
+
+        cache.reset();
+
+        // The latest values must still be readable after the reset.
+        checkListEntries(cache, updated);
+
+        cache.verify();
+        cache.close();
+    }
+
+    /** Adds four entries, calls {@code reset()}, adds four more, then verifies the cache. */
+    @Test
+    public void persistsAddedEntriesAfterReopen() {
+        createCache();
+
+        checkAdds(1, 2, 3, 4);
+
+        cache.reset();
+
+        checkAdds(5, 6, 7, 8);
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Puts key_1..key_5, rewrites key_1 with the same value and key_4 with a new one, and checks that all
+     * five entries read back correctly both before and after {@code reset()}.
+     */
+    @Test
+    public void persistsReplacedEntries() {
+        createCache();
+
+        for (int n = 1; n <= 5; n++) {
+            cache.put("key_" + n, n);
+        }
+
+        cache.put("key_1", 1);
+        cache.put("key_4", 12);
+
+        // Expected value for key_1..key_5; only key_4 differs from its original value.
+        final int[] expected = {1, 2, 3, 12, 5};
+        for (int round = 0; round < 2; round++) {
+            if (round == 1) {
+                // Second round repeats the same checks after the reset.
+                cache.reset();
+            }
+            for (int n = 1; n <= expected.length; n++) {
+                assertThat(cache.get("key_" + n), equalTo(expected[n - 1]));
+            }
+        }
+
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Checks that replacing a value with one of the same size, or removing an entry and adding another of
+     * the same size, leaves the cache file length unchanged.
+     */
+    @Test
+    public void reusesEmptySpaceWhenPuttingEntries() {
+        BTreePersistentIndexedCache<String, String> cache = new BTreePersistentIndexedCache<String, String>(cacheFile, stringSerializer, stringSerializer, (short) 4, 100);
+
+        // A non-empty file means it survived from an earlier run; print what it holds for key_new.
+        long beforeLen = cacheFile.length();
+        if (beforeLen>0){
+            System.out.println(String.format("cache %s: %s", "key_new", cache.get("key_new")));
+        }
+
+        cache.put("key_1", "abcd");
+        cache.put("key_2", "abcd");
+        cache.put("key_3", "abcd");
+        cache.put("key_4", "abcd");
+        cache.put("key_5", "abcd");
+
+        long len = cacheFile.length();
+        assertTrue(len > 0L);
+
+        System.out.println(String.format("cache file size %d => %d", beforeLen, len));
+
+        // Same-size replacement must not grow the file.
+        cache.put("key_1", "1234");
+        assertThat(cacheFile.length(), equalTo(len));
+
+        // Remove one entry and add another of the same size: the file must not grow.
+        cache.remove("key_1");
+        cache.put("key_new", "a1b2");
+        assertThat(cacheFile.length(), equalTo(len));
+
+        // A longer value: the growth assertion is disabled below, so only the new length is recorded.
+        cache.put("key_new", "longer value assertThat(cacheFile.length(), equalTo(len))");
+        System.out.println(String.format("cache file size %d beforeLen %d", cacheFile.length(), len));
+        // assertTrue(cacheFile.length() > len);
+        len = cacheFile.length();
+
+        cache.put("key_1", "1234");
+        assertThat(cacheFile.length(), equalTo(len));
+
+        cache.close();
+    }
+
+    /**
+     * Adds and removes 2000 entries twice (the second time with a reverse-order comparator), adds them once
+     * more, and bounds how much the cache file may grow relative to its length after the first pass.
+     */
+    @Test
+    public void canHandleLargeNumberOfEntries() {
+        createCache();
+        int count = 2000;
+        List<Integer> values = new ArrayList<Integer>();
+        for (int i = 0; i < count; i++) {
+            values.add(i);
+        }
+
+        checkAddsAndRemoves(null, values);
+
+        // Baseline file length after the first add/remove pass.
+        long len = cacheFile.length();
+
+        checkAddsAndRemoves(Collections.reverseOrder(), values);
+
+        // need to make this better
+        assertTrue(cacheFile.length() < (long)(1.4 * len));
+
+        checkAdds(values);
+
+        // need to make this better
+        assertTrue(cacheFile.length() < (long) (1.4 * 1.4 * len));
+
+        cache.close();
+    }
+
+    /** Runs {@code checkAddsAndRemoves} over the values 1 to 5, then verifies the cache. */
+    @Test
+    public void persistsRemovalOfEntries() {
+        createCache();
+        checkAddsAndRemoves(1, 2, 3, 4, 5);
+        verifyAndCloseCache();
+    }
+
+    /** Runs {@code checkAddsAndRemoves} over the values 1 to 5 with a reverse-order comparator, then verifies the cache. */
+    @Test
+    public void persistsRemovalOfEntriesInReverse() {
+        createCache();
+        checkAddsAndRemoves(Collections.<Integer>reverseOrder(), 1, 2, 3, 4, 5);
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Runs {@code checkAddsAndRemoves} over twelve shuffled values, then verifies the cache.
+     * Per the test name, twelve entries are expected to span more than one index block.
+     */
+    @Test
+    public void persistsRemovalOfEntriesOverMultipleIndexBlocks() {
+        createCache();
+        checkAddsAndRemoves(4, 12, 9, 1, 3, 10, 11, 7, 8, 2, 5, 6);
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Builds the layout described in the inline comment, removes {@code key_5} from the right-hand side,
+     * and verifies the cache afterwards.
+     */
+    @Test
+    public void removalRedistributesRemainingEntriesWithLeftSibling() {
+        createCache();
+        // Ends up with: 1 2 3 -> 4 <- 5 6
+        checkAdds(1, 2, 5, 6, 4, 3);
+        cache.verify();
+        cache.remove("key_5");
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Builds the layout described in the inline comment, removes {@code key_4} from the right-hand side,
+     * and verifies the cache afterwards.
+     */
+    @Test
+    public void removalMergesRemainingEntriesIntoLeftSibling() {
+        createCache();
+        // Ends up with: 1 2 -> 3 <- 4 5
+        checkAdds(1, 2, 4, 5, 3);
+        cache.verify();
+        cache.remove("key_4");
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Builds the layout described in the inline comment, removes {@code key_2} from the left-hand side,
+     * and verifies the cache afterwards.
+     */
+    @Test
+    public void removalRedistributesRemainingEntriesWithRightSibling() {
+        createCache();
+        // Ends up with: 1 2 -> 3 <- 4 5 6
+        checkAdds(1, 2, 4, 5, 3, 6);
+        cache.verify();
+        cache.remove("key_2");
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Builds the layout described in the inline comment, removes {@code key_2} from the left-hand side,
+     * and verifies the cache afterwards.
+     */
+    @Test
+    public void removalMergesRemainingEntriesIntoRightSibling() {
+        createCache();
+        // Ends up with: 1 2 -> 3 <- 4 5
+        checkAdds(1, 2, 4, 5, 3);
+        cache.verify();
+        cache.remove("key_2");
+        verifyAndCloseCache();
+    }
+
+    /**
+     * Stores one entry, cuts the last 10 bytes off the cache file behind the cache's back, and checks that
+     * after {@code reset()} the entry is gone and the cache still verifies.
+     *
+     * @throws IOException if the cache file cannot be opened or truncated
+     */
+    @Test
+    public void handlesOpeningATruncatedCacheFile() throws IOException {
+        BTreePersistentIndexedCache<String, Integer> cache = new BTreePersistentIndexedCache<String, Integer>(cacheFile, stringSerializer, integerSerializer);
+
+        assertNull(cache.get("key_1"));
+        cache.put("key_1", 99);
+
+        // try-with-resources so the handle is released even if truncation fails.
+        try (RandomAccessFile file = new RandomAccessFile(cacheFile, "rw")) {
+            file.setLength(file.length() - 10);
+        }
+
+        cache.reset();
+
+        assertNull(cache.get("key_1"));
+        cache.verify();
+
+        cache.close();
+    }
+
+    // Uses java.io.File keys (serialized with DefaultSerializer) that differ by directory
+    // and by letter case, and reads each value back through a fresh, equal File instance.
+    @Test
+    public void canUseFileAsKey() {
+        BTreePersistentIndexedCache<File, Integer> cache = new BTreePersistentIndexedCache<File, Integer>(cacheFile, new DefaultSerializer<File>(), integerSerializer);
+
+        cache.put(new File("file"), 1);
+        cache.put(new File("dir/file"), 2);
+        cache.put(new File("File"), 3);
+
+        assertThat(cache.get(new File("file")), equalTo(1));
+        assertThat(cache.get(new File("dir/file")), equalTo(2));
+        assertThat(cache.get(new File("File")), equalTo(3));
+
+        cache.close();
+    }
+
+    // Stores two different keys that collide on String.hashCode() and checks that both
+    // values can still be read back independently.
+    @Test
+    public void handlesKeysWithSameHashCode() {
+        createCache();
+
+        // For a two-character string hashCode() is 31 * c0 + c1, so both keys hash to 93
+        // (2 * 31 + 31 and 1 * 31 + 62). Assumes the platform charset decodes each of these
+        // bytes to the single character with the same code (true for ASCII-compatible charsets).
+        String key1 = new String(new byte[]{2, 31});
+        String key2 = new String(new byte[]{1, 62});
+        cache.put(key1, 1);
+        cache.put(key2, 2);
+
+        assertThat(cache.get(key1), equalTo(1));
+        assertThat(cache.get(key2), equalTo(2));
+
+        cache.close();
+    }
+
+    // Varargs convenience overload: delegates to checkAdds(Iterable) and drops its result.
+    private void checkAdds(Integer... values) {
+        checkAdds(Arrays.asList(values));
+    }
+
+    // Puts every value into the shared cache under the key "key_<value>" and asserts that
+    // each entry can be read back, both immediately and again after cache.reset().
+    // Returns the key -> value pairs that were written, in insertion order.
+    private Map<String, Integer> checkAdds(Iterable<Integer> values) {
+        Map<String, Integer> expected = new LinkedHashMap<String, Integer>();
+        for (Integer number : values) {
+            String key = String.format("key_%d", number);
+            cache.put(key, number);
+            expected.put(key, number);
+        }
+
+        // First pass: entries must be visible right after the writes.
+        for (String key : expected.keySet()) {
+            assertThat(cache.get(key), equalTo(expected.get(key)));
+        }
+
+        cache.reset();
+
+        // Second pass: entries must still be visible after the reset.
+        for (String key : expected.keySet()) {
+            assertThat(cache.get(key), equalTo(expected.get(key)));
+        }
+
+        return expected;
+    }
+
+    // Varargs convenience overload: delegates to checkUpdates(Iterable) and drops its result.
+    private void checkUpdates(Integer... values) {
+        checkUpdates(Arrays.asList(values));
+    }
+
+    // Overwrites the entry "key_<value>" ten times for every value (round r stores
+    // value + r * 100), asserting after each round that the cache returns the latest
+    // values, and once more after cache.reset(). Returns value -> last stored value.
+    private Map<Integer, Integer> checkUpdates(Iterable<Integer> values) {
+        Map<Integer, Integer> latest = new LinkedHashMap<Integer, Integer>();
+
+        for (int round = 0; round < 10; round++) {
+            int delta = round * 100;
+            for (Integer number : values) {
+                int stored = number + delta;
+                cache.put(String.format("key_%d", number), stored);
+                latest.put(number, stored);
+            }
+
+            for (Integer number : latest.keySet()) {
+                assertThat(cache.get(String.format("key_%d", number)), equalTo(latest.get(number)));
+            }
+        }
+
+        cache.reset();
+
+        for (Integer number : latest.keySet()) {
+            assertThat(cache.get(String.format("key_%d", number)), equalTo(latest.get(number)));
+        }
+
+        return latest;
+    }
+
+    // Convenience overload that passes a null comparator, so the values are removed in
+    // their natural (ascending) order.
+    private void checkAddsAndRemoves(Integer... values) {
+        checkAddsAndRemoves(null, values);
+    }
+
+    // Varargs convenience overload: wraps the values in a list and delegates to the
+    // Collection-based overload. The comparator decides the removal order.
+    private void checkAddsAndRemoves(Comparator<Integer> comparator, Integer... values) {
+        checkAddsAndRemoves(comparator, Arrays.asList(values));
+    }
+
+    // Adds all values via checkAdds, then removes them in the order given by the comparator
+    // (null means natural order). Each key must be present before its removal and absent
+    // right after it; after reset() and verify() every removed key must still be absent.
+    private void checkAddsAndRemoves(Comparator<Integer> comparator, Collection<Integer> values) {
+        checkAdds(values);
+
+        List<Integer> removalOrder = new ArrayList<Integer>(values);
+        Collections.sort(removalOrder, comparator);
+
+        List<String> removedKeys = new ArrayList<String>(removalOrder.size());
+        for (Integer number : removalOrder) {
+            String key = String.format("key_%d", number);
+            assertThat(cache.get(key), notNullValue());
+            cache.remove(key);
+            assertThat(cache.get(key), nullValue());
+            removedKeys.add(key);
+        }
+
+        cache.reset();
+        cache.verify();
+
+        for (String key : removedKeys) {
+            assertThat(cache.get(key), nullValue());
+        }
+    }
+
+}
--- a/test/s3/basic/basic_test.go
+++ b/test/s3/basic/basic_test.go
@ -61,7 +61,7 @@ func TestCreateBucket(t *testing.T) {

 }

-func TestListBuckets(t *testing.T) {
+func TestPutObject(t *testing.T) {

 	input := &s3.PutObjectInput{
 		ACL:    aws.String("authenticated-read"),
@ -89,7 +89,7 @@ func TestListBuckets(t *testing.T) {

 }

-func TestPutObject(t *testing.T) {
+func TestListBucket(t *testing.T) {

 	result, err := svc.ListBuckets(nil)
 	if err != nil {
@ -105,6 +105,23 @@ func TestPutObject(t *testing.T) {

 }

+// TestListObjectV2 lists the objects in Bucket under the prefix "foo" with "/" as the
+// delimiter, prints every returned key and then the whole response. A failing request
+// is reported through exitErrorf.
+func TestListObjectV2(t *testing.T) {
+
+	request := &s3.ListObjectsV2Input{
+		Bucket:    aws.String(Bucket),
+		Prefix:    aws.String("foo"),
+		Delimiter: aws.String("/"),
+	}
+
+	response, err := svc.ListObjectsV2(request)
+	if err != nil {
+		exitErrorf("Unable to list objects, %v", err)
+	}
+
+	for _, object := range response.Contents {
+		fmt.Println(aws.StringValue(object.Key))
+	}
+	fmt.Printf("list: %s\n", response)
+
+}
+
 func exitErrorf(msg string, args ...interface{}) {
 	fmt.Fprintf(os.Stderr, msg+"\n", args...)
 	os.Exit(1)
--- a/unmaintained/diff_volume_servers/diff_volume_servers.go
+++ b/unmaintained/diff_volume_servers/diff_volume_servers.go
@ -118,7 +118,7 @@ const (

 type needleState struct {
 	state uint8
-	size  uint32
+	size  types.Size
 }

 func getVolumeFiles(v uint32, addr string) (map[types.NeedleId]needleState, int64, error) {
@ -154,8 +154,8 @@ func getVolumeFiles(v uint32, addr string) (map[types.NeedleId]needleState, int6

 	var maxOffset int64
 	files := map[types.NeedleId]needleState{}
-	err = idx.WalkIndexFile(idxFile, func(key types.NeedleId, offset types.Offset, size uint32) error {
-		if offset.IsZero() || size == types.TombstoneFileSize {
+	err = idx.WalkIndexFile(idxFile, func(key types.NeedleId, offset types.Offset, size types.Size) error {
+		if offset.IsZero() || size.IsDeleted() {
 			files[key] = needleState{
 				state: stateDeleted,
 				size:  size,
--- a/unmaintained/fix_dat/fix_dat.go
+++ b/unmaintained/fix_dat/fix_dat.go
@ -98,7 +98,7 @@ func iterateEntries(datBackend backend.BackendStorageFile, idxFile *os.File, vis
 		// parse index file entry
 		key := util.BytesToUint64(bytes[0:8])
 		offsetFromIndex := util.BytesToUint32(bytes[8:12])
-		sizeFromIndex := util.BytesToUint32(bytes[12:16])
+		sizeFromIndex := types.BytesToSize(bytes[12:16])
 		count, _ = idxFile.ReadAt(bytes, readerOffset)
 		readerOffset += int64(count)

@ -123,7 +123,7 @@ func iterateEntries(datBackend backend.BackendStorageFile, idxFile *os.File, vis
 			}
 		}()

-		if n.Size <= n.DataSize {
+		if n.Size <= types.Size(n.DataSize) {
 			continue
 		}
 		visitNeedle(n, offset)
--- a/unmaintained/see_dat/see_dat_gzip.go
+++ b/unmaintained/see_dat/see_dat_gzip.go
@ -1,83 +0,0 @@
-package main
-
-import (
-	"bytes"
-	"compress/gzip"
-	"crypto/md5"
-	"flag"
-	"io"
-	"io/ioutil"
-	"net/http"
-	"time"
-	"github.com/chrislusf/seaweedfs/weed/glog"
-	"github.com/chrislusf/seaweedfs/weed/storage"
-	"github.com/chrislusf/seaweedfs/weed/storage/needle"
-	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
-	"github.com/chrislusf/seaweedfs/weed/util"
-)
-
-type VolumeFileScanner4SeeDat struct {
-	version needle.Version
-}
-
-func (scanner *VolumeFileScanner4SeeDat) VisitSuperBlock(superBlock super_block.SuperBlock) error {
-	scanner.version = superBlock.Version
-	return nil
-}
-
-func (scanner *VolumeFileScanner4SeeDat) ReadNeedleBody() bool {
-	return true
-}
-
-var (
-	files     = int64(0)
-	filebytes = int64(0)
-	diffbytes = int64(0)
-)
-
-func Compresssion(data []byte) float64 {
-	if len(data) <= 128 {
-		return 100.0
-	}
-	compressed, _ := util.GzipData(data[0:128])
-	return float64(len(compressed)*10) / 1280.0
-}
-
-func (scanner *VolumeFileScanner4SeeDat) VisitNeedle(n *needle.Needle, offset int64, needleHeader, needleBody []byte) error {
-	t := time.Unix(int64(n.AppendAtNs)/int64(time.Second), int64(n.AppendAtNs)%int64(time.Second))
-	glog.V(0).Info("----------------------------------------------------------------------------------")
-	glog.V(0).Infof("%d,%s%x offset %d size %d(%s) cookie %x appendedAt %v  hasmime[%t] mime[%s] (len: %d)",
-		*volumeId, n.Id, n.Cookie, offset, n.Size, util.BytesToHumanReadable(uint64(n.Size)), n.Cookie, t, n.HasMime(), string(n.Mime), len(n.Mime))
-	r, err := gzip.NewReader(bytes.NewReader(n.Data))
-	if err == nil {
-		buf := bytes.Buffer{}
-		h := md5.New()
-		c, _ := io.Copy(&buf, r)
-		d := buf.Bytes()
-		io.Copy(h, bytes.NewReader(d))
-		diff := (int64(n.DataSize) - int64(c))
-		diffbytes += diff
-		glog.V(0).Infof("was gzip! stored_size: %d orig_size: %d diff: %d(%d) mime:%s compression-of-128: %.2f md5: %x", n.DataSize, c, diff, diffbytes, http.DetectContentType(d), Compresssion(d), h.Sum(nil))
-	} else {
-		glog.V(0).Infof("no gzip!")
-	}
-	return nil
-}
-
-var (
-	_                = ioutil.ReadAll
-	volumePath       = flag.String("dir", "/tmp", "data directory to store files")
-	volumeCollection = flag.String("collection", "", "the volume collection name")
-	volumeId         = flag.Int("volumeId", -1, "a volume id. The volume should already exist in the dir. The volume index file should not exist.")
-)
-
-func main() {
-	flag.Parse()
-	vid := needle.VolumeId(*volumeId)
-	glog.V(0).Info("Starting")
-	scanner := &VolumeFileScanner4SeeDat{}
-	err := storage.ScanVolumeFile(*volumePath, *volumeCollection, vid, storage.NeedleMapInMemory, scanner)
-	if err != nil {
-		glog.Fatalf("Reading Volume File [ERROR] %s\n", err)
-	}
-}
--- a/unmaintained/see_idx/see_idx.go
+++ b/unmaintained/see_idx/see_idx.go
@ -36,7 +36,7 @@ func main() {
 	}
 	defer indexFile.Close()

-	idx.WalkIndexFile(indexFile, func(key types.NeedleId, offset types.Offset, size uint32) error {
+	idx.WalkIndexFile(indexFile, func(key types.NeedleId, offset types.Offset, size types.Size) error {
 		fmt.Printf("key:%v offset:%v size:%v(%v)\n", key, offset, size, util.BytesToHumanReadable(uint64(size)))
 		return nil
 	})
--- a/weed/Makefile
+++ b/weed/Makefile
@ -0,0 +1,19 @@
+# Name of the binary that `clean` removes.
+BINARY = weed
+
+SOURCE_DIR = .
+
+all: debug_mount
+
+# None of these targets creates a file named after itself, so every one of them is
+# declared phony; otherwise a stray file called e.g. "debug_server" would turn the
+# target into a no-op.
+.PHONY : all clean debug_mount debug_server
+
+clean:
+	go clean $(SOURCE_DIR)
+	rm -f $(BINARY)
+
+# Build with optimizations and inlining disabled (-N -l) and start `weed mount` under a
+# headless Delve server listening on :2345.
+debug_mount:
+	go build -gcflags="all=-N -l"
+	dlv --listen=:2345 --headless=true --api-version=2 --accept-multiclient exec weed -- mount -dir=~/tmp/mm
+
+# Same as debug_mount, but starts `weed server` with filer and s3 enabled.
+debug_server:
+	go build -gcflags="all=-N -l"
+	dlv --listen=:2345 --headless=true --api-version=2 --accept-multiclient exec weed -- server -dir=/Volumes/mobile_disk/99 -filer -volume.port=8343 -s3 -volume.max=0
--- a/weed/command/export.go
+++ b/weed/command/export.go
@ -72,9 +72,9 @@ var (

 func printNeedle(vid needle.VolumeId, n *needle.Needle, version needle.Version, deleted bool) {
 	key := needle.NewFileIdFromNeedle(vid, n).String()
-	size := n.DataSize
+	size := int32(n.DataSize)
 	if version == needle.Version1 {
-		size = n.Size
+		size = int32(n.Size)
 	}
 	fmt.Printf("%s\t%s\t%d\t%t\t%s\t%s\t%s\t%t\n",
 		key,
@ -111,7 +111,7 @@ func (scanner *VolumeFileScanner4Export) VisitNeedle(n *needle.Needle, offset in
 	nv, ok := needleMap.Get(n.Id)
 	glog.V(3).Infof("key %d offset %d size %d disk_size %d compressed %v ok %v nv %+v",
 		n.Id, offset, n.Size, n.DiskSize(scanner.version), n.IsCompressed(), ok, nv)
-	if ok && nv.Size > 0 && nv.Size != types.TombstoneFileSize && nv.Offset.ToAcutalOffset() == offset {
+	if ok && nv.Size.IsValid() && nv.Offset.ToAcutalOffset() == offset {
 		if newerThanUnix >= 0 && n.HasLastModifiedDate() && n.LastModified < uint64(newerThanUnix) {
 			glog.V(3).Infof("Skipping this file, as it's old enough: LastModified %d vs %d",
 				n.LastModified, newerThanUnix)
--- a/weed/command/fix.go
+++ b/weed/command/fix.go
@ -48,7 +48,7 @@ func (scanner *VolumeFileScanner4Fix) ReadNeedleBody() bool {

 func (scanner *VolumeFileScanner4Fix) VisitNeedle(n *needle.Needle, offset int64, needleHeader, needleBody []byte) error {
 	glog.V(2).Infof("key %d offset %d size %d disk_size %d compressed %v", n.Id, offset, n.Size, n.DiskSize(scanner.version), n.IsCompressed())
-	if n.Size > 0 && n.Size != types.TombstoneFileSize {
+	if n.Size.IsValid() {
 		pe := scanner.nm.Set(n.Id, types.ToOffset(offset), n.Size)
 		glog.V(2).Infof("saved %d with error %v", n.Size, pe)
 	} else {
--- a/weed/command/server.go
+++ b/weed/command/server.go
@ -96,7 +96,7 @@ func init() {
 	serverOptions.v.fixJpgOrientation = cmdServer.Flag.Bool("volume.images.fix.orientation", false, "Adjust jpg orientation when uploading.")
 	serverOptions.v.readRedirect = cmdServer.Flag.Bool("volume.read.redirect", true, "Redirect moved or non-local volumes.")
 	serverOptions.v.compactionMBPerSecond = cmdServer.Flag.Int("volume.compactionMBps", 0, "limit compaction speed in mega bytes per second")
-	serverOptions.v.fileSizeLimitMB = cmdServer.Flag.Int("volume.fileSizeLimitMB", 256, "limit file size to avoid out of memory")
+	serverOptions.v.fileSizeLimitMB = cmdServer.Flag.Int("volume.fileSizeLimitMB", 1024, "limit file size to avoid out of memory")
 	serverOptions.v.publicUrl = cmdServer.Flag.String("volume.publicUrl", "", "publicly accessible address")
 	serverOptions.v.pprof = &False

--- a/weed/command/volume.go
+++ b/weed/command/volume.go
@ -76,7 +76,7 @@ func init() {
 	v.cpuProfile = cmdVolume.Flag.String("cpuprofile", "", "cpu profile output file")
 	v.memProfile = cmdVolume.Flag.String("memprofile", "", "memory profile output file")
 	v.compactionMBPerSecond = cmdVolume.Flag.Int("compactionMBps", 0, "limit background compaction or copying speed in mega bytes per second")
-	v.fileSizeLimitMB = cmdVolume.Flag.Int("fileSizeLimitMB", 256, "limit file size to avoid out of memory")
+	v.fileSizeLimitMB = cmdVolume.Flag.Int("fileSizeLimitMB", 1024, "limit file size to avoid out of memory")
 	v.pprof = cmdVolume.Flag.Bool("pprof", false, "enable pprof http handlers. precludes --memprofile and --cpuprofile")
 }

--- a/weed/filer2/entry.go
+++ b/weed/filer2/entry.go
@ -22,6 +22,7 @@ type Attr struct {
 	GroupNames    []string
 	SymlinkTarget string
 	Md5           []byte
+	FileSize      uint64
 }

 func (attr Attr) IsDirectory() bool {
@ -39,7 +40,7 @@ type Entry struct {
 }

 func (entry *Entry) Size() uint64 {
-	return TotalSize(entry.Chunks)
+	return maxUint64(TotalSize(entry.Chunks), entry.FileSize)
 }

 func (entry *Entry) Timestamp() time.Time {
@ -81,3 +82,10 @@ func FromPbEntry(dir string, entry *filer_pb.Entry) *Entry {
 		Chunks:   entry.Chunks,
 	}
 }
+
+// maxUint64 returns the larger of x and y.
+func maxUint64(x, y uint64) uint64 {
+	if y >= x {
+		return y
+	}
+	return x
+}
--- a/weed/filer2/entry_codec.go
+++ b/weed/filer2/entry_codec.go
@ -53,6 +53,7 @@ func EntryAttributeToPb(entry *Entry) *filer_pb.FuseAttributes {
 		GroupName:     entry.Attr.GroupNames,
 		SymlinkTarget: entry.Attr.SymlinkTarget,
 		Md5:           entry.Attr.Md5,
+		FileSize:      entry.Attr.FileSize,
 	}
 }

@ -73,6 +74,7 @@ func PbToEntryAttribute(attr *filer_pb.FuseAttributes) Attr {
 	t.GroupNames = attr.GroupName
 	t.SymlinkTarget = attr.SymlinkTarget
 	t.Md5 = attr.Md5
+	t.FileSize = attr.FileSize

 	return t
 }
--- a/weed/filer2/filechunk_manifest.go
+++ b/weed/filer2/filechunk_manifest.go
@ -64,7 +64,7 @@ func fetchChunk(lookupFileIdFn LookupFileIdFunctionType, fileId string, cipherKe
 		return nil, err
 	}
 	var buffer bytes.Buffer
-	err = util.ReadUrlAsStream(urlString, cipherKey, isGzipped, true, 0, 0, func(data []byte) {
+	err = util.ReadUrlAsStream(urlString+"?readDeleted=true", cipherKey, isGzipped, true, 0, 0, func(data []byte) {
 		buffer.Write(data)
 	})
 	if err != nil {
--- a/weed/filer2/filechunks.go
+++ b/weed/filer2/filechunks.go
@ -20,6 +20,10 @@ func TotalSize(chunks []*filer_pb.FileChunk) (size uint64) {
 	return
 }

+// FileSize returns the logical size of entry: the larger of the size covered by its
+// chunks and the FileSize recorded in its attributes. A nil entry yields 0, and an entry
+// without attributes falls back to the chunk-derived size instead of panicking on the
+// nil Attributes pointer (ETag in this file guards against nil Attributes the same way).
+func FileSize(entry *filer_pb.Entry) (size uint64) {
+	if entry == nil {
+		return 0
+	}
+	size = TotalSize(entry.Chunks)
+	if entry.Attributes != nil {
+		size = maxUint64(size, entry.Attributes.FileSize)
+	}
+	return size
+}
+
 func ETag(entry *filer_pb.Entry) (etag string) {
 	if entry.Attributes == nil || entry.Attributes.Md5 == nil {
 		return ETagChunks(entry.Chunks)
@ -100,7 +104,7 @@ type ChunkView struct {
 	FileId      string
 	Offset      int64
 	Size        uint64
-	LogicOffset int64
+	LogicOffset int64 // actual offset in the file, for the data specified via [offset, offset+size) in current chunk
 	ChunkSize   uint64
 	CipherKey   []byte
 	IsGzipped   bool
@ -130,17 +134,18 @@ func ViewFromVisibleIntervals(visibles []VisibleInterval, offset int64, size int

 	for _, chunk := range visibles {

-		if chunk.start <= offset && offset < chunk.stop && offset < stop {
+		chunkStart, chunkStop := max(offset, chunk.start), min(stop, chunk.stop)
+
+		if chunkStart < chunkStop {
 			views = append(views, &ChunkView{
 				FileId:      chunk.fileId,
-				Offset:      offset - chunk.start, // offset is the data starting location in this file id
-				Size:        uint64(min(chunk.stop, stop) - offset),
-				LogicOffset: offset,
+				Offset:      chunkStart - chunk.start + chunk.chunkOffset,
+				Size:        uint64(chunkStop - chunkStart),
+				LogicOffset: chunkStart,
 				ChunkSize:   chunk.chunkSize,
 				CipherKey:   chunk.cipherKey,
 				IsGzipped:   chunk.isGzipped,
 			})
-			offset = min(chunk.stop, stop)
 		}
 	}

@ -149,10 +154,11 @@ func ViewFromVisibleIntervals(visibles []VisibleInterval, offset int64, size int
 }

 func logPrintf(name string, visibles []VisibleInterval) {
+
 	/*
-		log.Printf("%s len %d", name, len(visibles))
+		glog.V(0).Infof("%s len %d", name, len(visibles))
 		for _, v := range visibles {
-			log.Printf("%s:  => %+v", name, v)
+			glog.V(0).Infof("%s:  [%d,%d)", name, v.start, v.stop)
 		}
 	*/
 }
@ -165,7 +171,7 @@ var bufPool = sync.Pool{

 func MergeIntoVisibles(visibles, newVisibles []VisibleInterval, chunk *filer_pb.FileChunk) []VisibleInterval {

-	newV := newVisibleInterval(chunk.Offset, chunk.Offset+int64(chunk.Size), chunk.GetFileIdString(), chunk.Mtime, chunk.Size, chunk.CipherKey, chunk.IsCompressed)
+	newV := newVisibleInterval(chunk.Offset, chunk.Offset+int64(chunk.Size), chunk.GetFileIdString(), chunk.Mtime, 0, chunk.Size, chunk.CipherKey, chunk.IsCompressed)

 	length := len(visibles)
 	if length == 0 {
@ -177,13 +183,13 @@ func MergeIntoVisibles(visibles, newVisibles []VisibleInterval, chunk *filer_pb.
 	}

 	logPrintf("  before", visibles)
+	chunkStop := chunk.Offset + int64(chunk.Size)
 	for _, v := range visibles {
 		if v.start < chunk.Offset && chunk.Offset < v.stop {
-			newVisibles = append(newVisibles, newVisibleInterval(v.start, chunk.Offset, v.fileId, v.modifiedTime, chunk.Size, v.cipherKey, v.isGzipped))
+			newVisibles = append(newVisibles, newVisibleInterval(v.start, chunk.Offset, v.fileId, v.modifiedTime, v.chunkOffset, v.chunkSize, v.cipherKey, v.isGzipped))
 		}
-		chunkStop := chunk.Offset + int64(chunk.Size)
 		if v.start < chunkStop && chunkStop < v.stop {
-			newVisibles = append(newVisibles, newVisibleInterval(chunkStop, v.stop, v.fileId, v.modifiedTime, chunk.Size, v.cipherKey, v.isGzipped))
+			newVisibles = append(newVisibles, newVisibleInterval(chunkStop, v.stop, v.fileId, v.modifiedTime, v.chunkOffset+(chunkStop-v.start), v.chunkSize, v.cipherKey, v.isGzipped))
 		}
 		if chunkStop <= v.start || v.stop <= chunk.Offset {
 			newVisibles = append(newVisibles, v)
@ -219,6 +225,7 @@ func NonOverlappingVisibleIntervals(lookupFileIdFn LookupFileIdFunctionType, chu
 	var newVisibles []VisibleInterval
 	for _, chunk := range chunks {

+		// glog.V(0).Infof("merge [%d,%d)", chunk.Offset, chunk.Offset+int64(chunk.Size))
 		newVisibles = MergeIntoVisibles(visibles, newVisibles, chunk)
 		t := visibles[:0]
 		visibles = newVisibles
@ -239,17 +246,19 @@ type VisibleInterval struct {
 	stop         int64
 	modifiedTime int64
 	fileId       string
+	chunkOffset  int64
 	chunkSize    uint64
 	cipherKey    []byte
 	isGzipped    bool
 }

-func newVisibleInterval(start, stop int64, fileId string, modifiedTime int64, chunkSize uint64, cipherKey []byte, isGzipped bool) VisibleInterval {
+func newVisibleInterval(start, stop int64, fileId string, modifiedTime int64, chunkOffset int64, chunkSize uint64, cipherKey []byte, isGzipped bool) VisibleInterval {
 	return VisibleInterval{
 		start:        start,
 		stop:         stop,
 		fileId:       fileId,
 		modifiedTime: modifiedTime,
+		chunkOffset:  chunkOffset, // the starting position in the chunk
 		chunkSize:    chunkSize,
 		cipherKey:    cipherKey,
 		isGzipped:    isGzipped,
@ -262,3 +271,9 @@ func min(x, y int64) int64 {
 	}
 	return y
 }
+// max returns the larger of x and y.
+func max(x, y int64) int64 {
+	if x > y {
+		return x
+	}
+	return y
+}
--- a/weed/filer2/filechunks_test.go
+++ b/weed/filer2/filechunks_test.go
@ -1,10 +1,13 @@
 package filer2

 import (
+	"fmt"
 	"log"
+	"math"
 	"testing"

-	"fmt"
+	"github.com/stretchr/testify/assert"
+
 	"github.com/chrislusf/seaweedfs/weed/pb/filer_pb"
 )

@ -91,12 +94,12 @@ func TestIntervalMerging(t *testing.T) {
 		// case 2: updates overwrite part of previous chunks
 		{
 			Chunks: []*filer_pb.FileChunk{
-				{Offset: 0, Size: 100, FileId: "abc", Mtime: 123},
-				{Offset: 0, Size: 50, FileId: "asdf", Mtime: 134},
+				{Offset: 0, Size: 100, FileId: "a", Mtime: 123},
+				{Offset: 0, Size: 70, FileId: "b", Mtime: 134},
 			},
 			Expected: []*VisibleInterval{
-				{start: 0, stop: 50, fileId: "asdf"},
-				{start: 50, stop: 100, fileId: "abc"},
+				{start: 0, stop: 70, fileId: "b"},
+				{start: 70, stop: 100, fileId: "a", chunkOffset: 70},
 			},
 		},
 		// case 3: updates overwrite full chunks
@ -126,14 +129,14 @@ func TestIntervalMerging(t *testing.T) {
 		// case 5: updates overwrite full chunks
 		{
 			Chunks: []*filer_pb.FileChunk{
-				{Offset: 0, Size: 100, FileId: "abc", Mtime: 123},
-				{Offset: 0, Size: 200, FileId: "asdf", Mtime: 184},
-				{Offset: 70, Size: 150, FileId: "abc", Mtime: 143},
-				{Offset: 80, Size: 100, FileId: "xxxx", Mtime: 134},
+				{Offset: 0, Size: 100, FileId: "a", Mtime: 123},
+				{Offset: 0, Size: 200, FileId: "d", Mtime: 184},
+				{Offset: 70, Size: 150, FileId: "c", Mtime: 143},
+				{Offset: 80, Size: 100, FileId: "b", Mtime: 134},
 			},
 			Expected: []*VisibleInterval{
-				{start: 0, stop: 200, fileId: "asdf"},
-				{start: 200, stop: 220, fileId: "abc"},
+				{start: 0, stop: 200, fileId: "d"},
+				{start: 200, stop: 220, fileId: "c", chunkOffset: 130},
 			},
 		},
 		// case 6: same updates
@ -204,6 +207,10 @@ func TestIntervalMerging(t *testing.T) {
 				t.Fatalf("failed on test case %d, interval %d, chunkId %s, expect %s",
 					i, x, interval.fileId, testcase.Expected[x].fileId)
 			}
+			if interval.chunkOffset != testcase.Expected[x].chunkOffset {
+				t.Fatalf("failed on test case %d, interval %d, chunkOffset %d, expect %d",
+					i, x, interval.chunkOffset, testcase.Expected[x].chunkOffset)
+			}
 		}
 		if len(intervals) != len(testcase.Expected) {
 			t.Fatalf("failed to compact test case %d, len %d expected %d", i, len(intervals), len(testcase.Expected))
@ -251,14 +258,14 @@ func TestChunksReading(t *testing.T) {
 		// case 2: updates overwrite part of previous chunks
 		{
 			Chunks: []*filer_pb.FileChunk{
-				{Offset: 0, Size: 100, FileId: "abc", Mtime: 123},
-				{Offset: 0, Size: 50, FileId: "asdf", Mtime: 134},
+				{Offset: 3, Size: 100, FileId: "a", Mtime: 123},
+				{Offset: 10, Size: 50, FileId: "b", Mtime: 134},
 			},
-			Offset: 25,
-			Size:   50,
+			Offset: 30,
+			Size:   40,
 			Expected: []*ChunkView{
-				{Offset: 25, Size: 25, FileId: "asdf", LogicOffset: 25},
-				{Offset: 0, Size: 25, FileId: "abc", LogicOffset: 50},
+				{Offset: 20, Size: 30, FileId: "b", LogicOffset: 30},
+				{Offset: 57, Size: 10, FileId: "a", LogicOffset: 60},
 			},
 		},
 		// case 3: updates overwrite full chunks
@ -286,22 +293,22 @@ func TestChunksReading(t *testing.T) {
 			Size:   400,
 			Expected: []*ChunkView{
 				{Offset: 0, Size: 200, FileId: "asdf", LogicOffset: 0},
-				// {Offset: 0, Size: 150, FileId: "xxxx"}, // missing intervals should not happen
+				{Offset: 0, Size: 150, FileId: "xxxx", LogicOffset: 250},
 			},
 		},
 		// case 5: updates overwrite full chunks
 		{
 			Chunks: []*filer_pb.FileChunk{
-				{Offset: 0, Size: 100, FileId: "abc", Mtime: 123},
-				{Offset: 0, Size: 200, FileId: "asdf", Mtime: 184},
-				{Offset: 70, Size: 150, FileId: "abc", Mtime: 143},
+				{Offset: 0, Size: 100, FileId: "a", Mtime: 123},
+				{Offset: 0, Size: 200, FileId: "c", Mtime: 184},
+				{Offset: 70, Size: 150, FileId: "b", Mtime: 143},
 				{Offset: 80, Size: 100, FileId: "xxxx", Mtime: 134},
 			},
 			Offset: 0,
 			Size:   220,
 			Expected: []*ChunkView{
-				{Offset: 0, Size: 200, FileId: "asdf", LogicOffset: 0},
-				{Offset: 0, Size: 20, FileId: "abc", LogicOffset: 200},
+				{Offset: 0, Size: 200, FileId: "c", LogicOffset: 0},
+				{Offset: 130, Size: 20, FileId: "b", LogicOffset: 200},
 			},
 		},
 		// case 6: same updates
@ -370,18 +377,21 @@ func TestChunksReading(t *testing.T) {
 	}

 	for i, testcase := range testcases {
+		if i != 2 {
+			// continue
+		}
 		log.Printf("++++++++++ read test case %d ++++++++++++++++++++", i)
 		chunks := ViewFromChunks(nil, testcase.Chunks, testcase.Offset, testcase.Size)
 		for x, chunk := range chunks {
 			log.Printf("read case %d, chunk %d, offset=%d, size=%d, fileId=%s",
 				i, x, chunk.Offset, chunk.Size, chunk.FileId)
 			if chunk.Offset != testcase.Expected[x].Offset {
-				t.Fatalf("failed on read case %d, chunk %d, Offset %d, expect %d",
-					i, x, chunk.Offset, testcase.Expected[x].Offset)
+				t.Fatalf("failed on read case %d, chunk %s, Offset %d, expect %d",
+					i, chunk.FileId, chunk.Offset, testcase.Expected[x].Offset)
 			}
 			if chunk.Size != testcase.Expected[x].Size {
-				t.Fatalf("failed on read case %d, chunk %d, Size %d, expect %d",
-					i, x, chunk.Size, testcase.Expected[x].Size)
+				t.Fatalf("failed on read case %d, chunk %s, Size %d, expect %d",
+					i, chunk.FileId, chunk.Size, testcase.Expected[x].Size)
 			}
 			if chunk.FileId != testcase.Expected[x].FileId {
 				t.Fatalf("failed on read case %d, chunk %d, FileId %s, expect %s",
@ -418,3 +428,74 @@ func BenchmarkCompactFileChunks(b *testing.B) {
 		CompactFileChunks(nil, chunks)
 	}
 }
+
+// TestViewFromVisibleIntervals reads from offset 0 with a size of math.MaxInt32 across
+// three visible intervals separated by gaps and expects one view per interval.
+func TestViewFromVisibleIntervals(t *testing.T) {
+	visibles := []VisibleInterval{
+		{start: 0, stop: 25, fileId: "fid1"},
+		{start: 4096, stop: 8192, fileId: "fid2"},
+		{start: 16384, stop: 18551, fileId: "fid3"},
+	}
+
+	views := ViewFromVisibleIntervals(visibles, 0, math.MaxInt32)
+
+	assert.Equal(t, len(visibles), len(views), "ViewFromVisibleIntervals error")
+}
+
+// TestViewFromVisibleIntervals2 reads from offset 0, i.e. before the first interval
+// starts, across two adjacent intervals and expects one view per interval.
+func TestViewFromVisibleIntervals2(t *testing.T) {
+	visibles := []VisibleInterval{
+		{start: 344064, stop: 348160, fileId: "fid1"},
+		{start: 348160, stop: 356352, fileId: "fid2"},
+	}
+
+	views := ViewFromVisibleIntervals(visibles, 0, math.MaxInt32)
+
+	assert.Equal(t, len(visibles), len(views), "ViewFromVisibleIntervals error")
+}
+
+// TestViewFromVisibleIntervals3 reads 1500 bytes starting at offset 1700, a range that
+// begins inside the first interval and ends inside the second, and expects one view per
+// interval.
+func TestViewFromVisibleIntervals3(t *testing.T) {
+	visibles := []VisibleInterval{
+		{start: 1000, stop: 2000, fileId: "fid1"},
+		{start: 3000, stop: 4000, fileId: "fid2"},
+	}
+
+	views := ViewFromVisibleIntervals(visibles, 1700, 1500)
+
+	assert.Equal(t, len(visibles), len(views), "ViewFromVisibleIntervals error")
+}
--- a/weed/filer2/filer.go
+++ b/weed/filer2/filer.go
@ -9,8 +9,6 @@ import (

 	"google.golang.org/grpc"

-	"github.com/karlseguin/ccache"
-
 	"github.com/chrislusf/seaweedfs/weed/glog"
 	"github.com/chrislusf/seaweedfs/weed/pb/filer_pb"
 	"github.com/chrislusf/seaweedfs/weed/util"
@ -27,7 +25,6 @@ var (

 type Filer struct {
 	Store               *FilerStoreWrapper
-	directoryCache      *ccache.Cache
 	MasterClient        *wdclient.MasterClient
 	fileIdDeletionQueue *util.UnboundedQueue
 	GrpcDialOption      grpc.DialOption
@ -44,7 +41,6 @@ type Filer struct {
 func NewFiler(masters []string, grpcDialOption grpc.DialOption,
 	filerHost string, filerGrpcPort uint32, collection string, replication string, notifyFn func()) *Filer {
 	f := &Filer{
-		directoryCache:      ccache.New(ccache.Configure().MaxSize(1000).ItemsToPrune(100)),
 		MasterClient:        wdclient.NewMasterClient(grpcDialOption, "filer", filerHost, filerGrpcPort, masters),
 		fileIdDeletionQueue: util.NewUnboundedQueue(),
 		GrpcDialOption:      grpcDialOption,
@ -77,10 +73,6 @@ func (f *Filer) GetStore() (store FilerStore) {
 	return f.Store
 }

-func (f *Filer) DisableDirectoryCache() {
-	f.directoryCache = nil
-}
-
 func (fs *Filer) GetMaster() string {
 	return fs.MasterClient.GetMaster()
 }
@ -117,16 +109,9 @@ func (f *Filer) CreateEntry(ctx context.Context, entry *Entry, o_excl bool, isFr
 		dirPath := "/" + util.Join(dirParts[:i]...)
 		// fmt.Printf("%d directory: %+v\n", i, dirPath)

-		// first check local cache
-		dirEntry := f.cacheGetDirectory(dirPath)
-
-		// not found, check the store directly
-		if dirEntry == nil {
-			glog.V(4).Infof("find uncached directory: %s", dirPath)
-			dirEntry, _ = f.FindEntry(ctx, util.FullPath(dirPath))
-		} else {
-			// glog.V(4).Infof("found cached directory: %s", dirPath)
-		}
+		// check the store directly
+		glog.V(4).Infof("find uncached directory: %s", dirPath)
+		dirEntry, _ := f.FindEntry(ctx, util.FullPath(dirPath))

 		// no such existing directory
 		if dirEntry == nil {
@ -166,9 +151,6 @@ func (f *Filer) CreateEntry(ctx context.Context, entry *Entry, o_excl bool, isFr
 			return fmt.Errorf("%s is a file", dirPath)
 		}

-		// cache the directory entry
-		f.cacheSetDirectory(dirPath, dirEntry, i)
-
 		// remember the direct parent directory entry
 		if i == len(dirParts)-1 {
 			lastDirectoryEntry = dirEntry
@ -295,45 +277,6 @@ func (f *Filer) doListDirectoryEntries(ctx context.Context, p util.FullPath, sta
 	return
 }

-func (f *Filer) cacheDelDirectory(dirpath string) {
-
-	if dirpath == "/" {
-		return
-	}
-
-	if f.directoryCache == nil {
-		return
-	}
-	f.directoryCache.Delete(dirpath)
-	return
-}
-
-func (f *Filer) cacheGetDirectory(dirpath string) *Entry {
-
-	if f.directoryCache == nil {
-		return nil
-	}
-	item := f.directoryCache.Get(dirpath)
-	if item == nil {
-		return nil
-	}
-	return item.Value().(*Entry)
-}
-
-func (f *Filer) cacheSetDirectory(dirpath string, dirEntry *Entry, level int) {
-
-	if f.directoryCache == nil {
-		return
-	}
-
-	minutes := 60
-	if level < 10 {
-		minutes -= level * 6
-	}
-
-	f.directoryCache.Set(dirpath, dirEntry, time.Duration(minutes)*time.Minute)
-}
-
 func (f *Filer) Shutdown() {
 	f.LocalMetaLogBuffer.Shutdown()
 	f.Store.Shutdown()
--- a/weed/filer2/filer_delete_entry.go
+++ b/weed/filer2/filer_delete_entry.go
@ -65,6 +65,7 @@ func (f *Filer) doBatchDeleteFolderMetaAndData(ctx context.Context, entry *Entry
 		}
 		if lastFileName == "" && !isRecursive && len(entries) > 0 {
 			// only for first iteration in the loop
+			glog.Errorf("deleting a folder %s has children: %+v ...", entry.FullPath, entries[0].Name())
 			return nil, fmt.Errorf("fail to delete non-empty folder: %s", entry.FullPath)
 		}

@ -73,7 +74,6 @@ func (f *Filer) doBatchDeleteFolderMetaAndData(ctx context.Context, entry *Entry
 			var dirChunks []*filer_pb.FileChunk
 			if sub.IsDirectory() {
 				dirChunks, err = f.doBatchDeleteFolderMetaAndData(ctx, sub, isRecursive, ignoreRecursiveError, shouldDeleteChunks, false)
-				f.cacheDelDirectory(string(sub.FullPath))
 				chunks = append(chunks, dirChunks...)
 			} else {
 				f.NotifyUpdateEvent(ctx, sub, nil, shouldDeleteChunks, isFromOtherCluster)
@ -107,9 +107,7 @@ func (f *Filer) doDeleteEntryMetaAndData(ctx context.Context, entry *Entry, shou
 	if storeDeletionErr := f.Store.DeleteEntry(ctx, entry.FullPath); storeDeletionErr != nil {
 		return fmt.Errorf("filer store delete: %v", storeDeletionErr)
 	}
-	if entry.IsDirectory() {
-		f.cacheDelDirectory(string(entry.FullPath))
-	} else {
+	if !entry.IsDirectory() {
 		f.NotifyUpdateEvent(ctx, entry, nil, shouldDeleteChunks, isFromOtherCluster)
 	}

--- a/weed/filer2/filer_deletion.go
+++ b/weed/filer2/filer_deletion.go
@ -1,6 +1,7 @@
 package filer2

 import (
+	"strings"
 	"time"

 	"github.com/chrislusf/seaweedfs/weed/glog"
@ -50,15 +51,14 @@ func (f *Filer) loopProcessingDeletion() {
 					fileIds = fileIds[:0]
 				}
 				deletionCount = len(toDeleteFileIds)
-				deleteResults, err := operation.DeleteFilesWithLookupVolumeId(f.GrpcDialOption, toDeleteFileIds, lookupFunc)
+				_, err := operation.DeleteFilesWithLookupVolumeId(f.GrpcDialOption, toDeleteFileIds, lookupFunc)
 				if err != nil {
-					glog.V(0).Infof("deleting fileIds len=%d error: %v", deletionCount, err)
+					if !strings.Contains(err.Error(), "already deleted") {
+						glog.V(0).Infof("deleting fileIds len=%d error: %v", deletionCount, err)
+					}
 				} else {
 					glog.V(1).Infof("deleting fileIds len=%d", deletionCount)
 				}
-				if len(deleteResults) != deletionCount {
-					glog.V(0).Infof("delete %d fileIds actual %d", deletionCount, len(deleteResults))
-				}
 			}
 		})

--- a/weed/filer2/leveldb/leveldb_store_test.go
+++ b/weed/filer2/leveldb/leveldb_store_test.go
@ -17,7 +17,6 @@ func TestCreateAndFind(t *testing.T) {
 	store := &LevelDBStore{}
 	store.initialize(dir)
 	filer.SetStore(store)
-	filer.DisableDirectoryCache()

 	fullpath := util.FullPath("/home/chris/this/is/one/file1.jpg")

@ -72,7 +71,6 @@ func TestEmptyRoot(t *testing.T) {
 	store := &LevelDBStore{}
 	store.initialize(dir)
 	filer.SetStore(store)
-	filer.DisableDirectoryCache()

 	ctx := context.Background()

--- a/weed/filer2/leveldb2/leveldb2_store_test.go
+++ b/weed/filer2/leveldb2/leveldb2_store_test.go
@ -17,7 +17,6 @@ func TestCreateAndFind(t *testing.T) {
 	store := &LevelDB2Store{}
 	store.initialize(dir, 2)
 	filer.SetStore(store)
-	filer.DisableDirectoryCache()

 	fullpath := util.FullPath("/home/chris/this/is/one/file1.jpg")

@ -72,7 +71,6 @@ func TestEmptyRoot(t *testing.T) {
 	store := &LevelDB2Store{}
 	store.initialize(dir, 2)
 	filer.SetStore(store)
-	filer.DisableDirectoryCache()

 	ctx := context.Background()

--- a/weed/filer2/reader_at.go
+++ b/weed/filer2/reader_at.go
@ -15,12 +15,11 @@ import (
+// ChunkReadAt reads file content at arbitrary offsets from a list of chunk
+// views. ReadAt holds readerLock for the duration of each call.
 type ChunkReadAt struct {
 	masterClient *wdclient.MasterClient
 	chunkViews   []*ChunkView
-	buffer       []byte
-	bufferOffset int64
 	lookupFileId func(fileId string) (targetUrl string, err error)
 	readerLock   sync.Mutex
+	fileSize     int64 // logical file size; a read reaching past it reports io.EOF

-	chunkCache *chunk_cache.ChunkCache
+	chunkCache chunk_cache.ChunkCache // checked before fetching a chunk, filled after a fetch
 }

 // var _ = io.ReaderAt(&ChunkReadAt{})
@ -54,13 +53,13 @@ func LookupFn(filerClient filer_pb.FilerClient) LookupFileIdFunctionType {
 	}
 }

-func NewChunkReaderAtFromClient(filerClient filer_pb.FilerClient, chunkViews []*ChunkView, chunkCache *chunk_cache.ChunkCache) *ChunkReadAt {
+// NewChunkReaderAtFromClient builds a ChunkReadAt over chunkViews. File ids are
+// resolved through LookupFn(filerClient), chunk data goes through chunkCache,
+// and fileSize is recorded as the total length of the file.
+func NewChunkReaderAtFromClient(filerClient filer_pb.FilerClient, chunkViews []*ChunkView, chunkCache chunk_cache.ChunkCache, fileSize int64) *ChunkReadAt {

 	return &ChunkReadAt{
 		chunkViews:   chunkViews,
 		lookupFileId: LookupFn(filerClient),
-		bufferOffset: -1,
 		chunkCache:   chunkCache,
+		fileSize:     fileSize,
 	}
 }

@ -69,75 +68,78 @@ func (c *ChunkReadAt) ReadAt(p []byte, offset int64) (n int, err error) {
 	c.readerLock.Lock()
 	defer c.readerLock.Unlock()

-	for n < len(p) && err == nil {
-		readCount, readErr := c.doReadAt(p[n:], offset+int64(n))
-		n += readCount
-		err = readErr
-		if readCount == 0 {
-			return n, io.EOF
-		}
-	}
-	return
+	glog.V(4).Infof("ReadAt [%d,%d) of total file size %d bytes %d chunk views", offset, offset+int64(len(p)), c.fileSize, len(c.chunkViews))
+	return c.doReadAt(p[n:], offset+int64(n))
 }

 func (c *ChunkReadAt) doReadAt(p []byte, offset int64) (n int, err error) {

-	var found bool
-	for _, chunk := range c.chunkViews {
-		if chunk.LogicOffset <= offset && offset < chunk.LogicOffset+int64(chunk.Size) {
-			found = true
-			if c.bufferOffset != chunk.LogicOffset {
-				c.buffer, err = c.fetchChunkData(chunk)
-				if err != nil {
-					glog.Errorf("fetching chunk %+v: %v\n", chunk, err)
-				}
-				c.bufferOffset = chunk.LogicOffset
-			}
+	var buffer []byte
+	startOffset, remaining := offset, int64(len(p))
+	for i, chunk := range c.chunkViews {
+		if remaining <= 0 {
 			break
 		}
-	}
-	if !found {
-		return 0, io.EOF
+		if startOffset < chunk.LogicOffset {
+			gap := int(chunk.LogicOffset - startOffset)
+			glog.V(4).Infof("zero [%d,%d)", startOffset, startOffset+int64(gap))
+			n += int(min(int64(gap), remaining))
+			startOffset, remaining = chunk.LogicOffset, remaining-int64(gap)
+			if remaining <= 0 {
+				break
+			}
+		}
+		// fmt.Printf(">>> doReadAt [%d,%d), chunk[%d,%d)\n", offset, offset+int64(len(p)), chunk.LogicOffset, chunk.LogicOffset+int64(chunk.Size))
+		chunkStart, chunkStop := max(chunk.LogicOffset, startOffset), min(chunk.LogicOffset+int64(chunk.Size), startOffset+remaining)
+		if chunkStart >= chunkStop {
+			continue
+		}
+		glog.V(4).Infof("read [%d,%d), %d/%d chunk %s [%d,%d)", chunkStart, chunkStop, i, len(c.chunkViews), chunk.FileId, chunk.LogicOffset-chunk.Offset, chunk.LogicOffset-chunk.Offset+int64(chunk.Size))
+		buffer, err = c.readFromWholeChunkData(chunk)
+		if err != nil {
+			glog.Errorf("fetching chunk %+v: %v\n", chunk, err)
+			return
+		}
+		bufferOffset := chunkStart - chunk.LogicOffset + chunk.Offset
+		copied := copy(p[startOffset-offset:chunkStop-chunkStart+startOffset-offset], buffer[bufferOffset:bufferOffset+chunkStop-chunkStart])
+		n += copied
+		startOffset, remaining = startOffset+int64(copied), remaining-int64(copied)
 	}

-	if err == nil {
-		n = copy(p, c.buffer[offset-c.bufferOffset:])
+	glog.V(4).Infof("doReadAt [%d,%d), n:%v, err:%v", offset, offset+int64(len(p)), n, err)
+
+	if err == nil && remaining > 0 && c.fileSize > startOffset {
+		delta := int(min(remaining, c.fileSize - startOffset))
+		glog.V(4).Infof("zero2 [%d,%d) of file size %d bytes", startOffset, startOffset+int64(delta), c.fileSize)
+		n += delta
 	}

-	// fmt.Printf("> doReadAt [%d,%d), buffer:[%d,%d)\n", offset, offset+int64(n), c.bufferOffset, c.bufferOffset+int64(len(c.buffer)))
+	if err == nil && offset+int64(len(p)) > c.fileSize {
+		err = io.EOF
+	}
+	// fmt.Printf("~~~ filled %d, err: %v\n\n", n, err)

 	return

 }

-func (c *ChunkReadAt) fetchChunkData(chunkView *ChunkView) (data []byte, err error) {
+// readFromWholeChunkData returns the data of the chunk named by
+// chunkView.FileId, not just the part the view covers. It uses what
+// c.chunkCache.GetChunk returns when that is non-nil; otherwise it fetches the
+// chunk with doFetchFullChunkData and stores the result through SetChunk.
+// A fetch error is returned as is and nothing is cached.
+func (c *ChunkReadAt) readFromWholeChunkData(chunkView *ChunkView) (chunkData []byte, err error) {

-	glog.V(4).Infof("fetchChunkData %s [%d,%d)\n", chunkView.FileId, chunkView.LogicOffset, chunkView.LogicOffset+int64(chunkView.Size))
+	glog.V(4).Infof("readFromWholeChunkData %s offset %d [%d,%d) size at least %d", chunkView.FileId, chunkView.Offset, chunkView.LogicOffset, chunkView.LogicOffset+int64(chunkView.Size), chunkView.ChunkSize)

-	hasDataInCache := false
-	chunkData := c.chunkCache.GetChunk(chunkView.FileId, chunkView.ChunkSize)
+	chunkData = c.chunkCache.GetChunk(chunkView.FileId, chunkView.ChunkSize)
 	if chunkData != nil {
-		glog.V(3).Infof("cache hit %s [%d,%d)", chunkView.FileId, chunkView.LogicOffset, chunkView.LogicOffset+int64(chunkView.Size))
-		hasDataInCache = true
+		glog.V(5).Infof("cache hit %s [%d,%d)", chunkView.FileId, chunkView.LogicOffset-chunkView.Offset, chunkView.LogicOffset-chunkView.Offset+int64(len(chunkData)))
 	} else {
+		glog.V(4).Infof("doFetchFullChunkData %s", chunkView.FileId)
 		chunkData, err = c.doFetchFullChunkData(chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped)
 		if err != nil {
-			return nil, err
+			return
 		}
-	}
-
-	if int64(len(chunkData)) < chunkView.Offset+int64(chunkView.Size) {
-		glog.Errorf("unexpected larger cached:%v chunk %s [%d,%d) than %d", hasDataInCache, chunkView.FileId, chunkView.Offset, chunkView.Offset+int64(chunkView.Size), len(chunkData))
-		return nil, fmt.Errorf("unexpected larger cached:%v chunk %s [%d,%d) than %d", hasDataInCache, chunkView.FileId, chunkView.Offset, chunkView.Offset+int64(chunkView.Size), len(chunkData))
-	}
-
-	data = chunkData[chunkView.Offset : chunkView.Offset+int64(chunkView.Size)]
-
-	if !hasDataInCache {
+		// only freshly fetched data is written back to the cache
 		c.chunkCache.SetChunk(chunkView.FileId, chunkData)
 	}

-	return data, nil
+	return
 }

 func (c *ChunkReadAt) doFetchFullChunkData(fileId string, cipherKey []byte, isGzipped bool) ([]byte, error) {
--- a/weed/filer2/reader_at_test.go
+++ b/weed/filer2/reader_at_test.go
@ -0,0 +1,156 @@
+package filer2
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"strconv"
+	"sync"
+	"testing"
+)
+
+// mockChunkCache is a stand-in chunk cache for tests: every lookup hits.
+type mockChunkCache struct{}
+
+// GetChunk returns minSize bytes, each set to the low byte of fileId parsed as
+// a decimal integer (0 when fileId does not parse).
+func (m *mockChunkCache) GetChunk(fileId string, minSize uint64) (data []byte) {
+	id, _ := strconv.Atoi(fileId)
+	fill := byte(id)
+	data = make([]byte, minSize)
+	for idx := range data {
+		data[idx] = fill
+	}
+	return data
+}
+
+// SetChunk discards the data; the mock stores nothing.
+func (m *mockChunkCache) SetChunk(fileId string, data []byte) {}
+
+// TestReaderAt reads a 10-byte file made of five small chunks, some of them
+// separated by one-byte holes, and checks the byte counts and errors returned.
+func TestReaderAt(t *testing.T) {
+
+	visibles := []VisibleInterval{
+		{start: 1, stop: 2, fileId: "1", chunkSize: 9},
+		{start: 3, stop: 4, fileId: "3", chunkSize: 1},
+		{start: 5, stop: 6, fileId: "5", chunkSize: 2},
+		{start: 7, stop: 9, fileId: "7", chunkSize: 2},
+		{start: 9, stop: 10, fileId: "9", chunkSize: 2},
+	}
+
+	readerAt := &ChunkReadAt{
+		chunkViews:   ViewFromVisibleIntervals(visibles, 0, math.MaxInt64),
+		lookupFileId: nil,
+		readerLock:   sync.Mutex{},
+		fileSize:     10,
+		chunkCache:   &mockChunkCache{},
+	}
+
+	// offset, buffer size, expected bytes read, expected error
+	reads := []struct {
+		offset      int64
+		size        int
+		expected    int
+		expectedErr error
+	}{
+		{0, 10, 10, nil},
+		{0, 12, 10, io.EOF},
+		{2, 8, 8, nil},
+		{3, 6, 6, nil},
+	}
+	for _, r := range reads {
+		testReadAt(t, readerAt, r.offset, r.size, r.expected, r.expectedErr)
+	}
+
+}
+
+// testReadAt reads size bytes at offset from readerAt, prints the buffer in hex
+// to stdout, and reports a test error when the byte count differs from
+// expected or the returned error differs from expectedErr.
+func testReadAt(t *testing.T, readerAt *ChunkReadAt, offset int64, size int, expected int, expectedErr error) {
+	// attribute failures to the calling test line, not to this shared helper
+	t.Helper()
+
+	data := make([]byte, size)
+	n, err := readerAt.ReadAt(data, offset)
+
+	for _, d := range data {
+		fmt.Printf("%x", d)
+	}
+	fmt.Println()
+
+	if expected != n {
+		t.Errorf("unexpected read size: %d, expect: %d", n, expected)
+	}
+	if err != expectedErr {
+		t.Errorf("unexpected read error: %v, expect: %v", err, expectedErr)
+	}
+
+}
+
+// TestReaderAt0 reads a 10-byte file with two chunks, [2,5) and [7,9), covering
+// reads that start in a hole, run past the end, or start at or beyond the end.
+func TestReaderAt0(t *testing.T) {
+
+	visibles := []VisibleInterval{
+		{start: 2, stop: 5, fileId: "1", chunkSize: 9},
+		{start: 7, stop: 9, fileId: "2", chunkSize: 9},
+	}
+
+	readerAt := &ChunkReadAt{
+		chunkViews:   ViewFromVisibleIntervals(visibles, 0, math.MaxInt64),
+		lookupFileId: nil,
+		readerLock:   sync.Mutex{},
+		fileSize:     10,
+		chunkCache:   &mockChunkCache{},
+	}
+
+	// offset, buffer size, expected bytes read, expected error
+	reads := []struct {
+		offset      int64
+		size        int
+		expected    int
+		expectedErr error
+	}{
+		{0, 10, 10, nil},
+		{3, 16, 7, io.EOF},
+		{3, 5, 5, nil},
+		{11, 5, 0, io.EOF},
+		{10, 5, 0, io.EOF},
+	}
+	for _, r := range reads {
+		testReadAt(t, readerAt, r.offset, r.size, r.expected, r.expectedErr)
+	}
+
+}
+
+// TestReaderAt1 reads a 20-byte file whose only chunk covers [2,5), so most of
+// each read falls outside any chunk.
+func TestReaderAt1(t *testing.T) {
+
+	visibles := []VisibleInterval{
+		{start: 2, stop: 5, fileId: "1", chunkSize: 9},
+	}
+
+	readerAt := &ChunkReadAt{
+		chunkViews:   ViewFromVisibleIntervals(visibles, 0, math.MaxInt64),
+		lookupFileId: nil,
+		readerLock:   sync.Mutex{},
+		fileSize:     20,
+		chunkCache:   &mockChunkCache{},
+	}
+
+	// offset, buffer size, expected bytes read, expected error
+	reads := []struct {
+		offset      int64
+		size        int
+		expected    int
+		expectedErr error
+	}{
+		{0, 20, 20, nil},
+		{1, 7, 7, nil},
+		{0, 1, 1, nil},
+		{18, 4, 2, io.EOF},
+		{12, 4, 4, nil},
+		{4, 20, 16, io.EOF},
+		{4, 10, 10, nil},
+		{1, 10, 10, nil},
+	}
+	for _, r := range reads {
+		testReadAt(t, readerAt, r.offset, r.size, r.expected, r.expectedErr)
+	}
+
+}
--- a/weed/filer2/stream.go
+++ b/weed/filer2/stream.go
@ -32,7 +32,7 @@ func StreamContent(masterClient *wdclient.MasterClient, w io.Writer, chunks []*f
 	for _, chunkView := range chunkViews {

 		urlString := fileId2Url[chunkView.FileId]
-		err := util.ReadUrlAsStream(urlString, chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.Offset, int(chunkView.Size), func(data []byte) {
+		err := util.ReadUrlAsStream(urlString+"?readDeleted=true", chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.Offset, int(chunkView.Size), func(data []byte) {
 			w.Write(data)
 		})
 		if err != nil {
@ -63,7 +63,7 @@ func ReadAll(masterClient *wdclient.MasterClient, chunks []*filer_pb.FileChunk)
 			glog.V(1).Infof("operation LookupFileId %s failed, err: %v", chunkView.FileId, err)
 			return nil, err
 		}
-		err = util.ReadUrlAsStream(urlString, chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.Offset, int(chunkView.Size), func(data []byte) {
+		err = util.ReadUrlAsStream(urlString+"?readDeleted=true", chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.Offset, int(chunkView.Size), func(data []byte) {
 			buffer.Write(data)
 		})
 		if err != nil {
@ -175,7 +175,7 @@ func (c *ChunkStreamReader) fetchChunkToBuffer(chunkView *ChunkView) error {
 		return err
 	}
 	var buffer bytes.Buffer
-	err = util.ReadUrlAsStream(urlString, chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.Offset, int(chunkView.Size), func(data []byte) {
+	err = util.ReadUrlAsStream(urlString+"?readDeleted=true", chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.Offset, int(chunkView.Size), func(data []byte) {
 		buffer.Write(data)
 	})
 	if err != nil {
--- a/weed/filesys/dir.go
+++ b/weed/filesys/dir.go
@ -63,7 +63,7 @@ func (dir *Dir) Attr(ctx context.Context, attr *fuse.Attr) error {
 	attr.Gid = dir.entry.Attributes.Gid
 	attr.Uid = dir.entry.Attributes.Uid

-	glog.V(4).Infof("dir Attr %s, attr: %+v", dir.FullPath(), attr)
+	glog.V(5).Infof("dir Attr %s, attr: %+v", dir.FullPath(), attr)

 	return nil
 }
@ -101,7 +101,7 @@ func (dir *Dir) Fsync(ctx context.Context, req *fuse.FsyncRequest) error {
 }

 func (dir *Dir) newFile(name string, entry *filer_pb.Entry) fs.Node {
-	return dir.wfs.fsNodeCache.EnsureFsNode(util.NewFullPath(dir.FullPath(), name), func() fs.Node {
+	f := dir.wfs.fsNodeCache.EnsureFsNode(util.NewFullPath(dir.FullPath(), name), func() fs.Node {
 		return &File{
 			Name:           name,
 			dir:            dir,
@ -110,14 +110,17 @@ func (dir *Dir) newFile(name string, entry *filer_pb.Entry) fs.Node {
 			entryViewCache: nil,
 		}
 	})
+	f.(*File).dir = dir // in case dir node was created later
+	return f
 }

+// newDirectory returns the fs node that fsNodeCache holds for fullpath, passing
+// EnsureFsNode a constructor that builds a Dir from entry with dir as parent.
+// The returned node's parent is then set to dir on every call.
+// NOTE(review): d.(*Dir) is an unchecked type assertion; it panics if the node
+// returned for fullpath is not a *Dir — confirm that cannot happen.
 func (dir *Dir) newDirectory(fullpath util.FullPath, entry *filer_pb.Entry) fs.Node {

-	return dir.wfs.fsNodeCache.EnsureFsNode(fullpath, func() fs.Node {
+	d := dir.wfs.fsNodeCache.EnsureFsNode(fullpath, func() fs.Node {
 		return &Dir{name: entry.Name, wfs: dir.wfs, entry: entry, parent: dir}
 	})
-
+	d.(*Dir).parent = dir // in case dir node was created later
+	return d
 }

 func (dir *Dir) Create(ctx context.Context, req *fuse.CreateRequest,
@ -218,7 +221,7 @@ func (dir *Dir) Mkdir(ctx context.Context, req *fuse.MkdirRequest) (fs.Node, err

 func (dir *Dir) Lookup(ctx context.Context, req *fuse.LookupRequest, resp *fuse.LookupResponse) (node fs.Node, err error) {

-	glog.V(4).Infof("dir Lookup %s: %s by %s", dir.FullPath(), req.Name, req.Header.String())
+	glog.V(5).Infof("dir Lookup %s: %s by %s", dir.FullPath(), req.Name, req.Header.String())

 	fullFilePath := util.NewFullPath(dir.FullPath(), req.Name)
 	dirPath := util.FullPath(dir.FullPath())
@ -237,7 +240,7 @@ func (dir *Dir) Lookup(ctx context.Context, req *fuse.LookupRequest, resp *fuse.
 			return nil, fuse.ENOENT
 		}
 	} else {
-		glog.V(4).Infof("dir Lookup cache hit %s", fullFilePath)
+		glog.V(5).Infof("dir Lookup cache hit %s", fullFilePath)
 	}

 	if entry != nil {
@ -265,7 +268,7 @@ func (dir *Dir) Lookup(ctx context.Context, req *fuse.LookupRequest, resp *fuse.

 func (dir *Dir) ReadDirAll(ctx context.Context) (ret []fuse.Dirent, err error) {

-	glog.V(3).Infof("dir ReadDirAll %s", dir.FullPath())
+	glog.V(5).Infof("dir ReadDirAll %s", dir.FullPath())

 	processEachEntryFn := func(entry *filer_pb.Entry, isLast bool) error {
 		fullpath := util.NewFullPath(dir.FullPath(), entry.Name)
@ -314,12 +317,8 @@ func (dir *Dir) removeOneFile(req *fuse.RemoveRequest) error {
 		return nil
 	}

-	dir.wfs.deleteFileChunks(entry.Chunks)
-
-	dir.wfs.fsNodeCache.DeleteFsNode(filePath)
-
-	dir.wfs.metaCache.DeleteEntry(context.Background(), filePath)

+	// first, ensure the filer store can correctly delete
 	glog.V(3).Infof("remove file: %v", req)
 	err = filer_pb.Remove(dir.wfs, dir.FullPath(), req.Name, false, false, false, false)
 	if err != nil {
@ -327,34 +326,40 @@ func (dir *Dir) removeOneFile(req *fuse.RemoveRequest) error {
 		return fuse.ENOENT
 	}

+	// then, delete meta cache and fsNode cache
+	dir.wfs.metaCache.DeleteEntry(context.Background(), filePath)
+	dir.wfs.fsNodeCache.DeleteFsNode(filePath)
+
+	// delete the chunks last
+	dir.wfs.deleteFileChunks(entry.Chunks)
+
 	return nil

 }

+// removeFolder removes the child directory req.Name. It calls filer_pb.Remove
+// first and touches the local caches only after that call succeeds. A removal
+// error whose text contains "non-empty" is reported as fuse.EEXIST; any other
+// removal error is reported as fuse.ENOENT.
 func (dir *Dir) removeFolder(req *fuse.RemoveRequest) error {

-	t := util.NewFullPath(dir.FullPath(), req.Name)
-	dir.wfs.fsNodeCache.DeleteFsNode(t)
-
-	dir.wfs.metaCache.DeleteEntry(context.Background(), t)
-
 	glog.V(3).Infof("remove directory entry: %v", req)
 	err := filer_pb.Remove(dir.wfs, dir.FullPath(), req.Name, true, false, false, false)
 	if err != nil {
-		glog.V(3).Infof("remove %s/%s: %v", dir.FullPath(), req.Name, err)
+		glog.V(0).Infof("remove %s/%s: %v", dir.FullPath(), req.Name, err)
 		if strings.Contains(err.Error(), "non-empty"){
 			return fuse.EEXIST
 		}
 		return fuse.ENOENT
 	}

+	// the remote removal succeeded: now drop the cached entry and node
+	t := util.NewFullPath(dir.FullPath(), req.Name)
+	dir.wfs.metaCache.DeleteEntry(context.Background(), t)
+	dir.wfs.fsNodeCache.DeleteFsNode(t)
+
 	return nil

 }

 func (dir *Dir) Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *fuse.SetattrResponse) error {

-	glog.V(3).Infof("%v dir setattr %+v", dir.FullPath(), req)
+	glog.V(4).Infof("%v dir setattr %+v", dir.FullPath(), req)

 	if err := dir.maybeLoadEntry(); err != nil {
 		return err
@ -429,7 +434,7 @@ func (dir *Dir) Listxattr(ctx context.Context, req *fuse.ListxattrRequest, resp
 }

+// Forget removes this directory's node from the mount's fsNodeCache.
 func (dir *Dir) Forget() {
-	glog.V(3).Infof("Forget dir %s", dir.FullPath())
+	glog.V(5).Infof("Forget dir %s", dir.FullPath())

 	dir.wfs.fsNodeCache.DeleteFsNode(util.FullPath(dir.FullPath()))
 }
@ -460,7 +465,7 @@ func (dir *Dir) saveEntry() error {
 		glog.V(1).Infof("save dir entry: %v", request)
 		_, err := client.UpdateEntry(context.Background(), request)
 		if err != nil {
-			glog.V(0).Infof("UpdateEntry dir %s/%s: %v", parentDir, name, err)
+			glog.Errorf("UpdateEntry dir %s/%s: %v", parentDir, name, err)
 			return fuse.EIO
 		}

--- a/weed/filesys/dir_link.go
+++ b/weed/filesys/dir_link.go
@ -18,7 +18,7 @@ var _ = fs.NodeReadlinker(&File{})

 func (dir *Dir) Symlink(ctx context.Context, req *fuse.SymlinkRequest) (fs.Node, error) {

-	glog.V(3).Infof("Symlink: %v/%v to %v", dir.FullPath(), req.NewName, req.Target)
+	glog.V(4).Infof("Symlink: %v/%v to %v", dir.FullPath(), req.NewName, req.Target)

 	request := &filer_pb.CreateEntryRequest{
 		Directory: dir.FullPath(),
@ -63,7 +63,7 @@ func (file *File) Readlink(ctx context.Context, req *fuse.ReadlinkRequest) (stri
 		return "", fuse.Errno(syscall.EINVAL)
 	}

-	glog.V(3).Infof("Readlink: %v/%v => %v", file.dir.FullPath(), file.Name, file.entry.Attributes.SymlinkTarget)
+	glog.V(4).Infof("Readlink: %v/%v => %v", file.dir.FullPath(), file.Name, file.entry.Attributes.SymlinkTarget)

 	return file.entry.Attributes.SymlinkTarget, nil

--- a/weed/filesys/dir_rename.go
+++ b/weed/filesys/dir_rename.go
@ -63,7 +63,17 @@ func (dir *Dir) Rename(ctx context.Context, req *fuse.RenameRequest, newDirector

 	// fmt.Printf("rename path: %v => %v\n", oldPath, newPath)
 	dir.wfs.fsNodeCache.Move(oldPath, newPath)
-	delete(dir.wfs.handles, oldPath.AsInode())
+
+	// change file handle
+	dir.wfs.handlesLock.Lock()
+	defer dir.wfs.handlesLock.Unlock()
+	inodeId := oldPath.AsInode()
+	existingHandle, found := dir.wfs.handles[inodeId]
+	if !found || existingHandle == nil {
+		return err
+	}
+	delete(dir.wfs.handles, inodeId)
+	dir.wfs.handles[newPath.AsInode()] = existingHandle

 	return err
 }
--- a/weed/filesys/dirty_page.go
+++ b/weed/filesys/dirty_page.go
@ -25,9 +25,6 @@ func newDirtyPages(file *File) *ContinuousDirtyPages {
 	}
 }

-func (pages *ContinuousDirtyPages) releaseResource() {
-}
-
 var counter = int32(0)

 func (pages *ContinuousDirtyPages) AddPage(offset int64, data []byte) (chunks []*filer_pb.FileChunk, err error) {
@ -35,7 +32,7 @@ func (pages *ContinuousDirtyPages) AddPage(offset int64, data []byte) (chunks []
 	pages.lock.Lock()
 	defer pages.lock.Unlock()

-	glog.V(3).Infof("%s AddPage [%d,%d)", pages.f.fullpath(), offset, offset+int64(len(data)))
+	glog.V(5).Infof("%s AddPage [%d,%d) of %d bytes", pages.f.fullpath(), offset, offset+int64(len(data)), pages.f.entry.Attributes.FileSize)

 	if len(data) > int(pages.f.wfs.option.ChunkSizeLimit) {
 		// this is more than what buffer can hold.
@ -121,14 +118,16 @@ func (pages *ContinuousDirtyPages) saveExistingLargestPageToStorage() (chunk *fi
 		return nil, false, nil
 	}

+	fileSize := int64(pages.f.entry.Attributes.FileSize)
 	for {
-		chunk, err = pages.saveToStorage(maxList.ToReader(), maxList.Offset(), maxList.Size())
+		chunkSize := min(maxList.Size(), fileSize-maxList.Offset())
+		chunk, err = pages.saveToStorage(maxList.ToReader(), maxList.Offset(), chunkSize)
 		if err == nil {
 			hasSavedData = true
-			glog.V(3).Infof("%s saveToStorage [%d,%d) %s", pages.f.fullpath(), maxList.Offset(), maxList.Offset()+maxList.Size(), chunk.FileId)
+			glog.V(4).Infof("saveToStorage %s %s [%d,%d) of %d bytes", pages.f.fullpath(), chunk.GetFileIdString(), maxList.Offset(), maxList.Offset()+chunkSize, fileSize)
 			return
 		} else {
-			glog.V(0).Infof("%s saveToStorage [%d,%d): %v", pages.f.fullpath(), maxList.Offset(), maxList.Offset()+maxList.Size(), err)
+			glog.V(0).Infof("%s saveToStorage [%d,%d): %v", pages.f.fullpath(), maxList.Offset(), maxList.Offset()+chunkSize, err)
 			time.Sleep(5 * time.Second)
 		}
 	}
@ -139,6 +138,7 @@ func (pages *ContinuousDirtyPages) saveToStorage(reader io.Reader, offset int64,

 	dir, _ := pages.f.fullpath().DirAndName()

+	reader = io.LimitReader(reader, size)
 	chunk, collection, replication, err := pages.f.wfs.saveDataAsChunk(dir)(reader, pages.f.Name, offset)
 	if err != nil {
 		return nil, err
@ -149,6 +149,13 @@ func (pages *ContinuousDirtyPages) saveToStorage(reader io.Reader, offset int64,

 }

+// maxUint64 returns the larger of x and y.
+func maxUint64(x, y uint64) uint64 {
+	if y >= x {
+		return y
+	}
+	return x
+}
+
 func max(x, y int64) int64 {
 	if x > y {
 		return x
@ -162,11 +169,11 @@ func min(x, y int64) int64 {
 	return y
 }

-func (pages *ContinuousDirtyPages) ReadDirtyData(data []byte, startOffset int64) (offset int64, size int) {
+// ReadDirtyDataAt reads buffered dirty data for the range starting at
+// startOffset into data while holding pages.lock, and returns the maxStop
+// value reported by pages.intervals.ReadDataAt.
+func (pages *ContinuousDirtyPages) ReadDirtyDataAt(data []byte, startOffset int64) (maxStop int64) {

 	pages.lock.Lock()
 	defer pages.lock.Unlock()

-	return pages.intervals.ReadData(data, startOffset)
+	return pages.intervals.ReadDataAt(data, startOffset)

 }
--- a/weed/filesys/dirty_page_interval.go
+++ b/weed/filesys/dirty_page_interval.go
@ -3,7 +3,6 @@ package filesys
 import (
 	"bytes"
 	"io"
-	"math"
 )

 type IntervalNode struct {
@ -186,25 +185,15 @@ func (c *ContinuousIntervals) removeList(target *IntervalLinkedList) {

 }

-func (c *ContinuousIntervals) ReadData(data []byte, startOffset int64) (offset int64, size int) {
-	var minOffset int64 = math.MaxInt64
-	var maxStop int64
+// ReadDataAt visits every interval list that overlaps
+// [startOffset, startOffset+len(data)) and calls list.ReadData for the
+// overlapping range, passing data from the overlap's position onward. It
+// returns the largest stop offset among the overlaps, or 0 when no list
+// overlaps the range.
+func (c *ContinuousIntervals) ReadDataAt(data []byte, startOffset int64) (maxStop int64) {
 	for _, list := range c.lists {
 		start := max(startOffset, list.Offset())
 		stop := min(startOffset+int64(len(data)), list.Offset()+list.Size())
-		if start <= stop {
+		// strict comparison: an empty overlap (start == stop) is skipped
+		if start < stop {
 			list.ReadData(data[start-startOffset:], start, stop)
-			minOffset = min(minOffset, start)
 			maxStop = max(maxStop, stop)
 		}
 	}
-
-	if minOffset == math.MaxInt64 {
-		return 0, 0
-	}
-
-	offset = minOffset
-	size = int(maxStop - offset)
 	return
 }

--- a/weed/filesys/file.go
+++ b/weed/filesys/file.go
@ -7,12 +7,13 @@ import (
 	"sort"
 	"time"

+	"github.com/seaweedfs/fuse"
+	"github.com/seaweedfs/fuse/fs"
+
 	"github.com/chrislusf/seaweedfs/weed/filer2"
 	"github.com/chrislusf/seaweedfs/weed/glog"
 	"github.com/chrislusf/seaweedfs/weed/pb/filer_pb"
 	"github.com/chrislusf/seaweedfs/weed/util"
-	"github.com/seaweedfs/fuse"
-	"github.com/seaweedfs/fuse/fs"
 )

 const blockSize = 512
@ -35,6 +36,7 @@ type File struct {
 	entryViewCache []filer2.VisibleInterval
 	isOpen         int
 	reader         io.ReaderAt
+	dirtyMetadata  bool
 }

 func (file *File) fullpath() util.FullPath {
@ -43,7 +45,7 @@ func (file *File) fullpath() util.FullPath {

 func (file *File) Attr(ctx context.Context, attr *fuse.Attr) error {

-	glog.V(4).Infof("file Attr %s, open:%v, existing attr: %+v", file.fullpath(), file.isOpen, attr)
+	glog.V(5).Infof("file Attr %s, open:%v, existing attr: %+v", file.fullpath(), file.isOpen, attr)

 	if file.isOpen <= 0 {
 		if err := file.maybeLoadEntry(ctx); err != nil {
@ -54,7 +56,7 @@ func (file *File) Attr(ctx context.Context, attr *fuse.Attr) error {
 	attr.Inode = file.fullpath().AsInode()
 	attr.Valid = time.Second
 	attr.Mode = os.FileMode(file.entry.Attributes.FileMode)
-	attr.Size = filer2.TotalSize(file.entry.Chunks)
+	attr.Size = filer2.FileSize(file.entry)
 	if file.isOpen > 0 {
 		attr.Size = file.entry.Attributes.FileSize
 		glog.V(4).Infof("file Attr %s, open:%v, size: %d", file.fullpath(), file.isOpen, attr.Size)
@ -91,7 +93,7 @@ func (file *File) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.Op

 	resp.Handle = fuse.HandleID(handle.handle)

-	glog.V(3).Infof("%v file open handle id = %d", file.fullpath(), handle.handle)
+	glog.V(4).Infof("%v file open handle id = %d", file.fullpath(), handle.handle)

 	return handle, nil

@ -99,7 +101,7 @@ func (file *File) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.Op

 func (file *File) Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *fuse.SetattrResponse) error {

-	glog.V(3).Infof("%v file setattr %+v, old:%+v", file.fullpath(), req, file.entry.Attributes)
+	glog.V(5).Infof("%v file setattr %+v", file.fullpath(), req)

 	if err := file.maybeLoadEntry(ctx); err != nil {
 		return err
@ -107,49 +109,72 @@ func (file *File) Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *f

 	if req.Valid.Size() {

-		glog.V(3).Infof("%v file setattr set size=%v", file.fullpath(), req.Size)
+		glog.V(4).Infof("%v file setattr set size=%v chunks=%d", file.fullpath(), req.Size, len(file.entry.Chunks))
 		if req.Size < filer2.TotalSize(file.entry.Chunks) {
 			// fmt.Printf("truncate %v \n", fullPath)
 			var chunks []*filer_pb.FileChunk
+			var truncatedChunks []*filer_pb.FileChunk
 			for _, chunk := range file.entry.Chunks {
 				int64Size := int64(chunk.Size)
 				if chunk.Offset+int64Size > int64(req.Size) {
+					// this chunk is truncated
 					int64Size = int64(req.Size) - chunk.Offset
-				}
-				if int64Size > 0 {
-					chunks = append(chunks, chunk)
+					if int64Size > 0 {
+						chunks = append(chunks, chunk)
+						glog.V(4).Infof("truncated chunk %+v from %d to %d\n", chunk.GetFileIdString(), chunk.Size, int64Size)
+						chunk.Size = uint64(int64Size)
+					} else {
+						glog.V(4).Infof("truncated whole chunk %+v\n", chunk.GetFileIdString())
+						truncatedChunks = append(truncatedChunks, chunk)
+					}
 				}
 			}
+			file.wfs.deleteFileChunks(truncatedChunks)
 			file.entry.Chunks = chunks
 			file.entryViewCache = nil
 			file.reader = nil
 		}
 		file.entry.Attributes.FileSize = req.Size
+		file.dirtyMetadata = true
 	}
+
 	if req.Valid.Mode() {
 		file.entry.Attributes.FileMode = uint32(req.Mode)
+		file.dirtyMetadata = true
 	}

 	if req.Valid.Uid() {
 		file.entry.Attributes.Uid = req.Uid
+		file.dirtyMetadata = true
 	}

 	if req.Valid.Gid() {
 		file.entry.Attributes.Gid = req.Gid
+		file.dirtyMetadata = true
 	}

 	if req.Valid.Crtime() {
 		file.entry.Attributes.Crtime = req.Crtime.Unix()
+		file.dirtyMetadata = true
 	}

 	if req.Valid.Mtime() {
 		file.entry.Attributes.Mtime = req.Mtime.Unix()
+		file.dirtyMetadata = true
+	}
+
+	if req.Valid.Handle() {
+		// fmt.Printf("file handle => %d\n", req.Handle)
 	}

 	if file.isOpen > 0 {
 		return nil
 	}

+	if !file.dirtyMetadata {
+		return nil
+	}
+
 	return file.saveEntry()

 }
@ -205,14 +230,14 @@ func (file *File) Listxattr(ctx context.Context, req *fuse.ListxattrRequest, res
+// Fsync logs the request and returns nil; it performs no write itself.
 func (file *File) Fsync(ctx context.Context, req *fuse.FsyncRequest) error {
 	// fsync works at OS level
 	// write the file chunks to the filerGrpcAddress
-	glog.V(3).Infof("%s/%s fsync file %+v", file.dir.FullPath(), file.Name, req)
+	glog.V(4).Infof("%s/%s fsync file %+v", file.dir.FullPath(), file.Name, req)

 	return nil
 }

+// Forget removes this file's node, keyed by its directory path and name, from
+// the mount's fsNodeCache.
 func (file *File) Forget() {
 	t := util.NewFullPath(file.dir.FullPath(), file.Name)
-	glog.V(3).Infof("Forget file %s", t)
+	glog.V(5).Infof("Forget file %s", t)
 	file.wfs.fsNodeCache.DeleteFsNode(t)
 }

@ -246,7 +271,7 @@ func (file *File) addChunks(chunks []*filer_pb.FileChunk) {

 	file.reader = nil

-	glog.V(3).Infof("%s existing %d chunks adds %d more", file.fullpath(), len(file.entry.Chunks), len(chunks))
+	glog.V(4).Infof("%s existing %d chunks adds %d more", file.fullpath(), len(file.entry.Chunks), len(chunks))

 	file.entry.Chunks = append(file.entry.Chunks, chunks...)
 }
@ -265,10 +290,10 @@ func (file *File) saveEntry() error {
 			Entry:     file.entry,
 		}

-		glog.V(1).Infof("save file entry: %v", request)
+		glog.V(4).Infof("save file entry: %v", request)
 		_, err := client.UpdateEntry(context.Background(), request)
 		if err != nil {
-			glog.V(0).Infof("UpdateEntry file %s/%s: %v", file.dir.FullPath(), file.Name, err)
+			glog.Errorf("UpdateEntry file %s/%s: %v", file.dir.FullPath(), file.Name, err)
 			return fuse.EIO
 		}

--- a/weed/filesys/filehandle.go
+++ b/weed/filesys/filehandle.go
@ -19,10 +19,9 @@ import (

 type FileHandle struct {
 	// cache file has been written to
-	dirtyPages    *ContinuousDirtyPages
-	contentType   string
-	dirtyMetadata bool
-	handle        uint64
+	dirtyPages  *ContinuousDirtyPages
+	contentType string
+	handle      uint64

 	f         *File
 	RequestId fuse.RequestID // unique ID for request
@ -40,7 +39,7 @@ func newFileHandle(file *File, uid, gid uint32) *FileHandle {
 		Gid:        gid,
 	}
 	if fh.f.entry != nil {
-		fh.f.entry.Attributes.FileSize = filer2.TotalSize(fh.f.entry.Chunks)
+		fh.f.entry.Attributes.FileSize = filer2.FileSize(fh.f.entry)
 	}
 	return fh
 }
@ -55,38 +54,45 @@ var _ = fs.HandleReleaser(&FileHandle{})

 func (fh *FileHandle) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) error {

-	glog.V(4).Infof("%s read fh %d: [%d,%d)", fh.f.fullpath(), fh.handle, req.Offset, req.Offset+int64(req.Size))
+	glog.V(4).Infof("%s read fh %d: [%d,%d) size %d resp.Data cap=%d", fh.f.fullpath(), fh.handle, req.Offset, req.Offset+int64(req.Size), req.Size, cap(resp.Data))

-	buff := make([]byte, req.Size)
+	buff := resp.Data[:cap(resp.Data)]
+	if req.Size > cap(resp.Data) {
+		// should not happen
+		buff = make([]byte, req.Size)
+	}

 	totalRead, err := fh.readFromChunks(buff, req.Offset)
 	if err == nil {
-		dirtyOffset, dirtySize := fh.readFromDirtyPages(buff, req.Offset)
-		if totalRead+req.Offset < dirtyOffset+int64(dirtySize) {
-			totalRead = dirtyOffset + int64(dirtySize) - req.Offset
-		}
+		maxStop := fh.readFromDirtyPages(buff, req.Offset)
+		totalRead = max(maxStop - req.Offset, totalRead)
 	}

-	resp.Data = buff[:totalRead]
-
 	if err != nil {
 		glog.Errorf("file handle read %s: %v", fh.f.fullpath(), err)
 		return fuse.EIO
 	}

+	if totalRead > int64(len(buff)) {
+		glog.Warningf("%s FileHandle Read %d: [%d,%d) size %d totalRead %d", fh.f.fullpath(), fh.handle, req.Offset, req.Offset+int64(req.Size), req.Size, totalRead)
+		totalRead = min(int64(len(buff)), totalRead)
+	}
+	resp.Data = buff[:totalRead]
+
 	return err
 }

-func (fh *FileHandle) readFromDirtyPages(buff []byte, startOffset int64) (offset int64, size int) {
-	return fh.dirtyPages.ReadDirtyData(buff, startOffset)
+func (fh *FileHandle) readFromDirtyPages(buff []byte, startOffset int64) (maxStop int64) {
+	return fh.dirtyPages.ReadDirtyDataAt(buff, startOffset)
 }

 func (fh *FileHandle) readFromChunks(buff []byte, offset int64) (int64, error) {

-	// this value should come from the filer instead of the old f
-	if len(fh.f.entry.Chunks) == 0 {
+	fileSize := int64(filer2.FileSize(fh.f.entry))
+
+	if fileSize == 0 {
 		glog.V(1).Infof("empty fh %v", fh.f.fullpath())
-		return 0, nil
+		return 0, io.EOF
 	}

 	var chunkResolveErr error
@ -99,8 +105,8 @@ func (fh *FileHandle) readFromChunks(buff []byte, offset int64) (int64, error) {
 	}

 	if fh.f.reader == nil {
-		chunkViews := filer2.ViewFromVisibleIntervals(fh.f.entryViewCache, 0, math.MaxInt32)
-		fh.f.reader = filer2.NewChunkReaderAtFromClient(fh.f.wfs, chunkViews, fh.f.wfs.chunkCache)
+		chunkViews := filer2.ViewFromVisibleIntervals(fh.f.entryViewCache, 0, math.MaxInt64)
+		fh.f.reader = filer2.NewChunkReaderAtFromClient(fh.f.wfs, chunkViews, fh.f.wfs.chunkCache, fileSize)
 	}

 	totalRead, err := fh.f.reader.ReadAt(buff, offset)
@ -113,7 +119,7 @@ func (fh *FileHandle) readFromChunks(buff []byte, offset int64) (int64, error) {
 		glog.Errorf("file handle read %s: %v", fh.f.fullpath(), err)
 	}

-	// glog.V(0).Infof("file handle read %s [%d,%d] %d : %v", fh.f.fullpath(), offset, offset+int64(totalRead), totalRead, err)
+	glog.V(4).Infof("file handle read %s [%d,%d] %d : %v", fh.f.fullpath(), offset, offset+int64(totalRead), totalRead, err)

 	return int64(totalRead), err
 }
@ -126,7 +132,7 @@ func (fh *FileHandle) Write(ctx context.Context, req *fuse.WriteRequest, resp *f
 	copy(data, req.Data)

 	fh.f.entry.Attributes.FileSize = uint64(max(req.Offset+int64(len(data)), int64(fh.f.entry.Attributes.FileSize)))
-	// glog.V(0).Infof("%v write [%d,%d)", fh.f.fullpath(), req.Offset, req.Offset+int64(len(req.Data)))
+	glog.V(4).Infof("%v write [%d,%d) %d", fh.f.fullpath(), req.Offset, req.Offset+int64(len(req.Data)), len(req.Data))

 	chunks, err := fh.dirtyPages.AddPage(req.Offset, data)
 	if err != nil {
@ -139,14 +145,14 @@ func (fh *FileHandle) Write(ctx context.Context, req *fuse.WriteRequest, resp *f
 	if req.Offset == 0 {
 		// detect mime type
 		fh.contentType = http.DetectContentType(data)
-		fh.dirtyMetadata = true
+		fh.f.dirtyMetadata = true
 	}

 	if len(chunks) > 0 {

 		fh.f.addChunks(chunks)

-		fh.dirtyMetadata = true
+		fh.f.dirtyMetadata = true
 	}

 	return nil
@ -154,24 +160,28 @@ func (fh *FileHandle) Write(ctx context.Context, req *fuse.WriteRequest, resp *f

 func (fh *FileHandle) Release(ctx context.Context, req *fuse.ReleaseRequest) error {

-	glog.V(4).Infof("%v release fh %d", fh.f.fullpath(), fh.handle)
+	glog.V(4).Infof("Release %v fh %d", fh.f.fullpath(), fh.handle)

 	fh.f.isOpen--

 	if fh.f.isOpen <= 0 {
-		fh.dirtyPages.releaseResource()
+		fh.doFlush(ctx, req.Header)
 		fh.f.wfs.ReleaseHandle(fh.f.fullpath(), fuse.HandleID(fh.handle))
+		fh.f.entryViewCache = nil
+		fh.f.reader = nil
 	}
-	fh.f.entryViewCache = nil
-	fh.f.reader = nil

 	return nil
 }

 func (fh *FileHandle) Flush(ctx context.Context, req *fuse.FlushRequest) error {
+	return fh.doFlush(ctx, req.Header)
+}
+
+func (fh *FileHandle) doFlush(ctx context.Context, header fuse.Header) error {
 	// fflush works at fh level
 	// send the data to the OS
-	glog.V(4).Infof("%s fh %d flush %v", fh.f.fullpath(), fh.handle, req)
+	glog.V(4).Infof("doFlush %s fh %d %v", fh.f.fullpath(), fh.handle, header)

 	chunks, err := fh.dirtyPages.FlushToStorage()
 	if err != nil {
@ -181,10 +191,10 @@ func (fh *FileHandle) Flush(ctx context.Context, req *fuse.FlushRequest) error {

 	if len(chunks) > 0 {
 		fh.f.addChunks(chunks)
-		fh.dirtyMetadata = true
+		fh.f.dirtyMetadata = true
 	}

-	if !fh.dirtyMetadata {
+	if !fh.f.dirtyMetadata {
 		return nil
 	}

@ -193,10 +203,10 @@ func (fh *FileHandle) Flush(ctx context.Context, req *fuse.FlushRequest) error {
 		if fh.f.entry.Attributes != nil {
 			fh.f.entry.Attributes.Mime = fh.contentType
 			if fh.f.entry.Attributes.Uid == 0 {
-				fh.f.entry.Attributes.Uid = req.Uid
+				fh.f.entry.Attributes.Uid = header.Uid
 			}
 			if fh.f.entry.Attributes.Gid == 0 {
-				fh.f.entry.Attributes.Gid = req.Gid
+				fh.f.entry.Attributes.Gid = header.Gid
 			}
 			if fh.f.entry.Attributes.Crtime == 0 {
 				fh.f.entry.Attributes.Crtime = time.Now().Unix()
@ -212,9 +222,9 @@ func (fh *FileHandle) Flush(ctx context.Context, req *fuse.FlushRequest) error {
 			Entry:     fh.f.entry,
 		}

-		glog.V(3).Infof("%s set chunks: %v", fh.f.fullpath(), len(fh.f.entry.Chunks))
+		glog.V(4).Infof("%s set chunks: %v", fh.f.fullpath(), len(fh.f.entry.Chunks))
 		for i, chunk := range fh.f.entry.Chunks {
-			glog.V(3).Infof("%s chunks %d: %v [%d,%d)", fh.f.fullpath(), i, chunk.FileId, chunk.Offset, chunk.Offset+int64(chunk.Size))
+			glog.V(4).Infof("%s chunks %d: %v [%d,%d)", fh.f.fullpath(), i, chunk.GetFileIdString(), chunk.Offset, chunk.Offset+int64(chunk.Size))
 		}

 		chunks, garbages := filer2.CompactFileChunks(filer2.LookupFn(fh.f.wfs), fh.f.entry.Chunks)
@ -239,14 +249,14 @@ func (fh *FileHandle) Flush(ctx context.Context, req *fuse.FlushRequest) error {

 		fh.f.wfs.deleteFileChunks(garbages)
 		for i, chunk := range garbages {
-			glog.V(3).Infof("garbage %s chunks %d: %v [%d,%d)", fh.f.fullpath(), i, chunk.FileId, chunk.Offset, chunk.Offset+int64(chunk.Size))
+			glog.V(4).Infof("garbage %s chunks %d: %v [%d,%d)", fh.f.fullpath(), i, chunk.GetFileIdString(), chunk.Offset, chunk.Offset+int64(chunk.Size))
 		}

 		return nil
 	})

 	if err == nil {
-		fh.dirtyMetadata = false
+		fh.f.dirtyMetadata = false
 	}

 	if err != nil {
--- a/weed/filesys/fscache.go
+++ b/weed/filesys/fscache.go
@ -3,8 +3,9 @@ package filesys
 import (
 	"sync"

-	"github.com/chrislusf/seaweedfs/weed/util"
 	"github.com/seaweedfs/fuse/fs"
+
+	"github.com/chrislusf/seaweedfs/weed/util"
 )

 type FsCache struct {
@ -118,7 +119,6 @@ func (c *FsCache) Move(oldPath util.FullPath, newPath util.FullPath) *FsNode {
 		target = target.ensureChild(p)
 	}
 	parent := target.parent
-	src.name = target.name
 	if dir, ok := src.node.(*Dir); ok {
 		dir.name = target.name // target is not Dir, but a shortcut
 	}
@ -132,6 +132,7 @@ func (c *FsCache) Move(oldPath util.FullPath, newPath util.FullPath) *FsNode {

 	target.deleteSelf()

+	src.name = target.name
 	src.connectToParent(parent)

 	return src
@ -144,10 +145,14 @@ func (n *FsNode) connectToParent(parent *FsNode) {
 		oldNode.deleteSelf()
 	}
 	if dir, ok := n.node.(*Dir); ok {
-		dir.parent = parent.node.(*Dir)
+		if parent.node != nil {
+			dir.parent = parent.node.(*Dir)
+		}
 	}
 	if f, ok := n.node.(*File); ok {
-		f.dir = parent.node.(*Dir)
+		if parent.node != nil {
+			f.dir = parent.node.(*Dir)
+		}
 	}
 	n.childrenLock.Lock()
 	parent.children[n.name] = n
--- a/weed/filesys/fscache_test.go
+++ b/weed/filesys/fscache_test.go
@ -94,3 +94,24 @@ func TestFsCacheMove(t *testing.T) {
 	}

 }
+
+
+func TestFsCacheMove2(t *testing.T) {
+
+	cache := newFsCache(nil)
+
+	cache.SetFsNode(util.FullPath("/a/b/d"), &File{Name: "dd"})
+	cache.SetFsNode(util.FullPath("/a/b/e"), &File{Name: "ee"})
+
+	cache.Move(util.FullPath("/a/b/d"), util.FullPath("/a/b/e"))
+
+	d := cache.GetFsNode(util.FullPath("/a/b/e"))
+	if d == nil {
+		t.Errorf("unexpected nil node!")
+	}
+	if d.(*File).Name != "e" {
+		t.Errorf("unexpected node!")
+	}
+
+}
+
--- a/weed/filesys/meta_cache/meta_cache.go
+++ b/weed/filesys/meta_cache/meta_cache.go
@ -61,8 +61,13 @@ func (mc *MetaCache) AtomicUpdateEntry(ctx context.Context, oldPath util.FullPat
 	oldDir, _ := oldPath.DirAndName()
 	if mc.visitedBoundary.HasVisited(util.FullPath(oldDir)) {
 		if oldPath != "" {
-			if err := mc.actualStore.DeleteEntry(ctx, oldPath); err != nil {
-				return err
+			if newEntry != nil && oldPath == newEntry.FullPath {
+				// skip the unnecessary deletion
+				// leave the update to the following InsertEntry operation
+			} else {
+				if err := mc.actualStore.DeleteEntry(ctx, oldPath); err != nil {
+					return err
+				}
 			}
 		}
 	} else {
--- a/weed/filesys/meta_cache/meta_cache_init.go
+++ b/weed/filesys/meta_cache/meta_cache_init.go
@ -14,7 +14,7 @@ func EnsureVisited(mc *MetaCache, client filer_pb.FilerClient, dirPath util.Full

 	mc.visitedBoundary.EnsureVisited(dirPath, func(path util.FullPath) (childDirectories []string, err error) {

-		glog.V(2).Infof("ReadDirAllEntries %s ...", path)
+		glog.V(5).Infof("ReadDirAllEntries %s ...", path)

 		err = filer_pb.ReadDirAllEntries(client, dirPath, "", func(pbEntry *filer_pb.Entry, isLast bool) error {
 			entry := filer2.FromPbEntry(string(dirPath), pbEntry)
--- a/weed/filesys/wfs.go
+++ b/weed/filesys/wfs.go
@ -65,7 +65,7 @@ type WFS struct {
 	root        fs.Node
 	fsNodeCache *FsCache

-	chunkCache *chunk_cache.ChunkCache
+	chunkCache *chunk_cache.TieredChunkCache
 	metaCache  *meta_cache.MetaCache
 }
 type statsCache struct {
@ -87,10 +87,7 @@ func NewSeaweedFileSystem(option *Option) *WFS {
 	cacheDir := path.Join(option.CacheDir, cacheUniqueId)
 	if option.CacheSizeMB > 0 {
 		os.MkdirAll(cacheDir, 0755)
-		wfs.chunkCache = chunk_cache.NewChunkCache(256, cacheDir, option.CacheSizeMB)
-		grace.OnInterrupt(func() {
-			wfs.chunkCache.Shutdown()
-		})
+		wfs.chunkCache = chunk_cache.NewTieredChunkCache(256, cacheDir, option.CacheSizeMB)
 	}

 	wfs.metaCache = meta_cache.NewMetaCache(path.Join(cacheDir, "meta"))
@ -113,7 +110,7 @@ func (wfs *WFS) Root() (fs.Node, error) {
 func (wfs *WFS) AcquireHandle(file *File, uid, gid uint32) (fileHandle *FileHandle) {

 	fullpath := file.fullpath()
-	glog.V(4).Infof("%s AcquireHandle uid=%d gid=%d", fullpath, uid, gid)
+	glog.V(4).Infof("AcquireHandle %s uid=%d gid=%d", fullpath, uid, gid)

 	wfs.handlesLock.Lock()
 	defer wfs.handlesLock.Unlock()
@ -127,7 +124,6 @@ func (wfs *WFS) AcquireHandle(file *File, uid, gid uint32) (fileHandle *FileHand
 	fileHandle = newFileHandle(file, uid, gid)
 	wfs.handles[inodeId] = fileHandle
 	fileHandle.handle = inodeId
-	glog.V(4).Infof("%s new fh %d", fullpath, fileHandle.handle)

 	return
 }
@ -146,7 +142,7 @@ func (wfs *WFS) ReleaseHandle(fullpath util.FullPath, handleId fuse.HandleID) {
 // Statfs is called to obtain file system metadata. Implements fuse.FSStatfser
 func (wfs *WFS) Statfs(ctx context.Context, req *fuse.StatfsRequest, resp *fuse.StatfsResponse) error {

-	glog.V(4).Infof("reading fs stats: %+v", req)
+	glog.V(5).Infof("reading fs stats: %+v", req)

 	if wfs.stats.lastChecked < time.Now().Unix()-20 {

@ -158,13 +154,13 @@ func (wfs *WFS) Statfs(ctx context.Context, req *fuse.StatfsRequest, resp *fuse.
 				Ttl:         fmt.Sprintf("%ds", wfs.option.TtlSec),
 			}

-			glog.V(4).Infof("reading filer stats: %+v", request)
+			glog.V(5).Infof("reading filer stats: %+v", request)
 			resp, err := client.Statistics(context.Background(), request)
 			if err != nil {
 				glog.V(0).Infof("reading filer stats %v: %v", request, err)
 				return err
 			}
-			glog.V(4).Infof("read filer stats: %+v", resp)
+			glog.V(5).Infof("read filer stats: %+v", resp)

 			wfs.stats.TotalSize = resp.TotalSize
 			wfs.stats.UsedSize = resp.UsedSize
--- a/weed/filesys/wfs_deletion.go
+++ b/weed/filesys/wfs_deletion.go
@ -38,7 +38,7 @@ func (wfs *WFS) deleteFileIds(grpcDialOption grpc.DialOption, client filer_pb.Se

 		m := make(map[string]operation.LookupResult)

-		glog.V(4).Infof("remove file lookup volume id locations: %v", vids)
+		glog.V(5).Infof("deleteFileIds lookup volume id locations: %v", vids)
 		resp, err := client.LookupVolume(context.Background(), &filer_pb.LookupVolumeRequest{
 			VolumeIds: vids,
 		})
--- a/weed/operation/upload_content.go
+++ b/weed/operation/upload_content.go
@ -33,6 +33,7 @@ type UploadResult struct {
 }

 func (uploadResult *UploadResult) ToPbFileChunk(fileId string, offset int64) *filer_pb.FileChunk {
+	fid, _ := filer_pb.ToFileIdObject(fileId)
 	return &filer_pb.FileChunk{
 		FileId:       fileId,
 		Offset:       offset,
@ -41,6 +42,7 @@ func (uploadResult *UploadResult) ToPbFileChunk(fileId string, offset int64) *fi
 		ETag:         uploadResult.ETag,
 		CipherKey:    uploadResult.CipherKey,
 		IsCompressed: uploadResult.Gzip > 0,
+		Fid:          fid,
 	}
 }

@ -63,7 +65,7 @@ var fileNameEscaper = strings.NewReplacer("\\", "\\\\", "\"", "\\\"")

 // Upload sends a POST request to a volume server to upload the content with adjustable compression level
 func UploadData(uploadUrl string, filename string, cipher bool, data []byte, isInputCompressed bool, mtype string, pairMap map[string]string, jwt security.EncodedJwt) (uploadResult *UploadResult, err error) {
-	uploadResult, err = doUploadData(uploadUrl, filename, cipher, data, isInputCompressed, mtype, pairMap, jwt)
+	uploadResult, err = retriedUploadData(uploadUrl, filename, cipher, data, isInputCompressed, mtype, pairMap, jwt)
 	return
 }

@ -79,10 +81,22 @@ func doUpload(uploadUrl string, filename string, cipher bool, reader io.Reader,
 		err = fmt.Errorf("read input: %v", err)
 		return
 	}
-	uploadResult, uploadErr := doUploadData(uploadUrl, filename, cipher, data, isInputCompressed, mtype, pairMap, jwt)
+	uploadResult, uploadErr := retriedUploadData(uploadUrl, filename, cipher, data, isInputCompressed, mtype, pairMap, jwt)
 	return uploadResult, uploadErr, data
 }

+func retriedUploadData(uploadUrl string, filename string, cipher bool, data []byte, isInputCompressed bool, mtype string, pairMap map[string]string, jwt security.EncodedJwt) (uploadResult *UploadResult, err error) {
+	for i := 0; i < 3; i++ {
+		uploadResult, err = doUploadData(uploadUrl, filename, cipher, data, isInputCompressed, mtype, pairMap, jwt)
+		if err == nil {
+			return
+		} else {
+			glog.Warningf("uploading to %s: %v", uploadUrl, err)
+		}
+	}
+	return
+}
+
 func doUploadData(uploadUrl string, filename string, cipher bool, data []byte, isInputCompressed bool, mtype string, pairMap map[string]string, jwt security.EncodedJwt) (uploadResult *UploadResult, err error) {
 	contentIsGzipped := isInputCompressed
 	shouldGzipNow := false
--- a/weed/pb/filer_pb/filer_client.go
+++ b/weed/pb/filer_pb/filer_client.go
@ -7,6 +7,7 @@ import (
 	"io"
 	"math"
 	"os"
+	"strings"
 	"time"

 	"github.com/chrislusf/seaweedfs/weed/glog"
@ -82,7 +83,7 @@ func doList(filerClient FilerClient, fullDirPath util.FullPath, prefix string, f
 			InclusiveStartFrom: inclusive,
 		}

-		glog.V(3).Infof("read directory: %v", request)
+		glog.V(5).Infof("read directory: %v", request)
 		ctx, cancel := context.WithCancel(context.Background())
 		stream, err := client.ListEntries(ctx, request)
 		if err != nil {
@ -224,9 +225,15 @@ func Remove(filerClient FilerClient, parentDirectoryPath, name string, isDeleteD
 			IgnoreRecursiveError: ignoreRecursiveErr,
 			IsFromOtherCluster:   isFromOtherCluster,
 		}); err != nil {
+			if strings.Contains(err.Error(), ErrNotFound.Error()){
+				return nil
+			}
 			return err
 		} else {
 			if resp.Error != "" {
+				if strings.Contains(resp.Error, ErrNotFound.Error()){
+					return nil
+				}
 				return errors.New(resp.Error)
 			}
 		}
--- a/weed/pb/filer_pb/filer_pb_helper.go
+++ b/weed/pb/filer_pb/filer_pb_helper.go
@ -10,7 +10,7 @@ import (
 	"github.com/chrislusf/seaweedfs/weed/storage/needle"
 )

-func toFileIdObject(fileIdStr string) (*FileId, error) {
+func ToFileIdObject(fileIdStr string) (*FileId, error) {
 	t, err := needle.ParseFileIdFromString(fileIdStr)
 	if err != nil {
 		return nil, err
@ -43,14 +43,14 @@ func BeforeEntrySerialization(chunks []*FileChunk) {
 	for _, chunk := range chunks {

 		if chunk.FileId != "" {
-			if fid, err := toFileIdObject(chunk.FileId); err == nil {
+			if fid, err := ToFileIdObject(chunk.FileId); err == nil {
 				chunk.Fid = fid
 				chunk.FileId = ""
 			}
 		}

 		if chunk.SourceFileId != "" {
-			if fid, err := toFileIdObject(chunk.SourceFileId); err == nil {
+			if fid, err := ToFileIdObject(chunk.SourceFileId); err == nil {
 				chunk.SourceFid = fid
 				chunk.SourceFileId = ""
 			}
@ -81,7 +81,7 @@ func CreateEntry(client SeaweedFilerClient, request *CreateEntryRequest) error {
 		return fmt.Errorf("CreateEntry: %v", err)
 	}
 	if resp.Error != "" {
-		glog.V(1).Infof("create entry %s/%s %v: %v", request.Directory, request.Entry.Name, request.OExcl, err)
+		glog.V(1).Infof("create entry %s/%s %v: %v", request.Directory, request.Entry.Name, request.OExcl, resp.Error)
 		return fmt.Errorf("CreateEntry : %v", resp.Error)
 	}
 	return nil
--- a/weed/pb/filer_pb/filer_pb_helper_test.go
+++ b/weed/pb/filer_pb/filer_pb_helper_test.go
@ -9,7 +9,7 @@ import (
 func TestFileIdSize(t *testing.T) {
 	fileIdStr := "11745,0293434534cbb9892b"

-	fid, _ := toFileIdObject(fileIdStr)
+	fid, _ := ToFileIdObject(fileIdStr)
 	bytes, _ := proto.Marshal(fid)

 	println(len(fileIdStr))
--- a/weed/pb/volume_server.proto
+++ b/weed/pb/volume_server.proto
@ -37,8 +37,12 @@ service VolumeServer {
    }
    rpc VolumeMarkReadonly (VolumeMarkReadonlyRequest) returns (VolumeMarkReadonlyResponse) {
    }
+    rpc VolumeMarkWritable (VolumeMarkWritableRequest) returns (VolumeMarkWritableResponse) {
+    }
    rpc VolumeConfigure (VolumeConfigureRequest) returns (VolumeConfigureResponse) {
    }
+    rpc VolumeStatus (VolumeStatusRequest) returns (VolumeStatusResponse) {
+    }

    // copy the .idx .dat files, and mount this volume
    rpc VolumeCopy (VolumeCopyRequest) returns (VolumeCopyResponse) {
@ -200,6 +204,12 @@ message VolumeMarkReadonlyRequest {
 message VolumeMarkReadonlyResponse {
 }

+message VolumeMarkWritableRequest {
+    uint32 volume_id = 1;
+}
+message VolumeMarkWritableResponse {
+}
+
 message VolumeConfigureRequest {
    uint32 volume_id = 1;
    string replication = 2;
@ -208,6 +218,13 @@ message VolumeConfigureResponse {
    string error = 1;
 }

+message VolumeStatusRequest {
+    uint32 volume_id = 1;
+}
+message VolumeStatusResponse {
+    bool is_read_only = 1;
+}
+
 message VolumeCopyRequest {
    uint32 volume_id = 1;
    string collection = 2;
--- a/weed/pb/volume_server_pb/volume_server.pb.go
+++ b/weed/pb/volume_server_pb/volume_server.pb.go
--- a/weed/replication/sink/azuresink/azure_sink.go
+++ b/weed/replication/sink/azuresink/azure_sink.go
@ -95,7 +95,7 @@ func (g *AzureSink) CreateEntry(key string, entry *filer_pb.Entry) error {
 		return nil
 	}

-	totalSize := filer2.TotalSize(entry.Chunks)
+	totalSize := filer2.FileSize(entry)
 	chunkViews := filer2.ViewFromChunks(g.filerSource.LookupFileId, entry.Chunks, 0, int64(totalSize))

 	// Create a URL that references a to-be-created blob in your
@ -115,7 +115,7 @@ func (g *AzureSink) CreateEntry(key string, entry *filer_pb.Entry) error {
 		}

 		var writeErr error
-		readErr := util.ReadUrlAsStream(fileUrl, nil, false, chunk.IsFullChunk(), chunk.Offset, int(chunk.Size), func(data []byte) {
+		readErr := util.ReadUrlAsStream(fileUrl+"?readDeleted=true", nil, false, chunk.IsFullChunk(), chunk.Offset, int(chunk.Size), func(data []byte) {
 			_, writeErr = appendBlobURL.AppendBlock(context.Background(), bytes.NewReader(data), azblob.AppendBlobAccessConditions{}, nil)
 		})

--- a/weed/replication/sink/b2sink/b2_sink.go
+++ b/weed/replication/sink/b2sink/b2_sink.go
@ -84,7 +84,7 @@ func (g *B2Sink) CreateEntry(key string, entry *filer_pb.Entry) error {
 		return nil
 	}

-	totalSize := filer2.TotalSize(entry.Chunks)
+	totalSize := filer2.FileSize(entry)
 	chunkViews := filer2.ViewFromChunks(g.filerSource.LookupFileId, entry.Chunks, 0, int64(totalSize))

 	bucket, err := g.client.Bucket(context.Background(), g.bucket)
@ -103,7 +103,7 @@ func (g *B2Sink) CreateEntry(key string, entry *filer_pb.Entry) error {
 		}

 		var writeErr error
-		readErr := util.ReadUrlAsStream(fileUrl, nil, false, chunk.IsFullChunk(), chunk.Offset, int(chunk.Size), func(data []byte) {
+		readErr := util.ReadUrlAsStream(fileUrl+"?readDeleted=true", nil, false, chunk.IsFullChunk(), chunk.Offset, int(chunk.Size), func(data []byte) {
 			_, err := writer.Write(data)
 			if err != nil {
 				writeErr = err
--- a/weed/replication/sink/gcssink/gcs_sink.go
+++ b/weed/replication/sink/gcssink/gcs_sink.go
@ -89,7 +89,7 @@ func (g *GcsSink) CreateEntry(key string, entry *filer_pb.Entry) error {
 		return nil
 	}

-	totalSize := filer2.TotalSize(entry.Chunks)
+	totalSize := filer2.FileSize(entry)
 	chunkViews := filer2.ViewFromChunks(g.filerSource.LookupFileId, entry.Chunks, 0, int64(totalSize))

 	wc := g.client.Bucket(g.bucket).Object(key).NewWriter(context.Background())
@ -101,7 +101,7 @@ func (g *GcsSink) CreateEntry(key string, entry *filer_pb.Entry) error {
 			return err
 		}

-		err = util.ReadUrlAsStream(fileUrl, nil, false, chunk.IsFullChunk(), chunk.Offset, int(chunk.Size), func(data []byte) {
+		err = util.ReadUrlAsStream(fileUrl+"?readDeleted=true", nil, false, chunk.IsFullChunk(), chunk.Offset, int(chunk.Size), func(data []byte) {
 			wc.Write(data)
 		})

--- a/weed/replication/sink/s3sink/s3_sink.go
+++ b/weed/replication/sink/s3sink/s3_sink.go
@ -107,7 +107,7 @@ func (s3sink *S3Sink) CreateEntry(key string, entry *filer_pb.Entry) error {
 		return err
 	}

-	totalSize := filer2.TotalSize(entry.Chunks)
+	totalSize := filer2.FileSize(entry)
 	chunkViews := filer2.ViewFromChunks(s3sink.filerSource.LookupFileId, entry.Chunks, 0, int64(totalSize))

 	parts := make([]*s3.CompletedPart, len(chunkViews))
--- a/Show More
+++ b/Show More