seaweedfs/weed/filer/filechunk_manifest.go

279 lines
7.9 KiB
Go
Raw Normal View History

2020-09-01 15:21:19 +08:00
package filer
import (
"bytes"
"fmt"
"io"
"math"
2021-09-08 10:29:42 +08:00
"net/url"
"strings"
2022-03-03 05:50:46 +08:00
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
2022-08-18 03:05:07 +08:00
"google.golang.org/protobuf/proto"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/util"
)
2020-07-20 18:34:06 +08:00
// ManifestBatch is the merge factor MaybeManifestize hands to
// doMaybeManifestize: the number of data chunks folded into a single
// manifest chunk.
const ManifestBatch = 10000
2022-03-03 05:50:46 +08:00
// bytesBufferPool recycles *bytes.Buffer values. A buffer taken from the
// pool may still hold data from a previous user, so callers Reset it
// before use.
var bytesBufferPool = sync.Pool{
	New: func() any { return &bytes.Buffer{} },
}
// HasChunkManifest reports whether at least one of the given chunks is
// flagged as a chunk manifest.
func HasChunkManifest(chunks []*filer_pb.FileChunk) bool {
	for i := range chunks {
		if chunks[i].IsChunkManifest {
			return true
		}
	}
	return false
}
2020-08-24 09:30:11 +08:00
func SeparateManifestChunks(chunks []*filer_pb.FileChunk) (manifestChunks, nonManifestChunks []*filer_pb.FileChunk) {
for _, c := range chunks {
if c.IsChunkManifest {
2020-08-24 09:30:11 +08:00
manifestChunks = append(manifestChunks, c)
} else {
nonManifestChunks = append(nonManifestChunks, c)
}
}
return
}
// ResolveChunkManifest expands the manifest chunks found in chunks,
// recursively, and collects every chunk whose byte range
// [Offset, Offset+Size) intersects [startOffset, stopOffset).
//
// dataChunks receives the non-manifest chunks; manifestChunks receives
// every manifest chunk that was expanded along the way. When a manifest
// cannot be resolved, the data chunks gathered so far are returned together
// with a nil manifest list and the error.
func ResolveChunkManifest(lookupFileIdFn wdclient.LookupFileIdFunctionType, chunks []*filer_pb.FileChunk, startOffset, stopOffset int64) (dataChunks, manifestChunks []*filer_pb.FileChunk, manifestResolveErr error) {
	// TODO maybe parallel this
	for _, candidate := range chunks {
		// Skip chunks that do not overlap the requested range at all.
		overlapStart := max(candidate.Offset, startOffset)
		overlapStop := min(candidate.Offset+int64(candidate.Size), stopOffset)
		if overlapStart >= overlapStop {
			continue
		}

		if !candidate.IsChunkManifest {
			dataChunks = append(dataChunks, candidate)
			continue
		}

		children, resolveErr := ResolveOneChunkManifest(lookupFileIdFn, candidate)
		if resolveErr != nil {
			return dataChunks, nil, resolveErr
		}
		manifestChunks = append(manifestChunks, candidate)

		// A manifest may itself list manifests, so descend into its children.
		nestedData, nestedManifests, nestedErr := ResolveChunkManifest(lookupFileIdFn, children, startOffset, stopOffset)
		if nestedErr != nil {
			return dataChunks, nil, nestedErr
		}
		dataChunks = append(dataChunks, nestedData...)
		manifestChunks = append(manifestChunks, nestedManifests...)
	}
	return dataChunks, manifestChunks, nil
}
func ResolveOneChunkManifest(lookupFileIdFn wdclient.LookupFileIdFunctionType, chunk *filer_pb.FileChunk) (dataChunks []*filer_pb.FileChunk, manifestResolveErr error) {
if !chunk.IsChunkManifest {
return
}
// IsChunkManifest
2022-03-03 05:50:46 +08:00
bytesBuffer := bytesBufferPool.Get().(*bytes.Buffer)
2022-03-06 21:06:04 +08:00
bytesBuffer.Reset()
2022-03-03 05:50:46 +08:00
defer bytesBufferPool.Put(bytesBuffer)
err := fetchWholeChunk(bytesBuffer, lookupFileIdFn, chunk.GetFileIdString(), chunk.CipherKey, chunk.IsCompressed)
if err != nil {
return nil, fmt.Errorf("fail to read manifest %s: %v", chunk.GetFileIdString(), err)
}
m := &filer_pb.FileChunkManifest{}
2022-03-03 05:50:46 +08:00
if err := proto.Unmarshal(bytesBuffer.Bytes(), m); err != nil {
return nil, fmt.Errorf("fail to unmarshal manifest %s: %v", chunk.GetFileIdString(), err)
}
// recursive
filer_pb.AfterEntryDeserialization(m.Chunks)
return m.Chunks, nil
}
2020-07-20 18:34:06 +08:00
// TODO fetch from cache for weed mount?
2022-03-03 05:50:46 +08:00
func fetchWholeChunk(bytesBuffer *bytes.Buffer, lookupFileIdFn wdclient.LookupFileIdFunctionType, fileId string, cipherKey []byte, isGzipped bool) error {
urlStrings, err := lookupFileIdFn(fileId)
if err != nil {
2020-07-20 18:34:06 +08:00
glog.Errorf("operation LookupFileId %s failed, err: %v", fileId, err)
2022-03-03 05:50:46 +08:00
return err
}
err = retriedStreamFetchChunkData(bytesBuffer, urlStrings, cipherKey, isGzipped, true, 0, 0)
if err != nil {
return err
}
2022-03-03 05:50:46 +08:00
return nil
2020-10-09 14:19:42 +08:00
}
// fetchChunkRange resolves fileId to its URLs via lookupFileIdFn and reads
// chunk data starting at offset into buffer, requesting at most len(buffer)
// bytes. It returns the number of bytes placed in buffer. A lookup failure
// is logged and returned with a zero count.
func fetchChunkRange(buffer []byte, lookupFileIdFn wdclient.LookupFileIdFunctionType, fileId string, cipherKey []byte, isGzipped bool, offset int64) (int, error) {
	locations, lookupErr := lookupFileIdFn(fileId)
	if lookupErr != nil {
		glog.Errorf("operation LookupFileId %s failed, err: %v", fileId, lookupErr)
		return 0, lookupErr
	}
	const isFullChunk = false // a ranged read, not the whole chunk
	return retriedFetchChunkData(buffer, locations, cipherKey, isGzipped, isFullChunk, offset)
}
2022-02-26 18:16:47 +08:00
func retriedFetchChunkData(buffer []byte, urlStrings []string, cipherKey []byte, isGzipped bool, isFullChunk bool, offset int64) (n int, err error) {
2020-10-09 14:19:42 +08:00
2020-10-13 15:29:46 +08:00
var shouldRetry bool
2020-11-01 18:36:43 +08:00
for waitTime := time.Second; waitTime < util.RetryWaitTime; waitTime += waitTime / 2 {
for _, urlString := range urlStrings {
2022-02-26 18:16:47 +08:00
n = 0
2021-09-08 10:29:42 +08:00
if strings.Contains(urlString, "%") {
urlString = url.PathEscape(urlString)
}
2022-02-26 18:16:47 +08:00
shouldRetry, err = util.ReadUrlAsStream(urlString+"?readDeleted=true", cipherKey, isGzipped, isFullChunk, offset, len(buffer), func(data []byte) {
2022-02-26 19:23:06 +08:00
if n < len(buffer) {
x := copy(buffer[n:], data)
n += x
}
})
2020-10-13 15:29:46 +08:00
if !shouldRetry {
break
}
if err != nil {
glog.V(0).Infof("read %s failed, err: %v", urlString, err)
} else {
break
}
}
2020-10-14 10:50:46 +08:00
if err != nil && shouldRetry {
2020-10-14 10:50:22 +08:00
glog.V(0).Infof("retry reading in %v", waitTime)
2020-10-09 15:01:47 +08:00
time.Sleep(waitTime)
} else {
break
}
}
2022-02-26 18:16:47 +08:00
return n, err
}
func retriedStreamFetchChunkData(writer io.Writer, urlStrings []string, cipherKey []byte, isGzipped bool, isFullChunk bool, offset int64, size int) (err error) {
var shouldRetry bool
var totalWritten int
for waitTime := time.Second; waitTime < util.RetryWaitTime; waitTime += waitTime / 2 {
for _, urlString := range urlStrings {
var localProcessed int
2022-09-19 07:49:48 +08:00
var writeErr error
shouldRetry, err = util.ReadUrlAsStream(urlString+"?readDeleted=true", cipherKey, isGzipped, isFullChunk, offset, size, func(data []byte) {
if totalWritten > localProcessed {
toBeSkipped := totalWritten - localProcessed
if len(data) <= toBeSkipped {
localProcessed += len(data)
return // skip if already processed
}
2021-08-14 02:31:43 +08:00
data = data[toBeSkipped:]
localProcessed += toBeSkipped
}
2022-09-19 07:49:48 +08:00
var writtenCount int
writtenCount, writeErr = writer.Write(data)
localProcessed += writtenCount
totalWritten += writtenCount
})
if !shouldRetry {
break
}
2022-09-19 07:49:48 +08:00
if writeErr != nil {
err = writeErr
break
}
if err != nil {
glog.V(0).Infof("read %s failed, err: %v", urlString, err)
} else {
break
}
}
2021-08-14 02:13:30 +08:00
if err != nil && shouldRetry {
glog.V(0).Infof("retry reading in %v", waitTime)
time.Sleep(waitTime)
} else {
break
}
}
return err
}
2020-07-20 18:34:06 +08:00
// MaybeManifestize groups the data chunks of inputChunks into manifest
// chunks, ManifestBatch at a time, storing each manifest through saveFunc.
// It delegates to doMaybeManifestize with mergeIntoManifest as the merge
// step.
func MaybeManifestize(saveFunc SaveDataAsChunkFunctionType, inputChunks []*filer_pb.FileChunk) (chunks []*filer_pb.FileChunk, err error) {
	chunks, err = doMaybeManifestize(saveFunc, inputChunks, ManifestBatch, mergeIntoManifest)
	return chunks, err
}
func doMaybeManifestize(saveFunc SaveDataAsChunkFunctionType, inputChunks []*filer_pb.FileChunk, mergeFactor int, mergefn func(saveFunc SaveDataAsChunkFunctionType, dataChunks []*filer_pb.FileChunk) (manifestChunk *filer_pb.FileChunk, err error)) (chunks []*filer_pb.FileChunk, err error) {
var dataChunks []*filer_pb.FileChunk
for _, chunk := range inputChunks {
if !chunk.IsChunkManifest {
dataChunks = append(dataChunks, chunk)
} else {
chunks = append(chunks, chunk)
}
}
remaining := len(dataChunks)
2020-07-20 18:34:06 +08:00
for i := 0; i+mergeFactor <= len(dataChunks); i += mergeFactor {
chunk, err := mergefn(saveFunc, dataChunks[i:i+mergeFactor])
if err != nil {
return dataChunks, err
}
chunks = append(chunks, chunk)
2020-07-20 18:34:06 +08:00
remaining -= mergeFactor
}
// remaining
for i := len(dataChunks) - remaining; i < len(dataChunks); i++ {
chunks = append(chunks, dataChunks[i])
}
return
}
func mergeIntoManifest(saveFunc SaveDataAsChunkFunctionType, dataChunks []*filer_pb.FileChunk) (manifestChunk *filer_pb.FileChunk, err error) {
2020-07-21 13:01:39 +08:00
filer_pb.BeforeEntrySerialization(dataChunks)
// create and serialize the manifest
data, serErr := proto.Marshal(&filer_pb.FileChunkManifest{
Chunks: dataChunks,
})
if serErr != nil {
return nil, fmt.Errorf("serializing manifest: %v", serErr)
}
minOffset, maxOffset := int64(math.MaxInt64), int64(math.MinInt64)
2020-07-20 18:34:06 +08:00
for _, chunk := range dataChunks {
if minOffset > int64(chunk.Offset) {
minOffset = chunk.Offset
}
if maxOffset < int64(chunk.Size)+chunk.Offset {
maxOffset = int64(chunk.Size) + chunk.Offset
}
}
more solid weed mount (#4089) * compare chunks by timestamp * fix slab clearing error * fix test compilation * move oldest chunk to sealed, instead of by fullness * lock on fh.entryViewCache * remove verbose logs * revert slat clearing * less logs * less logs * track write and read by timestamp * remove useless logic * add entry lock on file handle release * use mem chunk only, swap file chunk has problems * comment out code that maybe used later * add debug mode to compare data read and write * more efficient readResolvedChunks with linked list * small optimization * fix test compilation * minor fix on writer * add SeparateGarbageChunks * group chunks into sections * turn off debug mode * fix tests * fix tests * tmp enable swap file chunk * Revert "tmp enable swap file chunk" This reverts commit 985137ec472924e4815f258189f6ca9f2168a0a7. * simple refactoring * simple refactoring * do not re-use swap file chunk. Sealed chunks should not be re-used. * comment out debugging facilities * either mem chunk or swap file chunk is fine now * remove orderedMutex as *semaphore.Weighted not found impactful * optimize size calculation for changing large files * optimize performance to avoid going through the long list of chunks * still problems with swap file chunk * rename * tiny optimization * swap file chunk save only successfully read data * fix * enable both mem and swap file chunk * resolve chunks with range * rename * fix chunk interval list * also change file handle chunk group when adding chunks * pick in-active chunk with time-decayed counter * fix compilation * avoid nil with empty fh.entry * refactoring * rename * rename * refactor visible intervals to *list.List * refactor chunkViews to *list.List * add IntervalList for generic interval list * change visible interval to use IntervalList in generics * cahnge chunkViews to *IntervalList[*ChunkView] * use NewFileChunkSection to create * rename variables * refactor * fix renaming leftover * renaming * renaming * add 
insert interval * interval list adds lock * incrementally add chunks to readers Fixes: 1. set start and stop offset for the value object 2. clone the value object 3. use pointer instead of copy-by-value when passing to interval.Value 4. use insert interval since adding chunk could be out of order * fix tests compilation * fix tests compilation
2023-01-03 15:20:45 +08:00
manifestChunk, err = saveFunc(bytes.NewReader(data), "", 0, 0)
if err != nil {
return nil, err
}
manifestChunk.IsChunkManifest = true
manifestChunk.Offset = minOffset
manifestChunk.Size = uint64(maxOffset - minOffset)
return
}
more solid weed mount (#4089) * compare chunks by timestamp * fix slab clearing error * fix test compilation * move oldest chunk to sealed, instead of by fullness * lock on fh.entryViewCache * remove verbose logs * revert slat clearing * less logs * less logs * track write and read by timestamp * remove useless logic * add entry lock on file handle release * use mem chunk only, swap file chunk has problems * comment out code that maybe used later * add debug mode to compare data read and write * more efficient readResolvedChunks with linked list * small optimization * fix test compilation * minor fix on writer * add SeparateGarbageChunks * group chunks into sections * turn off debug mode * fix tests * fix tests * tmp enable swap file chunk * Revert "tmp enable swap file chunk" This reverts commit 985137ec472924e4815f258189f6ca9f2168a0a7. * simple refactoring * simple refactoring * do not re-use swap file chunk. Sealed chunks should not be re-used. * comment out debugging facilities * either mem chunk or swap file chunk is fine now * remove orderedMutex as *semaphore.Weighted not found impactful * optimize size calculation for changing large files * optimize performance to avoid going through the long list of chunks * still problems with swap file chunk * rename * tiny optimization * swap file chunk save only successfully read data * fix * enable both mem and swap file chunk * resolve chunks with range * rename * fix chunk interval list * also change file handle chunk group when adding chunks * pick in-active chunk with time-decayed counter * fix compilation * avoid nil with empty fh.entry * refactoring * rename * rename * refactor visible intervals to *list.List * refactor chunkViews to *list.List * add IntervalList for generic interval list * change visible interval to use IntervalList in generics * cahnge chunkViews to *IntervalList[*ChunkView] * use NewFileChunkSection to create * rename variables * refactor * fix renaming leftover * renaming * renaming * add 
insert interval * interval list adds lock * incrementally add chunks to readers Fixes: 1. set start and stop offset for the value object 2. clone the value object 3. use pointer instead of copy-by-value when passing to interval.Value 4. use insert interval since adding chunk could be out of order * fix tests compilation * fix tests compilation
2023-01-03 15:20:45 +08:00
// SaveDataAsChunkFunctionType is the signature of the callback used to
// persist data as a chunk: it consumes reader and returns the chunk that
// was created, or an error. mergeIntoManifest calls it with an empty name
// and zero offset and tsNs.
// NOTE(review): the exact meaning of name, offset and tsNs is defined by
// the implementations, which are not visible here — confirm before relying
// on them.
type SaveDataAsChunkFunctionType func(reader io.Reader, name string, offset int64, tsNs int64) (chunk *filer_pb.FileChunk, err error)