2016-06-03 09:09:14 +08:00
package command
2013-01-21 11:44:23 +08:00
import (
"archive/tar"
2013-01-23 07:07:19 +08:00
"bytes"
2014-10-27 02:25:02 +08:00
"fmt"
2013-01-21 11:44:23 +08:00
"os"
"path"
2015-07-09 14:19:54 +08:00
"path/filepath"
2013-01-21 11:44:23 +08:00
"strconv"
"strings"
2013-01-23 07:07:19 +08:00
"text/template"
2013-01-21 11:44:23 +08:00
"time"
2014-10-27 02:34:55 +08:00
2016-06-03 09:09:14 +08:00
"github.com/chrislusf/seaweedfs/weed/glog"
"github.com/chrislusf/seaweedfs/weed/storage"
2013-01-21 11:44:23 +08:00
)
2013-01-23 07:07:19 +08:00
const (
	// defaultFnFormat is the default text/template pattern for the
	// -fileNameFormat flag. It is written without any literal whitespace so
	// that generated tar entry names look like "<mime>/<id>:<name>" and do not
	// carry leading, trailing or embedded padding spaces.
	defaultFnFormat = `{{.Mime}}/{{.Id}}:{{.Name}}`

	// timeFormat is the layout expected by the -newer flag: an RFC3339-like
	// timestamp without a time zone suffix.
	timeFormat = "2006-01-02T15:04:05"
)
2015-06-02 10:25:01 +08:00
// export holds the flag-backed options of the export command; its fields are
// populated in init.
var export ExportOptions
// ExportOptions groups the command line options that identify which volume
// the export command reads. Every field is a pointer because the values are
// produced by flag registration and only filled in once flags are parsed.
type ExportOptions struct {
	// dir is the directory containing the volume files.
	dir *string
	// collection is the optional collection name; when non-empty it is used
	// as a "<collection>_" prefix of the volume file name.
	collection *string
	// volumeId is the numeric id of the volume to export.
	volumeId *int
}
2013-01-21 11:44:23 +08:00
var cmdExport = & Command {
2015-05-26 14:53:45 +08:00
UsageLine : "export -dir=/tmp -volumeId=234 -o=/dir/name.tar -fileNameFormat={{.Name}} -newer='" + timeFormat + "'" ,
2013-01-21 11:51:27 +08:00
Short : "list or export files from one volume data file" ,
Long : ` List all files in a volume , or Export all files in a volume to a tar file if the output is specified .
2014-08-26 02:37:00 +08:00
2013-03-19 12:29:25 +08:00
The format of file name in the tar file can be customized . Default is { { . Mime } } / { { . Id } } : { { . Name } } . Also available is { { . Key } } .
2013-01-21 11:44:23 +08:00
` ,
}
2015-06-02 10:25:01 +08:00
// init registers the volume-selection flags of the export command and attaches
// its Run function.
func init() {
	// Flags selecting which volume to read.
	export.volumeId = cmdExport.Flag.Int("volumeId", -1, "a volume id. The volume .dat and .idx files should already exist in the dir.")
	export.collection = cmdExport.Flag.String("collection", "", "the volume collection name")
	export.dir = cmdExport.Flag.String("dir", ".", "input data directory to store volume data files")

	// Run is assigned here instead of in the cmdExport literal to break the
	// initialization cycle between cmdExport and runExport.
	cmdExport.Run = runExport
}
2013-01-21 11:44:23 +08:00
var (
2015-06-02 15:33:13 +08:00
output = cmdExport . Flag . String ( "o" , "" , "output tar file name, must ends with .tar, or just a \"-\" for stdout" )
2015-07-09 14:19:54 +08:00
format = cmdExport . Flag . String ( "fileNameFormat" , defaultFnFormat , "filename formatted with {{.Mime}} {{.Id}} {{.Name}} {{.Ext}}" )
2016-09-08 09:05:57 +08:00
newer = cmdExport . Flag . String ( "newer" , "" , "export only files newer than this time, default is all files. Must be specified in RFC3339 without timezone, e.g. 2006-01-02T15:04:05" )
2015-06-02 15:23:41 +08:00
2015-06-02 15:33:13 +08:00
tarOutputFile * tar . Writer
tarHeader tar . Header
fileNameTemplate * template . Template
fileNameTemplateBuffer = bytes . NewBuffer ( nil )
newerThan time . Time
newerThanUnix int64 = - 1
localLocation , _ = time . LoadLocation ( "Local" )
2013-01-21 11:44:23 +08:00
)
func runExport ( cmd * Command , args [ ] string ) bool {
2015-05-20 20:11:12 +08:00
var err error
if * newer != "" {
if newerThan , err = time . ParseInLocation ( timeFormat , * newer , localLocation ) ; err != nil {
fmt . Println ( "cannot parse 'newer' argument: " + err . Error ( ) )
return false
}
newerThanUnix = newerThan . Unix ( )
}
2015-06-02 15:23:41 +08:00
if * export . volumeId == - 1 {
2013-01-21 11:44:23 +08:00
return false
}
2015-06-02 15:33:13 +08:00
if * output != "" {
if * output != "-" && ! strings . HasSuffix ( * output , ".tar" ) {
fmt . Println ( "the output file" , * output , "should be '-' or end with .tar" )
2013-01-22 09:50:10 +08:00
return false
}
2013-01-23 07:07:19 +08:00
2015-06-02 15:33:13 +08:00
if fileNameTemplate , err = template . New ( "name" ) . Parse ( * format ) ; err != nil {
2013-01-23 07:07:19 +08:00
fmt . Println ( "cannot parse format " + * format + ": " + err . Error ( ) )
return false
}
2015-06-02 15:33:13 +08:00
var outputFile * os . File
if * output == "-" {
outputFile = os . Stdout
2013-01-21 11:44:23 +08:00
} else {
2015-06-02 15:33:13 +08:00
if outputFile , err = os . Create ( * output ) ; err != nil {
glog . Fatalf ( "cannot open output tar %s: %s" , * output , err )
2013-01-21 11:44:23 +08:00
}
}
2015-06-02 15:33:13 +08:00
defer outputFile . Close ( )
tarOutputFile = tar . NewWriter ( outputFile )
defer tarOutputFile . Close ( )
2013-01-21 11:44:23 +08:00
t := time . Now ( )
tarHeader = tar . Header { Mode : 0644 ,
ModTime : t , Uid : os . Getuid ( ) , Gid : os . Getgid ( ) ,
Typeflag : tar . TypeReg ,
AccessTime : t , ChangeTime : t }
}
2015-06-02 15:23:41 +08:00
fileName := strconv . Itoa ( * export . volumeId )
if * export . collection != "" {
fileName = * export . collection + "_" + fileName
2014-01-22 12:51:07 +08:00
}
2015-06-02 15:23:41 +08:00
vid := storage . VolumeId ( * export . volumeId )
indexFile , err := os . OpenFile ( path . Join ( * export . dir , fileName + ".idx" ) , os . O_RDONLY , 0644 )
2013-01-21 11:44:23 +08:00
if err != nil {
2013-08-09 14:57:22 +08:00
glog . Fatalf ( "Create Volume Index [ERROR] %s\n" , err )
2013-01-21 11:44:23 +08:00
}
defer indexFile . Close ( )
2015-06-02 15:33:13 +08:00
needleMap , err := storage . LoadNeedleMap ( indexFile )
2013-02-11 05:41:25 +08:00
if err != nil {
2014-04-17 15:16:44 +08:00
glog . Fatalf ( "cannot load needle map from %s: %s" , indexFile . Name ( ) , err )
2013-02-11 05:41:25 +08:00
}
2013-01-21 11:44:23 +08:00
2013-01-22 09:50:10 +08:00
var version storage . Version
2015-06-02 15:23:41 +08:00
err = storage . ScanVolumeFile ( * export . dir , * export . collection , vid ,
Add boltdb for volume needle map
boltdb is fairly slow to write, about 6 minutes for recreating index
for 1553934 files. Boltdb loads 1,553,934 x 16 = 24,862,944bytes from
disk, and generate the boltdb as large as 134,217,728 bytes in 6
minutes.
To compare, for leveldb, it recreates index in leveldb as large as
27,188,148 bytes in 8 seconds.
For in memory version, it loads the index in
To test the memory consumption, the leveldb or boltdb index are
created. And the server is restarted. Using the benchmark tool to read
lots of files. There are 7 volumes in benchmark collection, each with
about 1553K files.
For leveldb, the memory starts at 142,884KB, and stays at 179,340KB.
For boltdb, the memory starts at 73,756KB, and stays at 144,564KB.
For in-memory, the memory starts at 368,152KB, and stays at 448,032KB.
2015-03-30 02:04:32 +08:00
storage . NeedleMapInMemory ,
func ( superBlock storage . SuperBlock ) error {
version = superBlock . Version ( )
return nil
} , true , func ( n * storage . Needle , offset int64 ) error {
2015-06-02 15:33:13 +08:00
nv , ok := needleMap . Get ( n . Id )
Add boltdb for volume needle map
boltdb is fairly slow to write, about 6 minutes for recreating index
for 1553934 files. Boltdb loads 1,553,934 x 16 = 24,862,944bytes from
disk, and generate the boltdb as large as 134,217,728 bytes in 6
minutes.
To compare, for leveldb, it recreates index in leveldb as large as
27,188,148 bytes in 8 seconds.
For in memory version, it loads the index in
To test the memory consumption, the leveldb or boltdb index are
created. And the server is restarted. Using the benchmark tool to read
lots of files. There are 7 volumes in benchmark collection, each with
about 1553K files.
For leveldb, the memory starts at 142,884KB, and stays at 179,340KB.
For boltdb, the memory starts at 73,756KB, and stays at 144,564KB.
For in-memory, the memory starts at 368,152KB, and stays at 448,032KB.
2015-03-30 02:04:32 +08:00
glog . V ( 3 ) . Infof ( "key %d offset %d size %d disk_size %d gzip %v ok %v nv %+v" ,
n . Id , offset , n . Size , n . DiskSize ( ) , n . IsGzipped ( ) , ok , nv )
if ok && nv . Size > 0 && int64 ( nv . Offset ) * 8 == offset {
2015-05-20 20:11:12 +08:00
if newerThanUnix >= 0 && n . HasLastModifiedDate ( ) && n . LastModified < uint64 ( newerThanUnix ) {
glog . V ( 3 ) . Infof ( "Skipping this file, as it's old enough: LastModified %d vs %d" ,
n . LastModified , newerThanUnix )
return nil
}
Add boltdb for volume needle map
boltdb is fairly slow to write, about 6 minutes for recreating index
for 1553934 files. Boltdb loads 1,553,934 x 16 = 24,862,944bytes from
disk, and generate the boltdb as large as 134,217,728 bytes in 6
minutes.
To compare, for leveldb, it recreates index in leveldb as large as
27,188,148 bytes in 8 seconds.
For in memory version, it loads the index in
To test the memory consumption, the leveldb or boltdb index are
created. And the server is restarted. Using the benchmark tool to read
lots of files. There are 7 volumes in benchmark collection, each with
about 1553K files.
For leveldb, the memory starts at 142,884KB, and stays at 179,340KB.
For boltdb, the memory starts at 73,756KB, and stays at 144,564KB.
For in-memory, the memory starts at 368,152KB, and stays at 448,032KB.
2015-03-30 02:04:32 +08:00
return walker ( vid , n , version )
}
if ! ok {
glog . V ( 2 ) . Infof ( "This seems deleted %d size %d" , n . Id , n . Size )
} else {
glog . V ( 2 ) . Infof ( "Skipping later-updated Id %d size %d" , n . Id , n . Size )
}
return nil
} )
2013-01-21 11:44:23 +08:00
if err != nil {
2013-08-09 14:57:22 +08:00
glog . Fatalf ( "Export Volume File [ERROR] %s\n" , err )
2013-01-21 11:44:23 +08:00
}
return true
}
2013-01-23 07:07:19 +08:00
// nameParams is the data passed to the -fileNameFormat template when a tar
// entry name is rendered; each field is available as {{.Field}}.
type nameParams struct {
	// Name is the file name stored with the needle.
	Name string
	// Id is the needle id.
	Id uint64
	// Mime is the MIME type stored with the needle.
	Mime string
	// Key is the file id string built from the volume id and the needle.
	Key string
	// Ext is the extension of Name, including the leading dot.
	Ext string
}
2013-01-22 09:50:10 +08:00
func walker ( vid storage . VolumeId , n * storage . Needle , version storage . Version ) ( err error ) {
2014-03-24 12:57:10 +08:00
key := storage . NewFileIdFromNeedle ( vid , n ) . String ( )
2015-06-02 15:33:13 +08:00
if tarOutputFile != nil {
fileNameTemplateBuffer . Reset ( )
if err = fileNameTemplate . Execute ( fileNameTemplateBuffer ,
2015-07-09 14:19:54 +08:00
nameParams {
Name : string ( n . Name ) ,
2013-01-23 07:07:19 +08:00
Id : n . Id ,
Mime : string ( n . Mime ) ,
Key : key ,
2015-07-09 14:19:54 +08:00
Ext : filepath . Ext ( string ( n . Name ) ) ,
2013-01-23 07:07:19 +08:00
} ,
) ; err != nil {
return err
}
2015-06-02 15:33:13 +08:00
fileName := fileNameTemplateBuffer . String ( )
if n . IsGzipped ( ) && path . Ext ( fileName ) != ".gz" {
fileName = fileName + ".gz"
2013-01-23 07:07:19 +08:00
}
2015-06-02 15:33:13 +08:00
tarHeader . Name , tarHeader . Size = fileName , int64 ( len ( n . Data ) )
2015-05-20 20:11:12 +08:00
if n . HasLastModifiedDate ( ) {
tarHeader . ModTime = time . Unix ( int64 ( n . LastModified ) , 0 )
} else {
2015-05-26 14:53:45 +08:00
tarHeader . ModTime = time . Unix ( 0 , 0 )
2015-05-20 20:11:12 +08:00
}
tarHeader . ChangeTime = tarHeader . ModTime
2015-06-02 15:33:13 +08:00
if err = tarOutputFile . WriteHeader ( & tarHeader ) ; err != nil {
2013-01-21 11:44:23 +08:00
return err
}
2015-06-02 15:33:13 +08:00
_ , err = tarOutputFile . Write ( n . Data )
2013-01-21 11:44:23 +08:00
} else {
2013-01-22 09:50:10 +08:00
size := n . DataSize
if version == storage . Version1 {
size = n . Size
}
2013-01-21 11:44:23 +08:00
fmt . Printf ( "key=%s Name=%s Size=%d gzip=%t mime=%s\n" ,
2013-01-23 07:07:19 +08:00
key ,
2013-01-21 11:44:23 +08:00
n . Name ,
2013-01-22 09:50:10 +08:00
size ,
2013-01-21 11:47:04 +08:00
n . IsGzipped ( ) ,
2013-01-21 11:44:23 +08:00
n . Mime ,
2013-01-21 11:47:04 +08:00
)
2013-01-21 11:44:23 +08:00
}
return
}