From 60b0f2324ebb5c1c982b9091548d73c9ace90e79 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Wed, 17 Mar 2021 09:38:38 -0700 Subject: [PATCH] storage write call path optimizations (#11805) - write in o_dsync instead of o_direct for smaller objects to avoid unaligned double Write() situations that may arise for smaller objects < 128KiB - avoid fallocate() as its not useful since we do not use Append() semantics anymore, fallocate is not useful for streaming I/O we can save on a syscall - createFile() doesn't need to validate `bucket` name with a Lstat() call since createFile() is only used to write at `minioTmpBucket` - use io.Copy() when writing unAligned writes to allow usage of ReadFrom() from *os.File providing zero buffer writes(). --- Makefile | 2 +- buildscripts/verify-build.sh | 1 + buildscripts/verify-healing.sh | 2 + cmd/xl-storage.go | 106 ++++++++++----------------------- pkg/ioutil/ioutil.go | 45 +++++--------- 5 files changed, 49 insertions(+), 107 deletions(-) diff --git a/Makefile b/Makefile index 71b4437d7..0cc17df73 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ lint: check: test test: verifiers build @echo "Running unit tests" - @GO111MODULE=on CGO_ENABLED=0 go test -tags kqueue ./... 1>/dev/null + @GOGC=25 GO111MODULE=on CGO_ENABLED=0 go test -tags kqueue ./... 1>/dev/null test-race: verifiers build @echo "Running unit tests under -race" diff --git a/buildscripts/verify-build.sh b/buildscripts/verify-build.sh index a53b367f1..3f54678ff 100755 --- a/buildscripts/verify-build.sh +++ b/buildscripts/verify-build.sh @@ -33,6 +33,7 @@ export ACCESS_KEY="minio" export SECRET_KEY="minio123" export ENABLE_HTTPS=0 export GO111MODULE=on +export GOGC=25 MINIO_CONFIG_DIR="$WORK_DIR/.minio" MINIO=( "$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" ) diff --git a/buildscripts/verify-healing.sh b/buildscripts/verify-healing.sh index ec1a54411..9d5ce55a2 100755 --- a/buildscripts/verify-healing.sh +++ b/buildscripts/verify-healing.sh @@ -28,6 +28,8 @@ WORK_DIR="$PWD/.verify-$RANDOM" MINIO_CONFIG_DIR="$WORK_DIR/.minio" MINIO=( "$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server ) +export GOGC=25 + function start_minio_3_node() { export MINIO_ROOT_USER=minio export MINIO_ROOT_PASSWORD=minio123 diff --git a/cmd/xl-storage.go b/cmd/xl-storage.go index e7cb3a34b..a7153d46e 100644 --- a/cmd/xl-storage.go +++ b/cmd/xl-storage.go @@ -224,17 +224,17 @@ func newXLStorage(ep Endpoint) (*xlStorage, error) { // is emphemeral and we should treat it // as root disk from the baremetal // terminology. - rootDisk, err = disk.IsRootDisk(path, "/") + rootDisk, err = disk.IsRootDisk(path, SlashSeparator) if err != nil { return nil, err } if !rootDisk { // No root disk was found, its possible that - // path is referenced at "/data" which has + // path is referenced at "/etc/hosts" which has // different device ID that points to the original // "/" on the host system, fall back to that instead // to verify of the device id is same. - rootDisk, err = disk.IsRootDisk(path, "/data") + rootDisk, err = disk.IsRootDisk(path, "/etc/hosts") if err != nil { return nil, err } @@ -242,7 +242,7 @@ func newXLStorage(ep Endpoint) (*xlStorage, error) { } else { // On baremetal setups its always "/" is the root disk. - rootDisk, err = disk.IsRootDisk(path, "/") + rootDisk, err = disk.IsRootDisk(path, SlashSeparator) if err != nil { return nil, err } @@ -1233,7 +1233,7 @@ func (s *xlStorage) openFile(volume, path string, mode int) (f *os.File, err err // Create top level directories if they don't exist. // with mode 0777 mkdir honors system umask. if err = mkdirAll(pathutil.Dir(filePath), 0777); err != nil { - return nil, err + return nil, osErrToFileErr(err) } w, err := os.OpenFile(filePath, mode|writeMode, 0666) @@ -1432,82 +1432,42 @@ func (s *xlStorage) CreateFile(ctx context.Context, volume, path string, fileSiz return errInvalidArgument } + if fileSize <= smallFileThreshold { + // For streams smaller than 128KiB we simply write them as O_DSYNC (fdatasync) + // and not O_DIRECT to avoid the complexities of aligned I/O. + w, err := s.openFile(volume, path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC) + if err != nil { + return err + } + defer w.Close() + + written, err := io.Copy(w, r) + if err != nil { + return osErrToFileErr(err) + } + + if written > fileSize { + return errMoreData + } + + return nil + } + volumeDir, err := s.getVolDir(volume) if err != nil { return err } - // Stat a volume entry. - _, err = os.Lstat(volumeDir) - if err != nil { - if osIsNotExist(err) { - return errVolumeNotFound - } else if isSysErrIO(err) { - return errFaultyDisk - } - return err - } - filePath := pathJoin(volumeDir, path) - if err = checkPathLength(filePath); err != nil { - return err - } - // Create top level directories if they don't exist. // with mode 0777 mkdir honors system umask. if err = mkdirAll(pathutil.Dir(filePath), 0777); err != nil { - switch { - case osIsPermission(err): - return errFileAccessDenied - case osIsExist(err): - return errFileAccessDenied - case isSysErrIO(err): - return errFaultyDisk - case isSysErrInvalidArg(err): - return errUnsupportedDisk - case isSysErrNoSpace(err): - return errDiskFull - } - return err + return osErrToFileErr(err) } w, err := disk.OpenFileDirectIO(filePath, os.O_CREATE|os.O_WRONLY|os.O_EXCL, 0666) if err != nil { - switch { - case osIsPermission(err): - return errFileAccessDenied - case osIsExist(err): - return errFileAccessDenied - case isSysErrIO(err): - return errFaultyDisk - case isSysErrInvalidArg(err): - return errUnsupportedDisk - case isSysErrNoSpace(err): - return errDiskFull - default: - return err - } - } - - var e error - if fileSize > 0 { - // Allocate needed disk space to append data - e = Fallocate(int(w.Fd()), 0, fileSize) - } - - // Ignore errors when Fallocate is not supported in the current system - if e != nil && !isSysErrNoSys(e) && !isSysErrOpNotSupported(e) { - switch { - case isSysErrNoSpace(e): - err = errDiskFull - case isSysErrIO(e): - err = errFaultyDisk - default: - // For errors: EBADF, EINTR, EINVAL, ENODEV, EPERM, ESPIPE and ETXTBSY - // Appending was failed anyway, returns unexpected error - err = errUnexpected - } - return err + return osErrToFileErr(err) } defer func() { @@ -1515,14 +1475,8 @@ func (s *xlStorage) CreateFile(ctx context.Context, volume, path string, fileSiz w.Close() }() - var bufp *[]byte - if fileSize <= smallFileThreshold { - bufp = s.poolSmall.Get().(*[]byte) - defer s.poolSmall.Put(bufp) - } else { - bufp = s.poolLarge.Get().(*[]byte) - defer s.poolLarge.Put(bufp) - } + bufp := s.poolLarge.Get().(*[]byte) + defer s.poolLarge.Put(bufp) written, err := xioutil.CopyAligned(w, r, *bufp, fileSize) if err != nil { diff --git a/pkg/ioutil/ioutil.go b/pkg/ioutil/ioutil.go index d034c611d..0ceb7295b 100644 --- a/pkg/ioutil/ioutil.go +++ b/pkg/ioutil/ioutil.go @@ -19,6 +19,7 @@ package ioutil import ( + "bytes" "io" "os" @@ -188,33 +189,15 @@ const directioAlignSize = 4096 // the file opened for writes with syscall.O_DIRECT flag. func CopyAligned(w *os.File, r io.Reader, alignedBuf []byte, totalSize int64) (int64, error) { // Writes remaining bytes in the buffer. - writeUnaligned := func(w *os.File, buf []byte) (remainingWritten int, err error) { - var n int - remaining := len(buf) - // The following logic writes the remainging data such that it writes whatever best is possible (aligned buffer) - // in O_DIRECT mode and remaining (unaligned buffer) in non-O_DIRECT mode. - remainingAligned := (remaining / directioAlignSize) * directioAlignSize - remainingAlignedBuf := buf[:remainingAligned] - remainingUnalignedBuf := buf[remainingAligned:] - if len(remainingAlignedBuf) > 0 { - n, err = w.Write(remainingAlignedBuf) - if err != nil { - return remainingWritten, err - } - remainingWritten += n + writeUnaligned := func(w *os.File, buf []byte) (remainingWritten int64, err error) { + // Disable O_DIRECT on fd's on unaligned buffer + // perform an amortized Fdatasync(fd) on the fd at + // the end, this is performed by the caller before + // closing 'w'. + if err = disk.DisableDirectIO(w); err != nil { + return remainingWritten, err } - if len(remainingUnalignedBuf) > 0 { - // Write on O_DIRECT fds fail if buffer is not 4K aligned, hence disable O_DIRECT. - if err = disk.DisableDirectIO(w); err != nil { - return remainingWritten, err - } - n, err = w.Write(remainingUnalignedBuf) - if err != nil { - return remainingWritten, err - } - remainingWritten += n - } - return remainingWritten, nil + return io.Copy(w, bytes.NewReader(buf)) } var written int64 @@ -232,21 +215,23 @@ func CopyAligned(w *os.File, r io.Reader, alignedBuf []byte, totalSize int64) (i return written, err } buf = buf[:nr] - var nw int + var nw int64 if len(buf)%directioAlignSize == 0 { + var n int // buf is aligned for directio write() - nw, err = w.Write(buf) + n, err = w.Write(buf) + nw = int64(n) } else { // buf is not aligned, hence use writeUnaligned() nw, err = writeUnaligned(w, buf) } if nw > 0 { - written += int64(nw) + written += nw } if err != nil { return written, err } - if nw != len(buf) { + if nw != int64(len(buf)) { return written, io.ErrShortWrite }