feat: add dynamic usage cache (#12229)

A cache structure will be kept with a tree of usages.
The cache is a tree structure where each node keeps track
of its children.

An uncompacted branch contains a count of only the files
directly at the branch level, and contains links to its
child branches or leaves.

The leaves are "compacted" based on a number of properties.
A compacted leaf contains the totals of all files beneath it.

A leaf is only scanned once every dataUsageUpdateDirCycles,
rarer if the bloom filter for the path is clean and no lifecycles 
are applied. Skipped leaves have their totals transferred from 
the previous cycle.

A clean leaf will be included once every healFolderIncludeProb 
for partial heal scans. When selected there is a one in 
healObjectSelectProb that any object will be chosen for heal scan.

Compaction happens when either:

- The folder (and subfolders) contains less than dataScannerCompactLeastObject objects.
- The folder itself contains more than dataScannerCompactAtFolders folders.
- The folder only contains objects and no subfolders.
- A bucket root will never be compacted.

Furthermore, if a branch has more than dataScannerCompactAtChildren recursive
children (uncompacted folders), the tree will be recursively scanned and the
branches with the least number of objects will be compacted until the limit
is reached.

This ensures that any branch will never contain an unreasonable amount 
of other branches, and also that small branches with few objects don't 
take up unreasonable amounts of space.

Whenever a branch is scanned, it is assumed that it will be un-compacted
before it hits any of the above limits. This will make the branch rebalance 
itself when scanned if the distribution of objects has changed.

TLDR; With current values: No bucket will ever have more than 10000 
child nodes recursively. No single folder will have more than 2500 child 
nodes by itself. All subfolders are compacted if they have less than 500 
objects in them recursively.

We accumulate the (non-deletemarker) version count for paths as well, 
since we are changing the structure anyway.
This commit is contained in:
Klaus Post
2021-05-12 03:36:15 +02:00
committed by GitHub
parent f63eedb2b4
commit 229d83bb75
9 changed files with 1047 additions and 1328 deletions

View File

@@ -20,6 +20,7 @@ package cmd
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io/ioutil"
"os"
@@ -60,6 +61,7 @@ func TestDataUsageUpdate(t *testing.T) {
return
}
sizeS.totalSize = s.Size()
sizeS.versions++
return sizeS, nil
}
return
@@ -93,36 +95,13 @@ func TestDataUsageUpdate(t *testing.T) {
},
{
path: "/dir1",
size: 2000,
objs: 1,
oSizes: sizeHistogram{1: 1},
size: 1302010,
objs: 5,
oSizes: sizeHistogram{0: 1, 1: 4},
},
{
path: "/dir1/dira",
flatten: true,
size: 1300010,
objs: 4,
oSizes: sizeHistogram{0: 1, 1: 3},
},
{
path: "/dir1/dira/",
flatten: true,
size: 1300010,
objs: 4,
oSizes: sizeHistogram{0: 1, 1: 3},
},
{
path: "/dir1",
size: 2000,
objs: 1,
oSizes: sizeHistogram{0: 0, 1: 1},
},
{
// Children are flattened
path: "/dir1/dira/",
size: 1300010,
objs: 4,
oSizes: sizeHistogram{0: 1, 1: 3},
path: "/dir1/dira",
isNil: true,
},
{
path: "/nonexistying",
@@ -143,7 +122,6 @@ func TestDataUsageUpdate(t *testing.T) {
if e == nil {
t.Fatal("got nil result")
}
t.Log(e.Children)
if w.flatten {
*e = got.flatten(*e)
}
@@ -153,6 +131,9 @@ func TestDataUsageUpdate(t *testing.T) {
if e.Objects != uint64(w.objs) {
t.Error("got objects", e.Objects, "want", w.objs)
}
if e.Versions != uint64(w.objs) {
t.Error("got versions", e.Versions, "want", w.objs)
}
if e.ObjSizes != w.oSizes {
t.Error("got histogram", e.ObjSizes, "want", w.oSizes)
}
@@ -184,80 +165,6 @@ func TestDataUsageUpdate(t *testing.T) {
name: "rootfile3",
size: 1000,
},
}
createUsageTestFiles(t, base, bucket, files)
got, err = scanDataFolder(context.Background(), base, got, getSize)
if err != nil {
t.Fatal(err)
}
want = []struct {
path string
isNil bool
size, objs int
flatten bool
oSizes sizeHistogram
}{
{
path: "/",
size: 1363315,
flatten: true,
objs: 14,
oSizes: sizeHistogram{0: 6, 1: 8},
},
{
path: "/",
size: 21000,
objs: 3,
oSizes: sizeHistogram{0: 1, 1: 2},
},
{
path: "/newfolder",
size: 5,
objs: 3,
oSizes: sizeHistogram{0: 3},
},
{
path: "/dir1/dira",
size: 1300010,
flatten: true,
objs: 4,
oSizes: sizeHistogram{0: 1, 1: 3},
},
{
path: "/nonexistying",
isNil: true,
},
}
for _, w := range want {
t.Run(w.path, func(t *testing.T) {
e := got.find(path.Join(bucket, w.path))
if w.isNil {
if e != nil {
t.Error("want nil, got", e)
}
return
}
if e == nil {
t.Fatal("got nil result")
}
if w.flatten {
*e = got.flatten(*e)
}
if e.Size != int64(w.size) {
t.Error("got size", e.Size, "want", w.size)
}
if e.Objects != uint64(w.objs) {
t.Error("got objects", e.Objects, "want", w.objs)
}
if e.ObjSizes != w.oSizes {
t.Error("got histogram", e.ObjSizes, "want", w.oSizes)
}
})
}
files = []usageTestFile{
{
name: "dir1/dira/dirasub/fileindira2",
size: 200,
@@ -292,11 +199,21 @@ func TestDataUsageUpdate(t *testing.T) {
oSizes: sizeHistogram{0: 7, 1: 7},
},
{
path: "/dir1/dira",
size: 300210,
objs: 4,
flatten: true,
oSizes: sizeHistogram{0: 2, 1: 2},
path: "/dir1",
size: 342210,
objs: 7,
flatten: false,
oSizes: sizeHistogram{0: 2, 1: 5},
},
{
path: "/newfolder",
size: 5,
objs: 3,
oSizes: sizeHistogram{0: 3},
},
{
path: "/nonexistying",
isNil: true,
},
}
@@ -322,6 +239,9 @@ func TestDataUsageUpdate(t *testing.T) {
if e.Objects != uint64(w.objs) {
t.Error("got objects", e.Objects, "want", w.objs)
}
if e.Versions != uint64(w.objs) {
t.Error("got versions", e.Versions, "want", w.objs)
}
if e.ObjSizes != w.oSizes {
t.Error("got histogram", e.ObjSizes, "want", w.oSizes)
}
@@ -334,7 +254,7 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
if err != nil {
t.Skip(err)
}
base = filepath.Join(base, "bucket")
scannerSleeper.Update(0, 0)
defer os.RemoveAll(base)
var files = []usageTestFile{
{name: "bucket/rootfile", size: 10000},
@@ -347,6 +267,13 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
{name: "bucket/dir1/dira/dirasub/sublevel3/dccccfile", size: 10},
}
createUsageTestFiles(t, base, "", files)
const foldersBelow = 3
const filesBelowT = dataScannerCompactLeastObject / 2
const filesAboveT = dataScannerCompactAtFolders + 1
const expectSize = foldersBelow*filesBelowT + filesAboveT
generateUsageTestFiles(t, base, "bucket/dirwithalot", foldersBelow, filesBelowT, 1)
generateUsageTestFiles(t, base, "bucket/dirwithevenmore", filesAboveT, 1, 1)
getSize := func(item scannerItem) (sizeS sizeSummary, err error) {
if item.Typ&os.ModeDir == 0 {
@@ -356,6 +283,7 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
return
}
sizeS.totalSize = s.Size()
sizeS.versions++
return
}
return
@@ -381,9 +309,9 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
}{
{
path: "flat",
size: 1322310,
objs: 8,
oSizes: sizeHistogram{0: 2, 1: 6},
size: 1322310 + expectSize,
objs: 8 + expectSize,
oSizes: sizeHistogram{0: 2 + expectSize, 1: 6},
},
{
path: "bucket/",
@@ -392,22 +320,32 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
oSizes: sizeHistogram{1: 2},
},
{
// Gets compacted...
path: "bucket/dir1",
size: 2000,
objs: 1,
oSizes: sizeHistogram{1: 1},
size: 1302010,
objs: 5,
oSizes: sizeHistogram{0: 1, 1: 4},
},
{
path: "bucket/dir1/dira",
size: 1300010,
objs: 4,
oSizes: sizeHistogram{0: 1, 1: 3},
// Gets compacted at this level...
path: "bucket/dirwithalot/0",
size: filesBelowT,
objs: filesBelowT,
oSizes: sizeHistogram{0: filesBelowT},
},
{
path: "bucket/dir1/dira/",
size: 1300010,
objs: 4,
oSizes: sizeHistogram{0: 1, 1: 3},
// Gets compacted at this level (below obj threshold)...
path: "bucket/dirwithalot/0",
size: filesBelowT,
objs: filesBelowT,
oSizes: sizeHistogram{0: filesBelowT},
},
{
// Gets compacted at this level...
path: "bucket/dirwithevenmore",
size: filesAboveT,
objs: filesAboveT,
oSizes: sizeHistogram{0: filesAboveT},
},
{
path: "bucket/nonexistying",
@@ -437,6 +375,9 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
if e.Objects != uint64(w.objs) {
t.Error("got objects", e.Objects, "want", w.objs)
}
if e.Versions != uint64(w.objs) {
t.Error("got versions", e.Versions, "want", w.objs)
}
if e.ObjSizes != w.oSizes {
t.Error("got histogram", e.ObjSizes, "want", w.oSizes)
}
@@ -468,78 +409,6 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
name: "bucket/rootfile3",
size: 1000,
},
}
createUsageTestFiles(t, base, "", files)
got, err = scanDataFolder(context.Background(), base, got, getSize)
if err != nil {
t.Fatal(err)
}
want = []struct {
path string
isNil bool
size, objs int
oSizes sizeHistogram
}{
{
path: "flat",
size: 1363315,
objs: 14,
oSizes: sizeHistogram{0: 6, 1: 8},
},
{
path: "bucket/",
size: 21000,
objs: 3,
oSizes: sizeHistogram{0: 1, 1: 2},
},
{
path: "bucket/newfolder",
size: 5,
objs: 3,
oSizes: sizeHistogram{0: 3},
},
{
path: "bucket/dir1/dira",
size: 1300010,
objs: 4,
oSizes: sizeHistogram{0: 1, 1: 3},
},
{
path: "bucket/nonexistying",
isNil: true,
},
}
for _, w := range want {
t.Run(w.path, func(t *testing.T) {
e := got.find(w.path)
if w.path == "flat" {
f := got.flatten(*got.root())
e = &f
}
if w.isNil {
if e != nil {
t.Error("want nil, got", e)
}
return
}
if e == nil {
t.Fatal("got nil result")
}
if e.Size != int64(w.size) {
t.Error("got size", e.Size, "want", w.size)
}
if e.Objects != uint64(w.objs) {
t.Error("got objects", e.Objects, "want", w.objs)
}
if e.ObjSizes != w.oSizes {
t.Error("got histogram", e.ObjSizes, "want", w.oSizes)
}
})
}
files = []usageTestFile{
{
name: "bucket/dir1/dira/dirasub/fileindira2",
size: 200,
@@ -567,15 +436,36 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
}{
{
path: "flat",
size: 363515,
objs: 14,
oSizes: sizeHistogram{0: 7, 1: 7},
size: 363515 + expectSize,
objs: 14 + expectSize,
oSizes: sizeHistogram{0: 7 + expectSize, 1: 7},
},
{
path: "bucket/dir1/dira",
size: 300210,
objs: 4,
oSizes: sizeHistogram{0: 2, 1: 2},
path: "bucket/dir1",
size: 342210,
objs: 7,
oSizes: sizeHistogram{0: 2, 1: 5},
},
{
path: "bucket/",
size: 21000,
objs: 3,
oSizes: sizeHistogram{0: 1, 1: 2},
},
{
path: "bucket/newfolder",
size: 5,
objs: 3,
oSizes: sizeHistogram{0: 3},
},
{
// Compacted into bucket/dir1
path: "bucket/dir1/dira",
isNil: true,
},
{
path: "bucket/nonexistying",
isNil: true,
},
}
@@ -593,7 +483,8 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
return
}
if e == nil {
t.Fatal("got nil result")
t.Error("got nil result")
return
}
if e.Size != int64(w.size) {
t.Error("got size", e.Size, "want", w.size)
@@ -601,6 +492,9 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
if e.Objects != uint64(w.objs) {
t.Error("got objects", e.Objects, "want", w.objs)
}
if e.Versions != uint64(w.objs) {
t.Error("got versions", e.Versions, "want", w.objs)
}
if e.ObjSizes != w.oSizes {
t.Error("got histogram", e.ObjSizes, "want", w.oSizes)
}
@@ -621,6 +515,25 @@ func createUsageTestFiles(t *testing.T, base, bucket string, files []usageTestFi
}
}
// generateUsageTestFiles populates base/bucket with nFolders directories
// named "0", "1", ... and writes nFiles files named "<j>.txt" into each one.
// Every file holds size zero-valued bytes. Any filesystem error aborts the
// test through t.Fatal.
func generateUsageTestFiles(t *testing.T, base, bucket string, nFolders, nFiles, size int) {
	payload := make([]byte, size)
	for folder := 0; folder < nFolders; folder++ {
		// The folder is created up front, so it exists even when nFiles is zero.
		dir := filepath.Join(base, bucket, fmt.Sprint(folder))
		if err := os.MkdirAll(dir, os.ModePerm); err != nil {
			t.Fatal(err)
		}
		for file := 0; file < nFiles; file++ {
			target := filepath.Join(dir, fmt.Sprint(file)+".txt")
			if err := ioutil.WriteFile(target, payload, os.ModePerm); err != nil {
				t.Fatal(err)
			}
		}
	}
}
func TestDataUsageCacheSerialize(t *testing.T) {
base, err := ioutil.TempDir("", "TestDataUsageCacheSerialize")
if err != nil {
@@ -654,6 +567,7 @@ func TestDataUsageCacheSerialize(t *testing.T) {
if err != nil {
return
}
sizeS.versions++
sizeS.totalSize = s.Size()
return
}
@@ -663,6 +577,20 @@ func TestDataUsageCacheSerialize(t *testing.T) {
if err != nil {
t.Fatal(err)
}
e := want.find("abucket/dir2")
e.ReplicationStats = &replicationStats{
PendingSize: 1,
ReplicatedSize: 2,
FailedSize: 3,
ReplicaSize: 4,
FailedCount: 5,
PendingCount: 6,
MissedThresholdSize: 7,
AfterThresholdSize: 8,
MissedThresholdCount: 9,
AfterThresholdCount: 10,
}
want.replace("abucket/dir2", "", *e)
var buf bytes.Buffer
err = want.serializeTo(&buf)
if err != nil {
@@ -686,9 +614,21 @@ func TestDataUsageCacheSerialize(t *testing.T) {
}
for wkey, wval := range want.Cache {
gotv := got.Cache[wkey]
if fmt.Sprint(gotv) != fmt.Sprint(wval) {
t.Errorf("deserialize mismatch, key %v\nwant: %+v\ngot: %+v", wkey, wval, gotv)
if !equalAsJSON(gotv, wval) {
t.Errorf("deserialize mismatch, key %v\nwant: %#v\ngot: %#v", wkey, wval, gotv)
}
}
}
// equalAsJSON reports whether a and b produce byte-identical output when
// encoded with json.Marshal. It panics if either value cannot be marshalled;
// a is encoded before b.
func equalAsJSON(a, b interface{}) bool {
	encode := func(v interface{}) []byte {
		out, err := json.Marshal(v)
		if err != nil {
			panic(err)
		}
		return out
	}
	left := encode(a)
	right := encode(b)
	return bytes.Equal(left, right)
}