
Commit 938d5c6

objstorageprovider: add dual-write for cold metadata
This commit adds logic to write the metadata portion of cold blob files to both tiers, and to transparently read the metadata from the hot tier. We assume that the metadata is a suffix of the file; the metadata filenames encode the start offset. For example, "000001.blobmeta.1234" contains the data of "000001.blob" starting at offset 1234.

We add a `Writable` implementation which writes the metadata portion of a file to both cold and hot storage, and a `Readable` implementation which reads data from a cold-tier Readable but uses a `vfs.File` for the metadata. The provider keeps track internally of which hot metadata files exist and their start offsets.
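As an aside, a minimal sketch of the naming convention described above. The helpers blobMetaFilename and blobMetaStartOffset are hypothetical, for illustration only; the commit itself builds these names via MakeFilepath plus an appended ".<offset>" suffix.

// Hypothetical helpers illustrating the "<file-num>.blobmeta.<offset>" scheme.
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// blobMetaFilename returns e.g. "000001.blobmeta.1234" for file number 1 and
// metadata start offset 1234.
func blobMetaFilename(fileNum uint64, metaStartOffset int64) string {
	return fmt.Sprintf("%06d.blobmeta.%d", fileNum, metaStartOffset)
}

// blobMetaStartOffset recovers the offset suffix from such a filename.
func blobMetaStartOffset(filename string) (int64, error) {
	i := strings.LastIndexByte(filename, '.')
	if i < 0 {
		return 0, fmt.Errorf("malformed blobmeta filename %q", filename)
	}
	return strconv.ParseInt(filename[i+1:], 10, 64)
}

func main() {
	name := blobMetaFilename(1, 1234)
	off, _ := blobMetaStartOffset(name)
	fmt.Println(name, off) // prints: 000001.blobmeta.1234 1234
}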

File tree

13 files changed: +732 -11 lines changed

internal/base/filenames.go

Lines changed: 19 additions & 0 deletions
@@ -128,6 +128,12 @@ const (
 	FileTypeOldTemp
 	FileTypeTemp
 	FileTypeBlob
+	// FileTypeBlobMeta is a file that contains only the metadata portion of a
+	// blob file (used when the blob file is on cold storage). The filename for
+	// blobmeta files is of the form `<file-num>.blobmeta.<offset>`, where
+	// <offset> indicates that the file mirrors the contents of the corresponding
+	// blob file starting at this offset.
+	FileTypeBlobMeta
 )

 var fileTypeStrings = [...]string{
@@ -139,6 +145,7 @@ var fileTypeStrings = [...]string{
 	FileTypeOldTemp: "old-temp",
 	FileTypeTemp: "temp",
 	FileTypeBlob: "blob",
+	FileTypeBlobMeta: "blobmeta",
 }

 // FileTypeFromName parses a FileType from its string representation.
@@ -166,6 +173,8 @@ func (ft FileType) String() string {
 }

 // MakeFilename builds a filename from components.
+//
+// Note that for FileTypeBlobMeta, ".<offset>" must be appended to the filename.
 func MakeFilename(fileType FileType, dfn DiskFileNum) string {
 	// Make a buffer sufficiently large for most possible filenames, especially
 	// the common case of a numbered table or blob file.
@@ -192,18 +201,25 @@ func appendFilename(buf []byte, fileType FileType, dfn DiskFileNum) []byte {
 		buf = fmt.Appendf(buf, "temporary.%06d.dbtmp", uint64(dfn))
 	case FileTypeBlob:
 		buf = fmt.Appendf(buf, "%06d.blob", uint64(dfn))
+	case FileTypeBlobMeta:
+		buf = fmt.Appendf(buf, "%06d.blobmeta", uint64(dfn))
 	default:
 		panic("unreachable")
 	}
 	return buf
 }

 // MakeFilepath builds a filepath from components.
+//
+// Note that for FileTypeBlobMeta, ".<offset>" must be appended to the filepath.
 func MakeFilepath(fs vfs.FS, dirname string, fileType FileType, dfn DiskFileNum) string {
 	return fs.PathJoin(dirname, MakeFilename(fileType, dfn))
 }

 // ParseFilename parses the components from a filename.
+//
+// Note that the offset component of a FileTypeBlobMeta is not parsed by this
+// function.
 func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileNum, ok bool) {
 	filename = fs.PathBase(filename)
 	switch {
@@ -250,6 +266,9 @@ func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileN
 		case "blob":
 			return FileTypeBlob, dfn, true
 		}
+		if strings.HasPrefix(filename[i+1:], "blobmeta.") {
+			return FileTypeBlobMeta, dfn, true
+		}
 	}
 	return 0, dfn, false
 }

internal/base/filenames_test.go

Lines changed: 13 additions & 0 deletions
@@ -7,6 +7,7 @@ package base
 import (
 	"bytes"
 	"fmt"
+	"math/rand/v2"
 	"os"
 	"testing"

@@ -49,6 +50,7 @@ func TestParseFilename(t *testing.T) {
 		"000000.blob": true,
 		"000001.blob": true,
 		"935203523.blob": true,
+		"000001.blobmeta.0": true,
 	}
 	fs := vfs.NewMem()
 	for tc, want := range testCases {
@@ -95,6 +97,17 @@ func TestFilenameRoundTrip(t *testing.T) {
 	}
 }

+func TestFilenameBlobMeta(t *testing.T) {
+	fileNum := DiskFileNum(rand.Uint64())
+	offset := rand.Int64()
+	fs := vfs.NewMem()
+	path := fmt.Sprintf("%s.%d", MakeFilepath(fs, "foo", FileTypeBlobMeta, fileNum), offset)
+	typ, fn, ok := ParseFilename(fs, path)
+	require.True(t, ok)
+	require.Equal(t, FileTypeBlobMeta, typ)
+	require.Equal(t, fileNum, fn)
+}
+
 type bufferFataler struct {
 	buf bytes.Buffer
 }

objstorage/objstorage.go

Lines changed: 14 additions & 0 deletions
@@ -107,8 +107,22 @@ type Writable interface {

 	// Finish completes the object and makes the data durable.
 	// No further calls are allowed after calling Finish.
+	//
+	// If Finish fails, it is expected that the caller will delete the created
+	// object. If the process crashes during Finish, it is expected that the file
+	// will be deleted on startup.
 	Finish() error

+	// StartMetadataPortion signals to the writer that the metadata part of the
+	// object starts here. If the object is being written to the cold tier, data
+	// in subsequent Write() calls will also be written to the hot tier.
+	//
+	// This function should be called at most once.
+	//
+	// An error means that we won't be able to successfully finish this object.
+	// TODO: document any constraints on when this can be called relative to Write().
+	StartMetadataPortion() error
+
 	// Abort gives up on finishing the object. There is no guarantee about whether
 	// the object exists after calling Abort.
 	// No further calls are allowed after calling Abort.
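The dual-write `Writable` implementation mentioned in the commit message is not part of this excerpt. Below is a hedged sketch of how StartMetadataPortion could behave under these assumptions: the coldWritable type, its metaPathFn field, and the use of vfs.WriteCategoryUnspecified with the two-argument vfs.FS.Create are assumptions, not the commit's actual code.

// Rough sketch of a dual-write Writable: all data goes to the cold tier, and
// everything written after StartMetadataPortion is also mirrored into a
// hot-tier "blobmeta" file.
package objstorageprovider

import (
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/vfs"
)

type coldWritable struct {
	cold       objstorage.Writable
	metaFS     vfs.FS
	metaPathFn func(startOffset int64) string // builds "<file-num>.blobmeta.<offset>"
	offset     int64                          // bytes written to the cold object so far
	metaFile   vfs.File                       // nil until StartMetadataPortion is called
}

var _ objstorage.Writable = (*coldWritable)(nil)

// Write is part of the objstorage.Writable interface. Data always goes to the
// cold tier; after StartMetadataPortion it is also copied to the hot tier.
func (w *coldWritable) Write(p []byte) error {
	if err := w.cold.Write(p); err != nil {
		return err
	}
	if w.metaFile != nil {
		if _, err := w.metaFile.Write(p); err != nil {
			return err
		}
	}
	w.offset += int64(len(p))
	return nil
}

// StartMetadataPortion creates the hot-tier metadata file; the filename
// encodes the current offset so that readers know where the mirrored suffix
// begins.
func (w *coldWritable) StartMetadataPortion() error {
	f, err := w.metaFS.Create(w.metaPathFn(w.offset), vfs.WriteCategoryUnspecified)
	if err != nil {
		return err
	}
	w.metaFile = f
	return nil
}

// Finish is part of the objstorage.Writable interface.
func (w *coldWritable) Finish() error {
	err := w.cold.Finish()
	if w.metaFile != nil {
		err = firstError(err, w.metaFile.Sync())
		err = firstError(err, w.metaFile.Close())
	}
	return err
}

// Abort is part of the objstorage.Writable interface.
func (w *coldWritable) Abort() {
	w.cold.Abort()
	if w.metaFile != nil {
		_ = w.metaFile.Close()
	}
}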
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
+// of this source code is governed by a BSD-style license that can be found in
+// the LICENSE file.
+
+package objstorageprovider
+
+import (
+	"context"
+	"sync"
+
+	"github.com/cockroachdb/pebble/objstorage"
+	"github.com/cockroachdb/pebble/vfs"
+)
+
+// newColdReadable returns an objstorage.Readable that reads the main data from
+// the wrapped "cold storage" readable, and the metadata from a separate file in
+// a local filesystem. The separate file contains a suffix of the full file,
+// starting at metaStartOffset.
+func newColdReadable(
+	cold objstorage.Readable, metaFS vfs.FS, metaFilepath string, metaStartOffset int64,
+) *coldReadable {
+	r := &coldReadable{
+		cold: cold,
+	}
+	r.meta.fs = metaFS
+	r.meta.filepath = metaFilepath
+	r.meta.startOffset = metaStartOffset
+	return r
+}
+
+type coldReadable struct {
+	cold objstorage.Readable
+
+	meta struct {
+		fs          vfs.FS
+		filepath    string
+		startOffset int64
+		once        struct {
+			sync.Once
+			file vfs.File
+			err  error
+		}
+	}
+}
+
+var _ objstorage.Readable = (*coldReadable)(nil)
+
+// readMetaAt reads from the metadata file at the given offset.
+func (r *coldReadable) readMetaAt(p []byte, off int64) error {
+	r.meta.once.Do(func() {
+		r.meta.once.file, r.meta.once.err = r.meta.fs.Open(r.meta.filepath, vfs.RandomReadsOption)
+	})
+	if r.meta.once.err != nil {
+		return r.meta.once.err
+	}
+	_, err := r.meta.once.file.ReadAt(p, off)
+	return err
+}
+
+// ReadAt is part of the objstorage.Readable interface.
+func (r *coldReadable) ReadAt(ctx context.Context, p []byte, off int64) error {
+	// We don't expect reads that span both regions, but in that case it is
+	// correct to read it all from the cold file (which contains all the data).
+	if off < r.meta.startOffset {
+		return r.cold.ReadAt(ctx, p, off)
+	}
+	return r.readMetaAt(p, off-r.meta.startOffset)
+}
+
+// Close is part of the objstorage.Readable interface.
+func (r *coldReadable) Close() error {
+	err := r.cold.Close()
+	if r.meta.once.file != nil {
+		err = firstError(err, r.meta.once.file.Close())
+		r.meta.once.file = nil
+	}
+	return err
+}
+
+// Size is part of the objstorage.Readable interface.
+func (r *coldReadable) Size() int64 {
+	return r.cold.Size()
+}
+
+// NewReadHandle is part of the objstorage.Readable interface.
+func (r *coldReadable) NewReadHandle(
+	readBeforeSize objstorage.ReadBeforeSize,
+) objstorage.ReadHandle {
+	return &coldReadHandle{
+		r:    r,
+		cold: r.cold.NewReadHandle(readBeforeSize),
+	}
+}
+
+type coldReadHandle struct {
+	r    *coldReadable
+	cold objstorage.ReadHandle
+}
+
+var _ objstorage.ReadHandle = (*coldReadHandle)(nil)
+
+// ReadAt is part of the objstorage.ReadHandle interface.
+func (rh *coldReadHandle) ReadAt(ctx context.Context, p []byte, off int64) error {
+	if off < rh.r.meta.startOffset {
+		// Read from cold storage only.
+		return rh.cold.ReadAt(ctx, p, off)
+	}
+	// Read from metadata only.
+	return rh.r.readMetaAt(p, off-rh.r.meta.startOffset)
+}
+
+// Close is part of the objstorage.ReadHandle interface.
+func (rh *coldReadHandle) Close() error {
+	return rh.cold.Close()
+}
+
+// SetupForCompaction is part of the objstorage.ReadHandle interface.
+func (rh *coldReadHandle) SetupForCompaction() {
+	rh.cold.SetupForCompaction()
+}
+
+// RecordCacheHit is part of the objstorage.ReadHandle interface.
+func (rh *coldReadHandle) RecordCacheHit(ctx context.Context, offset, size int64) {
+	// We don't use prefetching for the metadata portion, so we only need to
+	// report cache hits to the cold readable.
+	if offset < rh.r.meta.startOffset {
+		rh.cold.RecordCacheHit(ctx, offset, min(size, rh.r.meta.startOffset-offset))
+	}
+}

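The provider-side bookkeeping ("which hot metadata files exist and their start offsets") is also outside this excerpt. Below is a hedged sketch of how it might plug into newColdReadable; coldMetaTracker, openColdObject, and the tracking map are hypothetical names, not part of this commit.

// Hypothetical provider-side wiring: remember which cold objects have a hot
// blobmeta file and at which offset its contents start, and use that when
// constructing a readable.
package objstorageprovider

import (
	"fmt"
	"sync"

	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/vfs"
)

// coldMetaTracker maps a cold object's file number to the start offset of its
// hot-tier metadata file (if one exists).
type coldMetaTracker struct {
	mu   sync.Mutex
	meta map[base.DiskFileNum]int64
}

func (t *coldMetaTracker) lookup(fileNum base.DiskFileNum) (startOffset int64, ok bool) {
	t.mu.Lock()
	defer t.mu.Unlock()
	startOffset, ok = t.meta[fileNum]
	return startOffset, ok
}

// openColdObject wraps a cold-tier readable with the hot metadata file when
// one is known to exist; otherwise every read goes to cold storage.
func openColdObject(
	cold objstorage.Readable,
	tracker *coldMetaTracker,
	fs vfs.FS,
	dirname string,
	fileNum base.DiskFileNum,
) objstorage.Readable {
	startOffset, ok := tracker.lookup(fileNum)
	if !ok {
		return cold
	}
	metaPath := fmt.Sprintf("%s.%d",
		base.MakeFilepath(fs, dirname, base.FileTypeBlobMeta, fileNum), startOffset)
	return newColdReadable(cold, fs, metaPath, startOffset)
}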