2 changes: 1 addition & 1 deletion go.mod
@@ -6,7 +6,7 @@ require (
github.com/RaduBerinde/axisds v0.0.0-20250419182453-5135a0650657
github.com/cespare/xxhash/v2 v2.2.0
github.com/cockroachdb/crlib v0.0.0-20251122031428-fe658a2dbda1
-github.com/cockroachdb/datadriven v1.0.3-0.20250911232732-d959cf14706c
+github.com/cockroachdb/datadriven v1.0.3-0.20251123150250-ddff6747b112
github.com/cockroachdb/errors v1.11.3
github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895
github.com/cockroachdb/redact v1.1.5
4 changes: 2 additions & 2 deletions go.sum
@@ -29,8 +29,8 @@ github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cockroachdb/crlib v0.0.0-20251122031428-fe658a2dbda1 h1:iX0YCYC5Jbt2/g7zNTP/QxhrV8Syp5kkzNiERKeN1uE=
github.com/cockroachdb/crlib v0.0.0-20251122031428-fe658a2dbda1/go.mod h1:NjNuToN/FbhwH1cCyM9G4Rhtxx+ZaOgtoqFR+thng7w=
-github.com/cockroachdb/datadriven v1.0.3-0.20250911232732-d959cf14706c h1:a0m7gmtv2mzJQ4wP9BkxCmJAnjZ7fsvCi2IORGD1als=
-github.com/cockroachdb/datadriven v1.0.3-0.20250911232732-d959cf14706c/go.mod h1:jsaKMvD3RBCATk1/jbUZM8C9idWBJME9+VRZ5+Liq1g=
+github.com/cockroachdb/datadriven v1.0.3-0.20251123150250-ddff6747b112 h1:T1++5Vt0/4/IWZ1mHmUYl7fhQnz50QhNWIY+ITvLLIM=
+github.com/cockroachdb/datadriven v1.0.3-0.20251123150250-ddff6747b112/go.mod h1:jsaKMvD3RBCATk1/jbUZM8C9idWBJME9+VRZ5+Liq1g=
github.com/cockroachdb/errors v1.11.3 h1:5bA+k2Y6r+oz/6Z/RFlNeVCesGARKuC6YymtcDrbC/I=
github.com/cockroachdb/errors v1.11.3/go.mod h1:m4UIW4CDjx+R5cybPsNrRbreomiFqt8o1h1wUVazSd8=
github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZeQy818SGhaone5OnYfxFR/+AzdY3sf5aE=
19 changes: 19 additions & 0 deletions internal/base/filenames.go
@@ -128,6 +128,12 @@ const (
FileTypeOldTemp
FileTypeTemp
FileTypeBlob
+// FileTypeBlobMeta is a file that contains only the metadata portion of a
+// blob file (used when the blob file is on cold storage). The filename for
+// blobmeta files is of the form `<file-num>.blobmeta.<offset>`, where
+// <offset> indicates that the file mirrors the contents of the corresponding
+// blob file starting at this offset.
+FileTypeBlobMeta
)

var fileTypeStrings = [...]string{
@@ -139,6 +145,7 @@ var fileTypeStrings = [...]string{
FileTypeOldTemp: "old-temp",
FileTypeTemp: "temp",
FileTypeBlob: "blob",
+FileTypeBlobMeta: "blobmeta",
}

// FileTypeFromName parses a FileType from its string representation.
@@ -166,6 +173,8 @@ func (ft FileType) String() string {
}

// MakeFilename builds a filename from components.
+//
+// Note that for FileTypeBlobMeta, ".<offset>" must be appended to the filename.
func MakeFilename(fileType FileType, dfn DiskFileNum) string {
// Make a buffer sufficiently large for most possible filenames, especially
// the common case of a numbered table or blob file.
@@ -192,18 +201,25 @@ func appendFilename(buf []byte, fileType FileType, dfn DiskFileNum) []byte {
buf = fmt.Appendf(buf, "temporary.%06d.dbtmp", uint64(dfn))
case FileTypeBlob:
buf = fmt.Appendf(buf, "%06d.blob", uint64(dfn))
+case FileTypeBlobMeta:
+buf = fmt.Appendf(buf, "%06d.blobmeta", uint64(dfn))
default:
panic("unreachable")
}
return buf
}

// MakeFilepath builds a filepath from components.
+//
+// Note that for FileTypeBlobMeta, ".<offset>" must be appended to the filepath.
func MakeFilepath(fs vfs.FS, dirname string, fileType FileType, dfn DiskFileNum) string {
return fs.PathJoin(dirname, MakeFilename(fileType, dfn))
}

// ParseFilename parses the components from a filename.
+//
+// Note that the offset component of a FileTypeBlobMeta is not parsed by this
+// function.
func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileNum, ok bool) {
filename = fs.PathBase(filename)
switch {
@@ -250,6 +266,9 @@ func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileN
case "blob":
return FileTypeBlob, dfn, true
}
+if strings.HasPrefix(filename[i+1:], "blobmeta.") {
+return FileTypeBlobMeta, dfn, true
+}
}
return 0, dfn, false
}
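To make the new naming scheme concrete, here is a minimal round-trip sketch, assuming it is compiled inside the pebble repository (internal/base is not importable from outside); the file number 7 and offset 4096 are arbitrary:

package main

import (
	"fmt"

	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/vfs"
)

func main() {
	fs := vfs.NewMem()
	// MakeFilename emits only the "<file-num>.blobmeta" prefix...
	name := base.MakeFilename(base.FileTypeBlobMeta, base.DiskFileNum(7)) // "000007.blobmeta"
	// ...and the caller appends the offset at which the metadata mirror starts.
	path := fmt.Sprintf("%s.%d", name, 4096) // "000007.blobmeta.4096"
	// ParseFilename recovers the type and file number; the offset suffix is
	// deliberately not parsed.
	typ, dfn, ok := base.ParseFilename(fs, path)
	fmt.Println(typ == base.FileTypeBlobMeta, dfn == base.DiskFileNum(7), ok) // true true true
}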
13 changes: 13 additions & 0 deletions internal/base/filenames_test.go
@@ -7,6 +7,7 @@ package base
import (
"bytes"
"fmt"
"math/rand/v2"
"os"
"testing"

@@ -49,6 +50,7 @@ func TestParseFilename(t *testing.T) {
"000000.blob": true,
"000001.blob": true,
"935203523.blob": true,
"000001.blobmeta.0": true,
}
fs := vfs.NewMem()
for tc, want := range testCases {
@@ -95,6 +97,17 @@ func TestFilenameRoundTrip(t *testing.T) {
}
}

+func TestFilenameBlobMeta(t *testing.T) {
+fileNum := DiskFileNum(rand.Uint64())
+offset := rand.Int64()
+fs := vfs.NewMem()
+path := fmt.Sprintf("%s.%d", MakeFilepath(fs, "foo", FileTypeBlobMeta, fileNum), offset)
+typ, fn, ok := ParseFilename(fs, path)
+require.True(t, ok)
+require.Equal(t, FileTypeBlobMeta, typ)
+require.Equal(t, fileNum, fn)
+}

type bufferFataler struct {
buf bytes.Buffer
}
14 changes: 14 additions & 0 deletions objstorage/objstorage.go
@@ -107,8 +107,22 @@ type Writable interface {

// Finish completes the object and makes the data durable.
// No further calls are allowed after calling Finish.
+//
+// If Finish fails, it is expected that the caller will delete the created
+// object. If the process crashes during Finish, it is expected that the file
+// will be deleted on startup.
Finish() error

+// StartMetadataPortion signals to the writer that the metadata part of the
+// object starts here. If the object is being written to the cold tier, data
+// in subsequent Write() calls will also be written to the hot tier.
+//
+// The function must be called at most once.
+//
+// An error means that we won't be able to successfully finish this object.
+//
+// TODO: document any constraints on when this can be called relative to Write().
+StartMetadataPortion() error

// Abort gives up on finishing the object. There is no guarantee about whether
// the object exists after calling Abort.
// No further calls are allowed after calling Abort.
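To make the intended calling convention concrete, here is a hedged sketch; writeBlobWithMeta is a hypothetical helper (not part of this change) showing data written first, then StartMetadataPortion, then the metadata, then Finish:

// writeBlobWithMeta is a hypothetical helper illustrating the contract: data
// blocks first, then StartMetadataPortion, then the metadata, then Finish. On
// the cold tier, bytes written after StartMetadataPortion are mirrored to the
// hot tier so they can later be served from the local blobmeta file.
func writeBlobWithMeta(w objstorage.Writable, data, meta []byte) error {
	if err := w.Write(data); err != nil {
		w.Abort()
		return err
	}
	if err := w.StartMetadataPortion(); err != nil {
		// An error here means the object cannot be finished.
		w.Abort()
		return err
	}
	if err := w.Write(meta); err != nil {
		w.Abort()
		return err
	}
	// If Finish fails, the caller is expected to delete the object.
	return w.Finish()
}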
129 changes: 129 additions & 0 deletions objstorage/objstorageprovider/cold_readable.go
@@ -0,0 +1,129 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package objstorageprovider

import (
	"context"
	"sync"

	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/vfs"
)

// newColdReadable returns an objstorage.Readable that reads the main data from
// the wrapped "cold storage" readable, and the metadata from a separate file on
// a local filesystem. The separate file contains a suffix of the full file,
// starting at metaStartOffset.
func newColdReadable(
	cold objstorage.Readable, metaFS vfs.FS, metaFilepath string, metaStartOffset int64,
) *coldReadable {
	r := &coldReadable{
		cold: cold,
	}
	r.meta.fs = metaFS
	r.meta.filepath = metaFilepath
	r.meta.startOffset = metaStartOffset
	return r
}

type coldReadable struct {
	cold objstorage.Readable

	meta struct {
		fs          vfs.FS
		filepath    string
		startOffset int64
		once        struct {
			sync.Once
			file vfs.File
			err  error
		}
	}
}

var _ objstorage.Readable = (*coldReadable)(nil)

// readMetaAt reads from the metadata file at the given offset.
func (r *coldReadable) readMetaAt(p []byte, off int64) error {
	r.meta.once.Do(func() {
		r.meta.once.file, r.meta.once.err = r.meta.fs.Open(r.meta.filepath, vfs.RandomReadsOption)
	})
	if r.meta.once.err != nil {
		return r.meta.once.err
	}
	_, err := r.meta.once.file.ReadAt(p, off)
	return err
}

// ReadAt is part of the objstorage.Readable interface.
func (r *coldReadable) ReadAt(ctx context.Context, p []byte, off int64) error {
	// We don't expect reads that span both regions, but if one occurs it is
	// correct to serve it entirely from the cold file (which contains all the
	// data).
	if off < r.meta.startOffset {
		return r.cold.ReadAt(ctx, p, off)
	}
	return r.readMetaAt(p, off-r.meta.startOffset)
}

// Close is part of the objstorage.Readable interface.
func (r *coldReadable) Close() error {
	err := r.cold.Close()
	if r.meta.once.file != nil {
		err = firstError(err, r.meta.once.file.Close())
		r.meta.once.file = nil
	}
	return err
}

// Size is part of the objstorage.Readable interface.
func (r *coldReadable) Size() int64 {
	return r.cold.Size()
}

// NewReadHandle is part of the objstorage.Readable interface.
func (r *coldReadable) NewReadHandle(
	readBeforeSize objstorage.ReadBeforeSize,
) objstorage.ReadHandle {
	return &coldReadHandle{
		r:    r,
		cold: r.cold.NewReadHandle(readBeforeSize),
	}
}

type coldReadHandle struct {
	r    *coldReadable
	cold objstorage.ReadHandle
}

var _ objstorage.ReadHandle = (*coldReadHandle)(nil)

// ReadAt is part of the objstorage.ReadHandle interface.
func (rh *coldReadHandle) ReadAt(ctx context.Context, p []byte, off int64) error {
	if off < rh.r.meta.startOffset {
		// Read from cold storage only.
		return rh.cold.ReadAt(ctx, p, off)
	}
	// Read from metadata only.
	return rh.r.readMetaAt(p, off-rh.r.meta.startOffset)
}

// Close is part of the objstorage.ReadHandle interface.
func (rh *coldReadHandle) Close() error {
	return rh.cold.Close()
}

// SetupForCompaction is part of the objstorage.ReadHandle interface.
func (rh *coldReadHandle) SetupForCompaction() {
	rh.cold.SetupForCompaction()
}

// RecordCacheHit is part of the objstorage.ReadHandle interface.
func (rh *coldReadHandle) RecordCacheHit(ctx context.Context, offset, size int64) {
	// We don't use prefetching for the metadata portion, so we only need to
	// report cache hits to the cold readable.
	if offset < rh.r.meta.startOffset {
		rh.cold.RecordCacheHit(ctx, offset, min(size, rh.r.meta.startOffset-offset))
	}
}
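To illustrate the read routing, here is a hypothetical sketch (it would have to live in package objstorageprovider, since coldReadable is unexported); the 1<<20 metadata start offset and the filepath are made up for the example:

// exampleColdReads is a hypothetical sketch of how reads are routed: a read
// below metaStart goes to the cold readable, while a read at or above it is
// rebased into the local blobmeta file's coordinate space.
func exampleColdReads(ctx context.Context, cold objstorage.Readable, fs vfs.FS) error {
	const metaStart = int64(1) << 20
	r := newColdReadable(cold, fs, "000007.blobmeta.1048576", metaStart)
	defer r.Close()

	buf := make([]byte, 512)
	// Entirely below metaStart: served by the cold readable.
	if err := r.ReadAt(ctx, buf, 0); err != nil {
		return err
	}
	// At metaStart: served from the local file, which stores the suffix of the
	// object, so the file-relative offset here is 0.
	return r.ReadAt(ctx, buf, metaStart)
}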