@@ -10,6 +10,7 @@ import (
1010
1111 "github.com/go-kit/log"
1212 "github.com/go-kit/log/level"
13+ "github.com/prometheus/client_golang/prometheus"
1314 "github.com/thanos-io/objstore"
1415 "google.golang.org/grpc/codes"
1516 "google.golang.org/grpc/status"
@@ -36,18 +37,20 @@ type Recovery struct {
3637 logger log.Logger
3738 metastore Metastore
3839 bucket objstore.Bucket
40+ metrics * metrics
3941
4042 started bool
4143 cancel func ()
4244 m sync.Mutex
4345}
4446
45- func NewRecovery (logger log.Logger , config Config , metastore Metastore , bucket objstore.Bucket ) * Recovery {
// NewRecovery returns a Recovery populated with the given config, logger,
// metastore and bucket, plus the metrics value produced by newMetrics(reg).
// The started, cancel and mutex fields are not set here and keep their zero
// values.
//
// NOTE(review): reg is only forwarded to newMetrics; whether a nil Registerer
// is acceptable depends on newMetrics, which is not visible here — confirm.
47+ func NewRecovery (logger log.Logger , config Config , metastore Metastore , bucket objstore.Bucket , reg prometheus. Registerer ) * Recovery {
4648 return & Recovery {
4749 config : config ,
4850 logger : logger ,
4951 metastore : metastore ,
5052 bucket : bucket ,
53+ metrics : newMetrics (reg ),
5154 }
5255}
5356
@@ -121,37 +124,46 @@ func (r *Recovery) recover(ctx context.Context, path string) (err error) {
121124 switch {
122125 case err == nil :
123126 case errors .Is (err , context .Canceled ):
127+ r .metrics .recoveryAttempts .WithLabelValues ("canceled" ).Inc ()
124128 return err
125129 case r .bucket .IsObjNotFoundErr (err ):
126130 // This is somewhat opportunistic: the error is likely caused by a competing recovery
127131 // process that has already recovered the block, before we've discovered that the
128132 // leadership has changed.
133+ r .metrics .recoveryAttempts .WithLabelValues ("not_found" ).Inc ()
129134 level .Warn (r .logger ).Log ("msg" , "block metadata not found; skipping" , "path" , path )
130135 return nil
131136 default :
132137 // This is somewhat opportunistic, as we don't know if the error is transient or not.
133138 // We should consider an explicit retry mechanism with backoff and a limit on the
134139 // number of attempts.
140+ r .metrics .recoveryAttempts .WithLabelValues ("read_error" ).Inc ()
135141 level .Warn (r .logger ).Log ("msg" , "failed to read block metadata; to be retried" , "err" , err , "path" , path )
136142 return err
137143 }
138144
139145 var meta metastorev1.BlockMeta
140146 if err = meta .UnmarshalVT (b ); err != nil {
141- level .Error (r .logger ).Log ("msg" , "invalid block metadata; skipping" , "err" , err , "path" , path )
147+ r .metrics .recoveryAttempts .WithLabelValues ("unmarshal_error" ).Inc ()
148+ level .Error (r .logger ).Log ("msg" , "failed to unmarshal block metadata; skipping" , "err" , err , "path" , path )
142149 return nil
143150 }
144151
145152 switch _ , err = r .metastore .AddRecoveredBlock (ctx , & metastorev1.AddBlockRequest {Block : & meta }); {
146153 case err == nil :
154+ r .metrics .recoveryAttempts .WithLabelValues ("success" ).Inc ()
155+ level .Debug (r .logger ).Log ("msg" , "successfully recovered block from DLQ" , "block_id" , meta .Id , "path" , path )
147156 return nil
148157 case status .Code (err ) == codes .InvalidArgument :
149- level .Error (r .logger ).Log ("msg" , "invalid block metadata" , "err" , err , "path" , path )
158+ r .metrics .recoveryAttempts .WithLabelValues ("invalid_metadata" ).Inc ()
159+ level .Error (r .logger ).Log ("msg" , "block metadata rejected by metastore; skipping" , "err" , err , "block_id" , meta .Id , "path" , path )
150160 return nil
151161 case raftnode .IsRaftLeadershipError (err ):
162+ r .metrics .recoveryAttempts .WithLabelValues ("leadership_change" ).Inc ()
152163 level .Warn (r .logger ).Log ("msg" , "leadership change; recovery interrupted" , "err" , err , "path" , path )
153164 return err
154165 default :
166+ r .metrics .recoveryAttempts .WithLabelValues ("metastore_error" ).Inc ()
155167 level .Error (r .logger ).Log ("msg" , "failed to add block metadata; to be retried" , "err" , err , "path" , path )
156168 return err
157169 }
0 commit comments