From 97f2b521fb2bdf16fdc070a7c10ead60888e4fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Magiera?= Date: Mon, 30 Sep 2024 23:52:41 +0200 Subject: [PATCH 1/2] storage: Add post challenge reading metrics --- harmony/harmonydb/metrics.go | 9 +++--- lib/paths/local.go | 56 ++++++++++++++++++++++++++++++++++-- lib/paths/metrics.go | 48 +++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 6 deletions(-) create mode 100644 lib/paths/metrics.go diff --git a/harmony/harmonydb/metrics.go b/harmony/harmonydb/metrics.go index 7fa78936f..cc90b260e 100644 --- a/harmony/harmonydb/metrics.go +++ b/harmony/harmonydb/metrics.go @@ -5,8 +5,6 @@ import ( "go.opencensus.io/stats" "go.opencensus.io/stats/view" "go.opencensus.io/tag" - - "github.com/filecoin-project/lotus/metrics" ) var ( @@ -43,7 +41,7 @@ var DBMeasures = struct { // CacheViews groups all cache-related default views. func init() { - metrics.RegisterViews( + err := view.Register( &view.View{ Measure: DBMeasures.Hits, Aggregation: view.Sum(), @@ -65,7 +63,10 @@ func init() { TagKeys: []tag.Key{dbTag}, }, ) - err := prometheus.Register(DBMeasures.Waits) + if err != nil { + panic(err) + } + err = prometheus.Register(DBMeasures.Waits) if err != nil { panic(err) } diff --git a/lib/paths/local.go b/lib/paths/local.go index 6b8944983..d9c5f1671 100644 --- a/lib/paths/local.go +++ b/lib/paths/local.go @@ -4,6 +4,9 @@ import ( "bytes" "context" "encoding/json" + "fmt" + "go.opencensus.io/stats" + "go.opencensus.io/tag" "math/bits" "math/rand" "os" @@ -30,6 +33,9 @@ import ( "github.com/filecoin-project/lotus/storage/sealer/fsutil" ) +// time abow which a warn log will be emitted for slow PoSt reads +var SlowPoStCheckThreshold = 45 * time.Second + type LocalStorage interface { GetStorage() (storiface.StorageConfig, error) SetStorage(func(*storiface.StorageConfig)) error @@ -959,6 +965,13 @@ func (st *Local) GenerateSingleVanillaProof(ctx context.Context, minerID abi.Act if si.Update { src, si, err := st.AcquireSector(ctx, sr, storiface.FTUpdate|storiface.FTUpdateCache, storiface.FTNone, storiface.PathStorage, storiface.AcquireMove) if err != nil { + // Record the error with tags + ctx, _ = tag.New(ctx, + tag.Upsert(updateTagKey, fmt.Sprintf("%t", si.Update)), + tag.Upsert(cacheIDTagKey, ""), + tag.Upsert(sealedIDTagKey, ""), + ) + stats.Record(ctx, GenerateSingleVanillaProofErrors.M(1)) return nil, xerrors.Errorf("acquire sector: %w", err) } cache, sealed = src.UpdateCache, src.Update @@ -966,6 +979,13 @@ func (st *Local) GenerateSingleVanillaProof(ctx context.Context, minerID abi.Act } else { src, si, err := st.AcquireSector(ctx, sr, storiface.FTSealed|storiface.FTCache, storiface.FTNone, storiface.PathStorage, storiface.AcquireMove) if err != nil { + // Record the error with tags + ctx, _ = tag.New(ctx, + tag.Upsert(updateTagKey, fmt.Sprintf("%t", si.Update)), + tag.Upsert(cacheIDTagKey, ""), + tag.Upsert(sealedIDTagKey, ""), + ) + stats.Record(ctx, GenerateSingleVanillaProofErrors.M(1)) return nil, xerrors.Errorf("acquire sector: %w", err) } cache, sealed = src.Cache, src.Sealed @@ -973,9 +993,29 @@ func (st *Local) GenerateSingleVanillaProof(ctx context.Context, minerID abi.Act } if sealed == "" || cache == "" { + // Record the error with tags + ctx, _ = tag.New(ctx, + tag.Upsert(updateTagKey, fmt.Sprintf("%t", si.Update)), + tag.Upsert(cacheIDTagKey, cacheID), + tag.Upsert(sealedIDTagKey, sealedID), + ) + stats.Record(ctx, GenerateSingleVanillaProofErrors.M(1)) return nil, errPathNotFound } + // Add metrics context with tags + ctx, err := tag.New(ctx, + tag.Upsert(updateTagKey, fmt.Sprintf("%t", si.Update)), + tag.Upsert(cacheIDTagKey, cacheID), + tag.Upsert(sealedIDTagKey, sealedID), + ) + if err != nil { + log.Errorw("failed to create tagged context", "err", err) + } + + // Record that the function was called + stats.Record(ctx, GenerateSingleVanillaProofCalls.M(1)) + psi := ffi.PrivateSectorInfo{ SectorInfo: proof.SectorInfo{ SealProof: si.SealProof, @@ -996,11 +1036,23 @@ func (st *Local) GenerateSingleVanillaProof(ctx context.Context, minerID abi.Act select { case r := <-resCh: + // Record the duration upon successful completion + duration := time.Since(start).Milliseconds() + stats.Record(ctx, GenerateSingleVanillaProofDuration.M(duration)) + + if duration > SlowPoStCheckThreshold.Milliseconds() { + log.Warnw("slow GenerateSingleVanillaProof", "duration", duration, "cache-id", cacheID, "sealed-id", sealedID, "cache", cache, "sealed", sealed, "sector", si) + } + return r.Unwrap() case <-ctx.Done(): - log.Errorw("failed to generate valilla PoSt proof before context cancellation", "err", ctx.Err(), "duration", time.Since(start), "cache-id", cacheID, "sealed-id", sealedID, "cache", cache, "sealed", sealed) + // Record the duration and error if the context is canceled + duration := time.Since(start).Milliseconds() + stats.Record(ctx, GenerateSingleVanillaProofDuration.M(duration)) + stats.Record(ctx, GenerateSingleVanillaProofErrors.M(1)) + log.Errorw("failed to generate vanilla PoSt proof before context cancellation", "err", ctx.Err(), "duration", duration, "cache-id", cacheID, "sealed-id", sealedID, "cache", cache, "sealed", sealed) - // this will leave the GenerateSingleVanillaProof goroutine hanging, but that's still less bad than failing PoSt + // This will leave the GenerateSingleVanillaProof goroutine hanging, but that's still less bad than failing PoSt return nil, xerrors.Errorf("failed to generate vanilla proof before context cancellation: %w", ctx.Err()) } } diff --git a/lib/paths/metrics.go b/lib/paths/metrics.go new file mode 100644 index 000000000..2523263c7 --- /dev/null +++ b/lib/paths/metrics.go @@ -0,0 +1,48 @@ +package paths + +import ( + "go.opencensus.io/stats" + "go.opencensus.io/stats/view" + "go.opencensus.io/tag" +) + +var ( + updateTagKey, _ = tag.NewKey("update") + cacheIDTagKey, _ = tag.NewKey("cache_id") + sealedIDTagKey, _ = tag.NewKey("sealed_id") + + pre = "curio_stor_" + + // Buckets for the duration histogram (in seconds) + durationBuckets = []float64{0.1, 1, 5, 12, 20, 60, 90, 150, 300, 500, 900, 1500, 3000, 6000, 15000, 30000, 60000, 90000, 200_000, 600_000, 1000_000} +) + +var ( + // Measures + GenerateSingleVanillaProofCalls = stats.Int64(pre+"generate_single_vanilla_proof_calls", "Number of calls to GenerateSingleVanillaProof", stats.UnitDimensionless) + GenerateSingleVanillaProofErrors = stats.Int64(pre+"generate_single_vanilla_proof_errors", "Number of errors in GenerateSingleVanillaProof", stats.UnitDimensionless) + GenerateSingleVanillaProofDuration = stats.Int64(pre+"generate_single_vanilla_proof_duration_seconds", "Duration of GenerateSingleVanillaProof in seconds", stats.UnitMilliseconds) +) + +func init() { + err := view.Register( + &view.View{ + Measure: GenerateSingleVanillaProofCalls, + Aggregation: view.Count(), + TagKeys: []tag.Key{updateTagKey, cacheIDTagKey, sealedIDTagKey}, + }, + &view.View{ + Measure: GenerateSingleVanillaProofErrors, + Aggregation: view.Count(), + TagKeys: []tag.Key{updateTagKey, cacheIDTagKey, sealedIDTagKey}, + }, + &view.View{ + Measure: GenerateSingleVanillaProofDuration, + Aggregation: view.Distribution(durationBuckets...), + TagKeys: []tag.Key{updateTagKey, cacheIDTagKey, sealedIDTagKey}, + }, + ) + if err != nil { + panic(err) + } +} From 8c05c5901c60206a2b87efe39f3d56d5ebfc6ca1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Magiera?= Date: Wed, 9 Oct 2024 23:36:20 +0200 Subject: [PATCH 2/2] make gen --- lib/paths/local.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/paths/local.go b/lib/paths/local.go index d9c5f1671..169a46387 100644 --- a/lib/paths/local.go +++ b/lib/paths/local.go @@ -5,8 +5,6 @@ import ( "context" "encoding/json" "fmt" - "go.opencensus.io/stats" - "go.opencensus.io/tag" "math/bits" "math/rand" "os" @@ -18,6 +16,8 @@ import ( "time" "github.com/ipfs/go-cid" + "go.opencensus.io/stats" + "go.opencensus.io/tag" "golang.org/x/xerrors" ffi "github.com/filecoin-project/filecoin-ffi"