Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/debuginfo/metadata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ func TestMetadata(t *testing.T) {
store, err := NewStore(
tracer,
logger,
prometheus.NewRegistry(),
cacheDir,
NewObjectStoreMetadata(logger, bucket),
bucket,
Expand Down
97 changes: 88 additions & 9 deletions pkg/debuginfo/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/nanmu42/limitio"
"github.com/prometheus/client_golang/prometheus"
"github.com/thanos-io/objstore"
"github.com/thanos-io/objstore/client"
"go.opentelemetry.io/otel/attribute"
Expand Down Expand Up @@ -79,28 +80,96 @@ type Store struct {

metadata MetadataManager
debuginfodClient DebugInfodClient

validationAttemptsTotal prometheus.Counter
validationErrorsTotal prometheus.CounterVec
metadataUpdateDuration prometheus.Histogram
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should encapsulate this in metadata types and files rather than here.

Copy link
Author

@ksankeerth ksankeerth Sep 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A small doubt on this. I think We can move metadataUpdateDuration to metadata.go and encapsulate inside ObjectStoreMetadata struct. I'm not sure this comment for validationAttemptsTotal and validationErrorsTotal. Currently validation metrics are around this

if err := elfutils.ValidateFile(objFile); err != nil {

Could you please clarify this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we don't need those at all. Considering we'll always try to validate these files, maybe just having the duration is enough. What do you think?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ksankeerth, I think we can get rid of additional validation attempts and error metrics. And add additional reason ("validation") to the actual action metrics (e.g upload).

nice. It's clear now. I'll update the PR

uploadDebugInfoAttemptsTotal prometheus.Counter
uploadDebugInfoErrorsTotal prometheus.CounterVec
uploadDebugInfoDuration prometheus.Histogram
existsCheckDuration prometheus.Histogram
}

// NewStore returns a new debug info store.
func NewStore(
tracer trace.Tracer,
logger log.Logger,
reg prometheus.Registerer,
cacheDir string,
metadata MetadataManager,
bucket objstore.Bucket,
debuginfodClient DebugInfodClient,
) (*Store, error) {
uploadDebugInfoAttemptsTotal := prometheus.NewCounter(
prometheus.CounterOpts{
Name: "debuginfo_upload_attempts_total",
Help: "Total attempts to upload debuginfo.",
},
)
uploadDebugInfoErrorsTotal := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "debuginfo_upload_errors_total",
Help: "Total number of errors in uploading debuginfo.",
},
[]string{"reason"},
)
uploadDebugInfoDuration := prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "debuginfo_upload_duration_seconds",
Help: "How long it took in seconds to upload debuginfo.",
Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
},
)

existsCheckDuration := prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "debuginfo_exists_check_duration_seconds",
Help: "How long it took in seconds to check existing debuginfo.",
Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
},
)

metadataUpdateDuration := prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "debuginfo_metadata_update_duration_seconds",
Help: "How long it took in seconds to finish updating metadata.",
Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
},
)
validationAttemptsTotal := prometheus.NewCounter(
prometheus.CounterOpts{
Name: "debuginfo_validate_object_file_attempts_total",
Help: "Total number of validation of object file.",
},
)
validationErrorsTotal := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "debuginfo_validate_object_file_errors_total",
Help: "Total number of errors in validationg object file.",
},
[]string{"reason"},
)
return &Store{
tracer: tracer,
logger: log.With(logger, "component", "debuginfo"),
bucket: bucket,
cacheDir: cacheDir,
metadata: metadata,
debuginfodClient: debuginfodClient,
tracer: tracer,
logger: log.With(logger, "component", "debuginfo"),
bucket: bucket,
cacheDir: cacheDir,
metadata: metadata,
debuginfodClient: debuginfodClient,
metadataUpdateDuration: metadataUpdateDuration,
validationAttemptsTotal: validationAttemptsTotal,
validationErrorsTotal: *validationErrorsTotal,
uploadDebugInfoAttemptsTotal: uploadDebugInfoAttemptsTotal,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
uploadDebugInfoAttemptsTotal: uploadDebugInfoAttemptsTotal,
debugInfoUploadAttemptsTotal: uploadDebugInfoAttemptsTotal,

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
uploadDebugInfoAttemptsTotal: uploadDebugInfoAttemptsTotal,
debugInfoUploadAttemptsTotal: uploadDebugInfoAttemptsTotal,

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion. I'll make the changes.

uploadDebugInfoErrorsTotal: *uploadDebugInfoErrorsTotal,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
uploadDebugInfoErrorsTotal: *uploadDebugInfoErrorsTotal,
debugInfoUploadErrorsTotal: *uploadDebugInfoErrorsTotal,

Copy link
Author

@ksankeerth ksankeerth Sep 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1. will make the changes

uploadDebugInfoDuration: uploadDebugInfoDuration,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
uploadDebugInfoDuration: uploadDebugInfoDuration,
debugInfoUploadDuration: uploadDebugInfoDuration,

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will make the changes

existsCheckDuration: existsCheckDuration,
}, nil
}

func (s *Store) Exists(ctx context.Context, req *debuginfopb.ExistsRequest) (*debuginfopb.ExistsResponse, error) {
defer func(begin time.Time) {
s.existsCheckDuration.Observe(time.Since(begin).Seconds())
}(time.Now())
span := trace.SpanFromContext(ctx)
span.SetAttributes(attribute.String("build_id", req.GetBuildId()))

Expand Down Expand Up @@ -140,8 +209,13 @@ func (s *Store) Exists(ctx context.Context, req *debuginfopb.ExistsRequest) (*de
}

func (s *Store) Upload(stream debuginfopb.DebugInfoService_UploadServer) error {
defer func(begin time.Time) {
s.uploadDebugInfoDuration.Observe(time.Since(begin).Seconds())
}(time.Now())
s.uploadDebugInfoAttemptsTotal.Inc()
req, err := stream.Recv()
if err != nil {
s.uploadDebugInfoErrorsTotal.WithLabelValues("stream_receive").Inc()
msg := "failed to receive upload info"
level.Error(s.logger).Log("msg", msg, "err", err)
return status.Errorf(codes.Unknown, msg)
Expand All @@ -159,6 +233,7 @@ func (s *Store) Upload(stream debuginfopb.DebugInfoService_UploadServer) error {
span.SetAttributes(attribute.String("hash", hash))

if err := s.upload(ctx, buildID, hash, r); err != nil {
s.uploadDebugInfoErrorsTotal.WithLabelValues("store_upload").Inc()
return err
}

Expand Down Expand Up @@ -223,13 +298,14 @@ func (s *Store) upload(ctx context.Context, buildID, hash string, r io.Reader) e
if err != nil {
return status.Error(codes.Internal, err.Error())
}

s.validationAttemptsTotal.Inc()
if err := elfutils.ValidateFile(objFile); err != nil {
// Failed to validate. Mark the file as corrupted, and let the client try to upload it again.
if err := s.metadata.MarkAsCorrupted(ctx, buildID); err != nil {
level.Warn(s.logger).Log("msg", "failed to update metadata as corrupted", "err", err)
}
level.Error(s.logger).Log("msg", "failed to validate object file", "buildid", buildID)
s.validationErrorsTotal.WithLabelValues("validate_object_file").Inc()
// Client will retry.
return status.Error(codes.Internal, err.Error())
}
Expand All @@ -242,9 +318,9 @@ func (s *Store) upload(ctx context.Context, buildID, hash string, r io.Reader) e
hasDWARF, err := elfutils.HasDWARF(f)
if err != nil {
level.Debug(s.logger).Log("msg", "failed to check for DWARF", "err", err)
s.validationErrorsTotal.WithLabelValues("has_dwarf_object_file").Inc()
}
f.Close()

if hasDWARF {
return status.Error(codes.AlreadyExists, "debuginfo already exists")
}
Expand All @@ -253,7 +329,9 @@ func (s *Store) upload(ctx context.Context, buildID, hash string, r io.Reader) e

// At this point we know that we received a better version of the debug information file,
// so let the client upload it.

defer func(begin time.Time) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is semantically correct. There are lots of other operations happening in the rest of the method.

Copy link
Author

@ksankeerth ksankeerth Sep 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes. thanks for correcting it. I think I cannot use defer here. I'll update the PR

s.metadataUpdateDuration.Observe(time.Since(begin).Seconds())
}(time.Now())
if err := s.metadata.MarkAsUploading(ctx, buildID); err != nil {
err = fmt.Errorf("failed to update metadata before uploading: %w", err)
return status.Error(codes.Internal, err.Error())
Expand All @@ -279,6 +357,7 @@ func (s *Store) upload(ctx context.Context, buildID, hash string, r io.Reader) e
}

if err := elfutils.ValidateHeader(b); err != nil {
s.validationErrorsTotal.WithLabelValues("validate_header_object_file").Inc()
// Failed to validate. Mark the incoming stream as corrupted, and let the client try to upload it again.
if err := s.metadata.MarkAsCorrupted(ctx, buildID); err != nil {
err = fmt.Errorf("failed to update metadata after uploaded, as corrupted: %w", err)
Expand Down
1 change: 1 addition & 0 deletions pkg/debuginfo/store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func TestStore(t *testing.T) {
s, err := NewStore(
tracer,
logger,
prometheus.NewRegistry(),
cacheDir,
NewObjectStoreMetadata(logger, bucket),
bucket,
Expand Down
1 change: 1 addition & 0 deletions pkg/parca/parca.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ func Run(ctx context.Context, logger log.Logger, reg *prometheus.Registry, flags
dbgInfo, err := debuginfo.NewStore(
tracerProvider.Tracer("debuginfo"),
logger,
reg,
flags.DebuginfoCacheDir,
dbgInfoMetadata,
objstore.NewPrefixedBucket(bucket, "debuginfo"),
Expand Down
1 change: 1 addition & 0 deletions pkg/symbolizer/symbolizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,7 @@ func setup(t *testing.T) (*grpc.ClientConn, pb.MetastoreServiceClient, *Symboliz
dbgStr, err := debuginfo.NewStore(
tracer,
logger,
prometheus.NewRegistry(),
debugInfoCacheDir,
metadata,
bucket,
Expand Down