Auto-create tracing spans for log groups #2309

Open · wants to merge 2 commits into main
1 change: 1 addition & 0 deletions agent/agent_configuration.go
@@ -57,4 +57,5 @@ type AgentConfiguration struct {
AcquireJob string
TracingBackend string
TracingServiceName string
TraceLogGroups bool
}
1 change: 1 addition & 0 deletions agent/job_runner.go
@@ -523,6 +523,7 @@ func (r *JobRunner) createEnvironment() ([]string, error) {
if r.conf.AgentConfiguration.TracingBackend != "" {
env["BUILDKITE_TRACING_BACKEND"] = r.conf.AgentConfiguration.TracingBackend
env["BUILDKITE_TRACING_SERVICE_NAME"] = r.conf.AgentConfiguration.TracingServiceName
env["BUILDKITE_TRACE_LOG_GROUPS"] = fmt.Sprintf("%t", r.conf.AgentConfiguration.TraceLogGroups)
}

// see documentation for BuildkiteMessageMax
13 changes: 12 additions & 1 deletion clicommand/agent_start.go
@@ -142,6 +142,7 @@ type AgentStartConfig struct {
MetricsDatadogDistributions bool `cli:"metrics-datadog-distributions"`
TracingBackend string `cli:"tracing-backend"`
TracingServiceName string `cli:"tracing-service-name"`
TraceLogGroups bool `cli:"trace-log-groups"`

// Global flags
Debug bool `cli:"debug"`
@@ -582,6 +583,11 @@ var AgentStartCommand = cli.Command{
EnvVar: "BUILDKITE_TRACING_SERVICE_NAME",
Value: "buildkite-agent",
},
cli.BoolFlag{
Name: "trace-log-groups",
Usage: "Automatically creates tracing spans for log groups when tracing is enabled.",
EnvVar: "BUILDKITE_TRACE_LOG_GROUPS",
},
cli.StringFlag{
Name: "job-verification-key-path",
Usage: "Path to a file containing a verification key. Passing this flag enables job verification. For hmac-sha256, the raw file content is used as the shared key",
@@ -800,9 +806,13 @@ var AgentStartCommand = cli.Command{
})

// Sense check supported tracing backends, we don't want bootstrapped jobs to silently have no tracing
if _, has := tracetools.ValidTracingBackends[cfg.TracingBackend]; !has {
_, tracingEnabled := tracetools.ValidTracingBackends[cfg.TracingBackend]
if !tracingEnabled {
l.Fatal("The given tracing backend %q is not supported. Valid backends are: %q", cfg.TracingBackend, maps.Keys(tracetools.ValidTracingBackends))
}
if !tracingEnabled && cfg.TraceLogGroups {
l.Fatal("Log group tracing cannot be enabled without a tracing backend.")
}

if experiments.IsEnabled(experiments.AgentAPI) {
shutdown := runAgentAPI(ctx, l, cfg.SocketsPath)
@@ -864,6 +874,7 @@ var AgentStartCommand = cli.Command{
AcquireJob: cfg.AcquireJob,
TracingBackend: cfg.TracingBackend,
TracingServiceName: cfg.TracingServiceName,
TraceLogGroups: cfg.TraceLogGroups,
JobVerificationNoSignatureBehavior: cfg.JobVerificationNoSignatureBehavior,
JobVerificationInvalidSignatureBehavior: cfg.JobVerificationInvalidSignatureBehavior,

7 changes: 7 additions & 0 deletions clicommand/bootstrap.go
@@ -94,6 +94,7 @@ type BootstrapConfig struct {
RedactedVars []string `cli:"redacted-vars" normalize:"list"`
TracingBackend string `cli:"tracing-backend"`
TracingServiceName string `cli:"tracing-service-name"`
TraceLogGroups bool `cli:"trace-log-groups"`
}

var BootstrapCommand = cli.Command{
@@ -355,6 +356,11 @@ var BootstrapCommand = cli.Command{
EnvVar: "BUILDKITE_TRACING_SERVICE_NAME",
Value: "buildkite-agent",
},
cli.BoolFlag{
Name: "trace-log-groups",
Usage: "Automatically creates tracing spans for log groups when tracing is enabled.",
EnvVar: "BUILDKITE_TRACE_LOG_GROUPS",
},
DebugFlag,
LogLevelFlag,
ExperimentsFlag,
@@ -439,6 +445,7 @@ var BootstrapCommand = cli.Command{
Tag: cfg.Tag,
TracingBackend: cfg.TracingBackend,
TracingServiceName: cfg.TracingServiceName,
TraceLogGroups: cfg.TraceLogGroups,
})

cctx, cancel := context.WithCancel(ctx)
3 changes: 3 additions & 0 deletions internal/job/config.go
@@ -164,6 +164,9 @@ type ExecutorConfig struct {

// Service name to use when reporting traces.
TracingServiceName string

// Whether to auto-create tracing spans for log groups.
TraceLogGroups bool
}

// ReadFromEnvironment reads configuration from the Environment, returns a map
1 change: 1 addition & 0 deletions internal/job/executor.go
@@ -82,6 +82,7 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {
e.shell.Debug = e.ExecutorConfig.Debug
e.shell.InterruptSignal = e.ExecutorConfig.CancelSignal
e.shell.SignalGracePeriod = e.ExecutorConfig.SignalGracePeriod
e.shell.TraceLogGroups = e.ExecutorConfig.TraceLogGroups
}
if experiments.IsEnabled(experiments.KubernetesExec) {
kubernetesClient := &kubernetes.Client{}
43 changes: 40 additions & 3 deletions internal/job/shell/shell.go
@@ -72,6 +72,9 @@ type Shell struct {

// Amount of time to wait between sending the InterruptSignal and SIGKILL
SignalGracePeriod time.Duration

// Whether to auto create tracing spans for log groups
TraceLogGroups bool
}

// New returns a new Shell
Expand Down Expand Up @@ -532,6 +535,32 @@ func round(d time.Duration) time.Duration {
}
}

// spanMakerWriter is an io.Writer shim that captures logs and automatically creates trace spans for the log group.
Contributor:

This file is getting pretty big. Can you put spanMakerWriter and its methods in a new file?

And I think its name should reflect that it creates spans from log groups. spansFromLogGroupWriter perhaps?

type spanMakerWriter struct {
w io.Writer
ctx context.Context
span opentracing.Span
}

func (s *spanMakerWriter) Write(p []byte) (n int, err error) {
if bytes.HasPrefix(p, []byte("~~~ ")) || bytes.HasPrefix(p, []byte("--- ")) || bytes.HasPrefix(p, []byte("+++ ")) {
s.FinishIfActive()
operationName, _, _ := strings.Cut(string(p[4:]), "\r\n")
// We don't store the returned context because we don't need to create child spans (yet). If we stored it, every
// log group would look like a child of the previous log group, whereas they're really more like siblings under the
// same parent span, since Buildkite itself has no concept of log group hierarchy.
s.span, _ = opentracing.StartSpanFromContext(s.ctx, operationName)
}
return s.w.Write(p)
}
Contributor:

So the complete line is not guaranteed to be contained in a single Write. The +++ (etc.) could occur in the middle of p, or, even worse, be split across consecutive calls to Write.

I think what we need here is a Writer with a state machine that captures what is input between (for example) +++ and the next \n, and creates a new span as appropriate. It will also need an upper bound on how much it captures: both to prevent an adversary from crafting an input that makes the agent run out of memory, and because the tracing backend will have a limit on how long a span's name can be.
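For illustration, a minimal sketch of that buffered approach (not part of this PR; the type, constant, and callback names are hypothetical, and it deliberately ignores the case where the four-byte marker itself is split across two Writes):

```go
package shell

import "bytes"

// maxOpName caps how much of a group header is captured, bounding memory use
// and keeping span names within typical backend limits.
const maxOpName = 256

// groupHeaderScanner is fed the raw output stream and calls onHeader once per
// complete "~~~ ", "--- " or "+++ " header line, even when the line arrives
// across several calls.
type groupHeaderScanner struct {
	inHeader bool
	buf      []byte
	onHeader func(name string) // e.g. finish the previous span, start a new one
}

func (g *groupHeaderScanner) Scan(p []byte) {
	markers := [][]byte{[]byte("~~~ "), []byte("--- "), []byte("+++ ")}
	for len(p) > 0 {
		if !g.inHeader {
			// Find the earliest marker in this chunk (a fuller version would
			// also check that the marker sits at the start of a line).
			start := -1
			for _, m := range markers {
				if i := bytes.Index(p, m); i >= 0 && (start < 0 || i < start) {
					start = i
				}
			}
			if start < 0 {
				return
			}
			g.inHeader, g.buf = true, g.buf[:0]
			p = p[start+4:] // skip past the marker itself
			continue
		}
		if nl := bytes.IndexByte(p, '\n'); nl >= 0 {
			g.buf = appendBounded(g.buf, p[:nl])
			g.onHeader(string(bytes.TrimRight(g.buf, "\r")))
			g.inHeader = false
			p = p[nl+1:]
			continue
		}
		// Header continues into the next Write; keep (a bounded amount of) it.
		g.buf = appendBounded(g.buf, p)
		return
	}
}

func appendBounded(dst, src []byte) []byte {
	if room := maxOpName - len(dst); len(src) > room {
		src = src[:room]
	}
	return append(dst, src...)
}
```

The existing Write method could then delegate to Scan before forwarding the bytes to the wrapped writer, keeping all span bookkeeping inside the onHeader callback.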


func (s *spanMakerWriter) FinishIfActive() {
if s.span != nil {
s.span.Finish()
Contributor Author:

One thing missing is being able to mark a span as an error. I think we'd need to pass in the return of p.WaitResult here to know the process' exit code.

s.span = nil
}
}
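A rough sketch of that idea (not part of this PR: the method name and the exit-code plumbing are hypothetical, and it assumes the caller can hand over the process exit code once the command has finished):

```go
// FinishWithExitCode finishes the active span, tagging it as an error when the
// wrapped process exited non-zero.
func (s *spanMakerWriter) FinishWithExitCode(exitCode int) {
	if s.span == nil {
		return
	}
	if exitCode != 0 {
		// Common OpenTracing convention for marking a span as failed.
		s.span.SetTag("error", true)
		s.span.SetTag("exit_code", exitCode)
	}
	s.span.Finish()
	s.span = nil
}
```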

func (s *Shell) executeCommand(
ctx context.Context,
cmd *command,
@@ -542,6 +571,14 @@ func (s *Shell) executeCommand(
tracedEnv := env.FromSlice(cmd.Env)
s.injectTraceCtx(ctx, tracedEnv)
cmd.Env = tracedEnv.ToSlice()
writer := w
writerCloser := func() {}
if s.TraceLogGroups {
logToSpanWriter := &spanMakerWriter{w: w, ctx: ctx, span: nil}
writer = logToSpanWriter
writerCloser = func() { logToSpanWriter.FinishIfActive() }
}
defer writerCloser()

s.cmdLock.Lock()
s.cmd = cmd
@@ -561,11 +598,11 @@
// Modify process config based on execution flags
if flags.PTY {
cfg.PTY = true
cfg.Stdout = w
cfg.Stdout = writer
} else {
// Show stdout if requested or via debug
if flags.Stdout {
cfg.Stdout = w
cfg.Stdout = writer
} else if s.Debug {
stdOutStreamer := NewLoggerStreamer(s.Logger)
defer stdOutStreamer.Close()
@@ -574,7 +611,7 @@

// Show stderr if requested or via debug
if flags.Stderr {
cfg.Stderr = w
cfg.Stderr = writer
} else if s.Debug {
stdErrStreamer := NewLoggerStreamer(s.Logger)
defer stdErrStreamer.Close()