[receiver/journald] Restart journalctl if it exits unexpectedly (open…

…-telemetry#35635)  #### Description According to the community, there are bugs in systemd that could corrupt the journal files or crash the log receiver: systemd/systemd#24320 systemd/systemd#24150 We've seen some issues reported to Elastic/beats project: elastic/beats#39352 elastic/beats#32782 elastic/beats#34077 Unfortunately, the otelcol is not immune from these issues. When the journalctl process exits for any reason, the log consumption from journald just stops. We've experienced this on some machines that have high log volume. Currently we monitors the journalctl processes started by otelcol, and restart the otelcol when some of them is missing. IMO, The journald receiver itself should monitor the journalctl process it starts, and does its best to keep it alive. In this PR, we try to restart the journalctl process when it exits unexpectedly. As long as the journalctl cmd can be started (via `Cmd.Start()`) successfully, the journald_input will always try to restart the journalctl process if it exits. The error reporting behaviour changes a bit in this PR. Before the PR, the `operator.Start` waits up to 1 sec to capture any immediate error returned from journalctl. After the PR, the error won't be reported back even if the journalctl exits immediately after start, instead, the error will be logged, and the process will be restarted. The fix is largely inspired by elastic/beats#40558.  #### Testing Add a simple bash script that print a line every second, and load it to systemd. `log_every_second.sh`: ```bash #!/bin/bash while true; do echo "Log message: $(date)" sleep 1 done ``` `log.service`: ``` [Unit] Description=Print logs to journald every second After=network.target [Service] ExecStart=/usr/local/bin/log_every_second.sh Restart=always StandardOutput=journal StandardError=journal [Install] WantedBy=multi-user.target ``` Start the otelcol with the following config: ```yaml service: telemetry: logs: level: debug pipelines: logs: receivers: [journald] processors: [] exporters: [debug] receivers: journald: exporters: debug: verbosity: basic sampling_initial: 1 sampling_thereafter: 1 ``` Kill the journalctl process and observe the otelcol's behaviour. The journactl process will be restarted after the backoff period (hardcoded to 2 sec): ```bash 2024-10-06T14:32:33.755Z info LogsExporter {"kind": "exporter", "data_type": "logs", "name": "debug", "resource logs": 1, "log records": 1} 2024-10-06T14:32:34.709Z error journald/input.go:98 journalctl command exited {"kind": "receiver", "name": "journald", "data_type": "logs", "operator_id": "journald_input", "operator_type": "journald_input", "error": "signal: terminated"} github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator/input/journald.(*Input).run github.com/open-telemetry/opentelemetry-collector-contrib/pkg/[email protected]/operator/input/journald/input.go:98 2024-10-06T14:32:36.712Z debug journald/input.go:94 Starting the journalctl command {"kind": "receiver", "name": "journald", "data_type": "logs", "operator_id": "journald_input", "operator_type": "journald_input"} 2024-10-06T14:32:36.756Z info LogsExporter {"kind": "exporter", "data_type": "logs", "name": "debug", "resource logs": 1, "log records": 10} ```  --------- Signed-off-by: Mengnan Gong <[email protected]>
jpbarto · Oct 29, 2024 · 3cdf8fe · 3cdf8fe
1 parent db0f58d
commit 3cdf8fe
Show file tree

Hide file tree

Showing 4 changed files with 137 additions and 80 deletions.
diff --git a/.chloggen/restart-journalctl.yaml b/.chloggen/restart-journalctl.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: journaldreceiver
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: "Restart journalctl if it exits unexpectedly"
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [35635]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: []
diff --git a/pkg/stanza/operator/input/journald/config_linux.go b/pkg/stanza/operator/input/journald/config_linux.go
@@ -40,10 +40,12 @@ func (c Config) Build(set component.TelemetrySettings) (operator.Operator, error
 	return &Input{
 		InputOperator: inputOperator,
 		newCmd: func(ctx context.Context, cursor []byte) cmd {
+			// Copy args and if needed, add the cursor flag
+			journalArgs := append([]string{}, args...)
 			if cursor != nil {
-				args = append(args, "--after-cursor", string(cursor))
+				journalArgs = append(journalArgs, "--after-cursor", string(cursor))
 			}
-			return exec.CommandContext(ctx, "journalctl", args...) // #nosec - ...
+			return exec.CommandContext(ctx, "journalctl", journalArgs...) // #nosec - ...
 			// journalctl is an executable that is required for this operator to function
 		},
 		json: jsoniter.ConfigFastest,

diff --git a/pkg/stanza/operator/input/journald/input.go b/pkg/stanza/operator/input/journald/input.go
@@ -13,7 +13,6 @@ import (
 	"io"
 	"os/exec"
 	"strconv"
-	"strings"
 	"sync"
 	"time"
 
@@ -35,6 +34,7 @@ type Input struct {
 	json      jsoniter.API
 	cancel    context.CancelFunc
 	wg        sync.WaitGroup
+	errChan   chan error
 }
 
 type cmd interface {
@@ -44,9 +44,10 @@ type cmd interface {
 	Wait() error
 }
 
-type failedCommand struct {
-	err    string
-	output string
+type journalctl struct {
+	cmd    cmd
+	stdout io.ReadCloser
+	stderr io.ReadCloser
 }
 
 var lastReadCursorKey = "lastReadCursor"
@@ -56,85 +57,123 @@ func (operator *Input) Start(persister operator.Persister) error {
 	ctx, cancel := context.WithCancel(context.Background())
 	operator.cancel = cancel
 
+	operator.persister = persister
+	operator.errChan = make(chan error)
+
+	go operator.run(ctx)
+
+	select {
+	case err := <-operator.errChan:
+		return fmt.Errorf("journalctl command failed: %w", err)
+	case <-time.After(waitDuration):
+		return nil
+	}
+}
+
+// run starts the journalctl process and monitor it.
+// If there is an error in operator.newJournalctl, the error will be sent to operator.errChan.
+// If the journalctl process started successfully, but there is an error in operator.runJournalctl,
+// The error will be logged and the journalctl process will be restarted.
+func (operator *Input) run(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		default:
+			jctl, err := operator.newJournalctl(ctx)
+			// If we can't start journalctl, there is nothing we can do but logging the error and return.
+			if err != nil {
+				select {
+				case operator.errChan <- err:
+				case <-time.After(waitDuration):
+					operator.Logger().Error("Failed to init and start journalctl", zap.Error(err))
+				}
+				return
+			}
+
+			operator.Logger().Debug("Starting the journalctl command")
+			if err := operator.runJournalctl(ctx, jctl); err != nil {
+				ee := &exec.ExitError{}
+				if ok := errors.As(err, &ee); ok && ee.ExitCode() != 0 {
+					operator.Logger().Error("journalctl command exited", zap.Error(ee))
+				} else {
+					operator.Logger().Info("journalctl command exited")
+				}
+			}
+			// Backoff before restart.
+			select {
+			case <-ctx.Done():
+				return
+			case <-time.After(2 * time.Second):
+			}
+		}
+	}
+}
+
+// newJournalctl creates a new journalctl command.
+func (operator *Input) newJournalctl(ctx context.Context) (*journalctl, error) {
 	// Start from a cursor if there is a saved offset
-	cursor, err := persister.Get(ctx, lastReadCursorKey)
+	cursor, err := operator.persister.Get(ctx, lastReadCursorKey)
 	if err != nil {
-		return fmt.Errorf("failed to get journalctl state: %w", err)
+		return nil, fmt.Errorf("failed to get journalctl state: %w", err)
 	}
 
-	operator.persister = persister
-
-	// Start journalctl
 	journal := operator.newCmd(ctx, cursor)
-	stdout, err := journal.StdoutPipe()
-	if err != nil {
-		return fmt.Errorf("failed to get journalctl stdout: %w", err)
+	jctl := &journalctl{
+		cmd: journal,
 	}
-	stderr, err := journal.StderrPipe()
+
+	jctl.stdout, err = journal.StdoutPipe()
 	if err != nil {
-		return fmt.Errorf("failed to get journalctl stderr: %w", err)
+		return nil, fmt.Errorf("failed to get journalctl stdout: %w", err)
 	}
-	err = journal.Start()
+	jctl.stderr, err = journal.StderrPipe()
 	if err != nil {
-		return fmt.Errorf("start journalctl: %w", err)
+		return nil, fmt.Errorf("failed to get journalctl stderr: %w", err)
 	}
 
-	stderrChan := make(chan string)
-	failedChan := make(chan failedCommand)
-
-	// Start the wait goroutine
-	operator.wg.Add(1)
-	go func() {
-		defer operator.wg.Done()
-		err := journal.Wait()
-		message := <-stderrChan
-
-		f := failedCommand{
-			output: message,
-		}
-
-		if err != nil {
-			ee := (&exec.ExitError{})
-			if ok := errors.As(err, &ee); ok && ee.ExitCode() != 0 {
-				f.err = ee.Error()
-			}
-		}
+	if err = journal.Start(); err != nil {
+		return nil, fmt.Errorf("start journalctl: %w", err)
+	}
 
-		select {
-		case failedChan <- f:
-		// log an error in case channel is closed
-		case <-time.After(waitDuration):
-			operator.Logger().Error("journalctl command exited", zap.String("error", f.err), zap.String("output", f.output))
-		}
-	}()
+	return jctl, nil
+}
 
-	// Start the stderr reader goroutine
+// runJournalctl runs the journalctl command. This is a blocking call that returns
+// when the command exits.
+func (operator *Input) runJournalctl(ctx context.Context, jctl *journalctl) error {
+	// Start the stderr reader goroutine.
+	// This goroutine reads the stderr from the journalctl process. If the
+	// process exits for any reason, then the stderr will be closed, this
+	// goroutine will get an EOF error and exit.
 	operator.wg.Add(1)
 	go func() {
 		defer operator.wg.Done()
 
-		stderrBuf := bufio.NewReader(stderr)
-		messages := []string{}
+		stderrBuf := bufio.NewReader(jctl.stderr)
 
 		for {
 			line, err := stderrBuf.ReadBytes('\n')
 			if err != nil {
 				if !errors.Is(err, io.EOF) {
 					operator.Logger().Error("Received error reading from journalctl stderr", zap.Error(err))
 				}
-				stderrChan <- strings.Join(messages, "\n")
 				return
 			}
-			messages = append(messages, string(line))
+			operator.Logger().Error("Received from journalctl stderr", zap.ByteString("stderr", line))
 		}
 	}()
 
-	// Start the reader goroutine
+	// Start the reader goroutine.
+	// This goroutine reads the stdout from the journalctl process, parses
+	// the data, and writes to output. If the journalctl process exits for
+	// any reason, then the stdout will be closed, this goroutine will get
+	// an EOF error and exits.
 	operator.wg.Add(1)
 	go func() {
 		defer operator.wg.Done()
 
-		stdoutBuf := bufio.NewReader(stdout)
+		stdoutBuf := bufio.NewReader(jctl.stdout)
 
 		for {
 			line, err := stdoutBuf.ReadBytes('\n')
@@ -159,16 +198,11 @@ func (operator *Input) Start(persister operator.Persister) error {
 		}
 	}()
 
-	// Wait waitDuration for eventual error
-	select {
-	case err := <-failedChan:
-		if err.err == "" {
-			return fmt.Errorf("journalctl command exited")
-		}
-		return fmt.Errorf("journalctl command failed (%v): %v", err.err, err.output)
-	case <-time.After(waitDuration):
-		return nil
-	}
+	// we wait for the reader goroutines to exit before calling Cmd.Wait().
+	// As per documentation states, "It is thus incorrect to call Wait before all reads from the pipe have completed".
+	operator.wg.Wait()
+
+	return jctl.cmd.Wait()
 }
 
 func (operator *Input) parseJournalEntry(line []byte) (*entry.Entry, string, error) {
@@ -219,6 +253,5 @@ func (operator *Input) Stop() error {
 	if operator.cancel != nil {
 		operator.cancel()
 	}
-	operator.wg.Wait()
 	return nil
 }
diff --git a/pkg/stanza/operator/input/journald/input_test.go b/pkg/stanza/operator/input/journald/input_test.go
@@ -8,6 +8,7 @@ package journald
 import (
 	"bytes"
 	"context"
+	"errors"
 	"io"
 	"os/exec"
 	"testing"
@@ -17,19 +18,21 @@ import (
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 	"go.opentelemetry.io/collector/component/componenttest"
+	"go.uber.org/zap"
 
 	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/entry"
 	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator"
 	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/testutil"
 )
 
 type fakeJournaldCmd struct {
-	exitError *exec.ExitError
-	stdErr    string
+	startError error
+	exitError  *exec.ExitError
+	stdErr     string
 }
 
 func (f *fakeJournaldCmd) Start() error {
-	return nil
+	return f.startError
 }
 
 func (f *fakeJournaldCmd) StdoutPipe() (io.ReadCloser, error) {
@@ -73,8 +76,7 @@ func TestInputJournald(t *testing.T) {
 		return &fakeJournaldCmd{}
 	}
 
-	err = op.Start(testutil.NewUnscopedMockPersister())
-	assert.EqualError(t, err, "journalctl command exited")
+	require.NoError(t, op.Start(testutil.NewUnscopedMockPersister()))
 	defer func() {
 		require.NoError(t, op.Stop())
 	}()
@@ -236,6 +238,7 @@ func TestInputJournaldError(t *testing.T) {
 	cfg.OutputIDs = []string{"output"}
 
 	set := componenttest.NewNopTelemetrySettings()
+	set.Logger, _ = zap.NewDevelopment()
 	op, err := cfg.Build(set)
 	require.NoError(t, err)
 
@@ -250,20 +253,12 @@ func TestInputJournaldError(t *testing.T) {
 
 	op.(*Input).newCmd = func(_ context.Context, _ []byte) cmd {
 		return &fakeJournaldCmd{
-			exitError: &exec.ExitError{},
-			stdErr:    "stderr output\n",
+			exitError:  &exec.ExitError{},
+			startError: errors.New("fail to start"),
 		}
 	}
 
 	err = op.Start(testutil.NewUnscopedMockPersister())
-	assert.EqualError(t, err, "journalctl command failed (<nil>): stderr output\n")
-	defer func() {
-		require.NoError(t, op.Stop())
-	}()
-
-	select {
-	case <-received:
-	case <-time.After(time.Second):
-		require.FailNow(t, "Timed out waiting for entry to be read")
-	}
+	assert.EqualError(t, err, "journalctl command failed: start journalctl: fail to start")
+	require.NoError(t, op.Stop())
 }