Skip to content

Commit 123c8f0

Browse files
committed
chore: log provider stack traces on text file busy
1 parent 17f2584 commit 123c8f0

File tree

4 files changed

+233
-5
lines changed

4 files changed

+233
-5
lines changed

provisioner/terraform/executor.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,15 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver
200200
return version.NewVersion(vj.Version)
201201
}
202202

203+
// errTextFileBusy is returned by (*executor).init when the terraform init
// command fails with an exit error and its stderr output contains the string
// "text file busy".
type errTextFileBusy struct {
	// exitErr is the exit error reported for the failed init command.
	exitErr *exec.ExitError
	// stderr is a copy of what the init command wrote to stderr.
	stderr string
}

// Error implements the error interface. The message is the fixed prefix
// "text file busy: " followed by the process state of the exit error.
func (e *errTextFileBusy) Error() string {
	return "text file busy: " + e.exitErr.String()
}

// Unwrap exposes the wrapped exit error so that errors.Is / errors.As (and
// their xerrors equivalents) can still find the *exec.ExitError.
func (e *errTextFileBusy) Unwrap() error {
	if e.exitErr == nil {
		// Return an untyped nil so callers do not see a non-nil error
		// interface holding a nil pointer.
		return nil
	}
	return e.exitErr
}
211+
203212
func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
204213
ctx, span := e.server.startTrace(ctx, tracing.FuncName())
205214
defer span.End()
@@ -216,13 +225,24 @@ func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
216225
<-doneErr
217226
}()
218227

228+
// As a special case, we want to look for the error "text file busy" in the stderr output of
229+
// the init command, so we also take a copy of the stderr into an in memory buffer.
230+
errBuf := newBufferedWriteCloser(errWriter)
231+
219232
args := []string{
220233
"init",
221234
"-no-color",
222235
"-input=false",
223236
}
224237

225-
return e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errWriter)
238+
err := e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errBuf)
239+
var exitErr *exec.ExitError
240+
if xerrors.As(err, &exitErr) {
241+
if bytes.Contains(errBuf.b.Bytes(), []byte("text file busy")) {
242+
return &errTextFileBusy{exitErr: exitErr, stderr: errBuf.b.String()}
243+
}
244+
}
245+
return err
226246
}
227247

228248
func getPlanFilePath(workdir string) string {
@@ -707,3 +727,26 @@ func (sw syncWriter) Write(p []byte) (n int, err error) {
707727
defer sw.mut.Unlock()
708728
return sw.w.Write(p)
709729
}
730+
731+
// bufferedWriteCloser is an io.WriteCloser that keeps an in-memory copy of
// everything written to it while forwarding the same bytes to wc.
type bufferedWriteCloser struct {
	// wc receives every write and the Close call.
	wc io.WriteCloser
	// b accumulates a copy of all bytes passed to Write.
	b bytes.Buffer
}

// newBufferedWriteCloser returns a bufferedWriteCloser that forwards to wc and
// starts with an empty buffer.
func newBufferedWriteCloser(wc io.WriteCloser) *bufferedWriteCloser {
	tee := new(bufferedWriteCloser)
	tee.wc = wc
	return tee
}

// Write appends p to the in-memory buffer and then writes p to the wrapped
// writer. The returned count and error are those of the wrapped writer, unless
// buffering itself fails.
func (bwc *bufferedWriteCloser) Write(p []byte) (int, error) {
	if copied, bufErr := bwc.b.Write(p); bufErr != nil {
		return copied, bufErr
	}
	return bwc.wc.Write(p)
}

// Close closes the wrapped writer. The buffered copy is left untouched.
func (bwc *bufferedWriteCloser) Close() error {
	closer := bwc.wc
	return closer.Close()
}

provisioner/terraform/provision.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"io"
8+
"net"
9+
"net/http"
710
"os"
11+
"path/filepath"
812
"strings"
913
"time"
1014

@@ -109,10 +113,32 @@ func (s *server) Plan(
109113
initTimings.ingest(createInitTimingsEvent(timingInitStart))
110114

111115
err = e.init(ctx, killCtx, sess)
116+
112117
if err != nil {
113118
initTimings.ingest(createInitTimingsEvent(timingInitErrored))
114119

115120
s.logger.Debug(ctx, "init failed", slog.Error(err))
121+
122+
// Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726
123+
// We believe this might be due to some race condition that prevents the
124+
// terraform-provider-coder process from exiting. When terraform tries to install the
125+
// provider during this init, it copies over the local cache. Normally this isn't an issue,
126+
// but if the terraform-provider-coder process is still running from a previous build, Linux
127+
// returns "text file busy" error when attempting to open the file.
128+
//
129+
// Capturing the stack trace from the process should help us figure out why it has not
130+
// exited. We'll drop these diagnostics in a CRITICAL log so that operators are likely to
131+
// notice, and also because it indicates this provisioner could be permanently broken and
132+
// require a restart.
133+
var errTFB *errTextFileBusy
134+
if xerrors.As(err, &errTFB) {
135+
stacktrace := tryGettingCoderProviderStacktrace(sess)
136+
s.logger.Critical(ctx, "init: text file busy",
137+
slog.Error(errTFB),
138+
slog.F("stderr", errTFB.stderr),
139+
slog.F("provider_coder_stacktrace", stacktrace),
140+
)
141+
}
116142
return provisionersdk.PlanErrorf("initialize terraform: %s", err)
117143
}
118144

@@ -280,3 +306,33 @@ func logTerraformEnvVars(sink logSink) {
280306
}
281307
}
282308
}
309+
310+
// tryGettingCoderProviderStacktrace attempts to dial a special pprof endpoint we added to
311+
// terraform-provider-coder in https://github.com/coder/terraform-provider-coder/pull/295 which
312+
// shipped in v1.0.4. It will return the stacktraces of the provider, which will hopefully allow us
313+
// to figure out why it hasn't exited.
314+
func tryGettingCoderProviderStacktrace(sess *provisionersdk.Session) string {
315+
path := filepath.Clean(filepath.Join(sess.WorkDirectory, "../.coder/pprof"))
316+
sess.Logger.Info(sess.Context(), "attempting to get stack traces", slog.F("path", path))
317+
c := http.Client{
318+
Transport: &http.Transport{
319+
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
320+
d := net.Dialer{}
321+
return d.DialContext(ctx, "unix", path)
322+
},
323+
},
324+
}
325+
resp, err := c.Get("http://localhost/debug/pprof/goroutine?debug=2")
326+
if err != nil {
327+
// Only log at Info here, since we only added the pprof endpoint to terraform-provider-coder
328+
// in v1.0.4
329+
sess.Logger.Info(sess.Context(), "could not GET stack traces", slog.Error(err))
330+
return ""
331+
}
332+
defer resp.Body.Close()
333+
stacktraces, err := io.ReadAll(resp.Body)
334+
if err != nil {
335+
sess.Logger.Error(sess.Context(), "could not read stack traces", slog.Error(err))
336+
}
337+
return string(stacktraces)
338+
}

provisioner/terraform/provision_test.go

Lines changed: 91 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"encoding/json"
1010
"errors"
1111
"fmt"
12+
"net"
13+
"net/http"
1214
"os"
1315
"path/filepath"
1416
"runtime"
@@ -26,19 +28,28 @@ import (
2628
"github.com/coder/coder/v2/provisioner/terraform"
2729
"github.com/coder/coder/v2/provisionersdk"
2830
"github.com/coder/coder/v2/provisionersdk/proto"
31+
"github.com/coder/coder/v2/testutil"
2932
)
3033

3134
type provisionerServeOptions struct {
3235
binaryPath string
3336
exitTimeout time.Duration
37+
workDir string
38+
logger *slog.Logger
3439
}
3540

3641
func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
3742
if opts == nil {
3843
opts = &provisionerServeOptions{}
3944
}
4045
cachePath := t.TempDir()
41-
workDir := t.TempDir()
46+
if opts.workDir == "" {
47+
opts.workDir = t.TempDir()
48+
}
49+
if opts.logger == nil {
50+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
51+
opts.logger = &logger
52+
}
4253
client, server := drpc.MemTransportPipe()
4354
ctx, cancelFunc := context.WithCancel(context.Background())
4455
serverErr := make(chan error, 1)
@@ -55,8 +66,8 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
5566
serverErr <- terraform.Serve(ctx, &terraform.ServeOptions{
5667
ServeOptions: &provisionersdk.ServeOptions{
5768
Listener: server,
58-
Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug),
59-
WorkDirectory: workDir,
69+
Logger: *opts.logger,
70+
WorkDirectory: opts.workDir,
6071
},
6172
BinaryPath: opts.binaryPath,
6273
CachePath: cachePath,
@@ -236,7 +247,7 @@ func TestProvision_CancelTimeout(t *testing.T) {
236247
dir := t.TempDir()
237248
binPath := filepath.Join(dir, "terraform")
238249

239-
// Example: exec /path/to/terrafork_fake_cancel.sh 1.2.1 apply "$@"
250+
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
240251
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
241252
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
242253
require.NoError(t, err)
@@ -282,6 +293,82 @@ func TestProvision_CancelTimeout(t *testing.T) {
282293
}
283294
}
284295

296+
// below we exec fake_text_file_busy.sh, which causes the kernel to execute it, and if more than
297+
// one process tries to do this, it can cause "text file busy" to be returned to us. In this test
298+
// we want to simulate "text file busy" getting logged by terraform, due to an issue with the
299+
// terraform-provider-coder
300+
// nolint: paralleltest
301+
func TestProvision_TextFileBusy(t *testing.T) {
302+
if runtime.GOOS == "windows" {
303+
t.Skip("This test uses unix sockets and is not supported on Windows")
304+
}
305+
ctx := testutil.Context(t, testutil.WaitShort)
306+
307+
cwd, err := os.Getwd()
308+
require.NoError(t, err)
309+
fakeBin := filepath.Join(cwd, "testdata", "fake_text_file_busy.sh")
310+
311+
dir := t.TempDir()
312+
binPath := filepath.Join(dir, "terraform")
313+
314+
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
315+
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
316+
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
317+
require.NoError(t, err)
318+
319+
workDir := t.TempDir()
320+
321+
err = os.Mkdir(filepath.Join(workDir, ".coder"), 0o700)
322+
require.NoError(t, err)
323+
l, err := net.Listen("unix", filepath.Join(workDir, ".coder", "pprof"))
324+
require.NoError(t, err)
325+
defer l.Close()
326+
handlerCalled := 0
327+
srv := &http.Server{
328+
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
329+
assert.Equal(t, "/debug/pprof/goroutine", r.URL.Path)
330+
w.WriteHeader(http.StatusOK)
331+
_, err := w.Write([]byte("thestacks\n"))
332+
assert.NoError(t, err)
333+
handlerCalled++
334+
return
335+
}),
336+
}
337+
srvErr := make(chan error, 1)
338+
go func() {
339+
srvErr <- srv.Serve(l)
340+
}()
341+
342+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
343+
ctx, api := setupProvisioner(t, &provisionerServeOptions{
344+
binaryPath: binPath,
345+
exitTimeout: time.Second,
346+
workDir: workDir,
347+
logger: &logger,
348+
})
349+
350+
sess := configure(ctx, t, api, &proto.Config{
351+
TemplateSourceArchive: makeTar(t, nil),
352+
})
353+
354+
err = sendPlan(sess, proto.WorkspaceTransition_START)
355+
require.NoError(t, err)
356+
357+
found := false
358+
for {
359+
msg, err := sess.Recv()
360+
require.NoError(t, err)
361+
362+
if c := msg.GetPlan(); c != nil {
363+
require.Contains(t, c.Error, "exit status 1")
364+
found = true
365+
break
366+
}
367+
}
368+
require.True(t, found)
369+
require.EqualValues(t, 1, handlerCalled)
370+
}
371+
285372
func TestProvision(t *testing.T) {
286373
t.Parallel()
287374

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/sh
# Fake terraform executable. The first argument is the version to report; the
# remaining arguments are the terraform subcommand and its flags.
#   version -> prints a version JSON document and exits 0
#   init    -> prints a "text file busy" provider install error on stderr, exits 1
#   plan    -> unsupported, exits 1
#   apply   -> unsupported, exits 1
# Any other subcommand exits 10.

VERSION=$1
shift 1

# Prints its arguments as a single JSON log line at level "error".
json_print() {
	echo "{\"@level\":\"error\",\"@message\":\"$*\"}"
}

# Prints the version document on stdout.
print_version() {
cat <<EOF
{
"terraform_version": "${VERSION}",
"platform": "linux_amd64",
"provider_selections": {},
"terraform_outdated": false
}
EOF
}

# Prints "init" on stdout followed by the provider install failure on stderr.
print_init_failure() {
	echo "init"
	{
		echo "Error: Failed to install provider"
		echo " Error while installing coder/coder v1.0.4: open"
		echo " /home/coder/.cache/coder/provisioner-0/tf/registry.terraform.io/coder/coder/1.0.3/linux_amd64/terraform-provider-coder_v1.0.4:"
		echo " text file busy"
	} >&2
}

case "$1" in
version)
	print_version
	exit 0
	;;
init)
	print_init_failure
	exit 1
	;;
plan | apply)
	echo "$1 not supported"
	exit 1
	;;
esac

exit 10
41+
42+

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy