Skip to content

Commit d676ad5

Browse files
authored
chore: log provider stack traces on text file busy (#15078)
re: #14726 If we see "text file busy" in the errors while initializing terraform, attempt to query the pprof endpoint set up by coder/terraform-provider-coder#295 and log at CRITICAL. --------- Signed-off-by: Spike Curtis <spike@coder.com>
1 parent a77c907 commit d676ad5

File tree

4 files changed

+235
-5
lines changed

4 files changed

+235
-5
lines changed

provisioner/terraform/executor.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,15 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver
200200
return version.NewVersion(vj.Version)
201201
}
202202

203+
type textFileBusyError struct {
204+
exitErr *exec.ExitError
205+
stderr string
206+
}
207+
208+
func (e *textFileBusyError) Error() string {
209+
return "text file busy: " + e.exitErr.String()
210+
}
211+
203212
func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
204213
ctx, span := e.server.startTrace(ctx, tracing.FuncName())
205214
defer span.End()
@@ -216,13 +225,24 @@ func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
216225
<-doneErr
217226
}()
218227

228+
// As a special case, we want to look for the error "text file busy" in the stderr output of
229+
// the init command, so we also take a copy of the stderr into an in memory buffer.
230+
errBuf := newBufferedWriteCloser(errWriter)
231+
219232
args := []string{
220233
"init",
221234
"-no-color",
222235
"-input=false",
223236
}
224237

225-
return e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errWriter)
238+
err := e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errBuf)
239+
var exitErr *exec.ExitError
240+
if xerrors.As(err, &exitErr) {
241+
if bytes.Contains(errBuf.b.Bytes(), []byte("text file busy")) {
242+
return &textFileBusyError{exitErr: exitErr, stderr: errBuf.b.String()}
243+
}
244+
}
245+
return err
226246
}
227247

228248
func getPlanFilePath(workdir string) string {
@@ -707,3 +727,26 @@ func (sw syncWriter) Write(p []byte) (n int, err error) {
707727
defer sw.mut.Unlock()
708728
return sw.w.Write(p)
709729
}
730+
731+
type bufferedWriteCloser struct {
732+
wc io.WriteCloser
733+
b bytes.Buffer
734+
}
735+
736+
func newBufferedWriteCloser(wc io.WriteCloser) *bufferedWriteCloser {
737+
return &bufferedWriteCloser{
738+
wc: wc,
739+
}
740+
}
741+
742+
func (b *bufferedWriteCloser) Write(p []byte) (int, error) {
743+
n, err := b.b.Write(p)
744+
if err != nil {
745+
return n, err
746+
}
747+
return b.wc.Write(p)
748+
}
749+
750+
func (b *bufferedWriteCloser) Close() error {
751+
return b.wc.Close()
752+
}

provisioner/terraform/provision.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"io"
8+
"net"
9+
"net/http"
710
"os"
11+
"path/filepath"
812
"strings"
913
"time"
1014

@@ -109,10 +113,32 @@ func (s *server) Plan(
109113
initTimings.ingest(createInitTimingsEvent(timingInitStart))
110114

111115
err = e.init(ctx, killCtx, sess)
116+
112117
if err != nil {
113118
initTimings.ingest(createInitTimingsEvent(timingInitErrored))
114119

115120
s.logger.Debug(ctx, "init failed", slog.Error(err))
121+
122+
// Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726
123+
// We believe this might be due to some race condition that prevents the
124+
// terraform-provider-coder process from exiting. When terraform tries to install the
125+
// provider during this init, it copies over the local cache. Normally this isn't an issue,
126+
// but if the terraform-provider-coder process is still running from a previous build, Linux
127+
// returns "text file busy" error when attempting to open the file.
128+
//
129+
// Capturing the stack trace from the process should help us figure out why it has not
130+
// exited. We'll drop these diagnostics in a CRITICAL log so that operators are likely to
131+
// notice, and also because it indicates this provisioner could be permanently broken and
132+
// require a restart.
133+
var errTFB *textFileBusyError
134+
if xerrors.As(err, &errTFB) {
135+
stacktrace := tryGettingCoderProviderStacktrace(sess)
136+
s.logger.Critical(ctx, "init: text file busy",
137+
slog.Error(errTFB),
138+
slog.F("stderr", errTFB.stderr),
139+
slog.F("provider_coder_stacktrace", stacktrace),
140+
)
141+
}
116142
return provisionersdk.PlanErrorf("initialize terraform: %s", err)
117143
}
118144

@@ -280,3 +306,39 @@ func logTerraformEnvVars(sink logSink) {
280306
}
281307
}
282308
}
309+
310+
// tryGettingCoderProviderStacktrace attempts to dial a special pprof endpoint we added to
311+
// terraform-provider-coder in https://github.com/coder/terraform-provider-coder/pull/295 which
312+
// shipped in v1.0.4. It will return the stacktraces of the provider, which will hopefully allow us
313+
// to figure out why it hasn't exited.
314+
func tryGettingCoderProviderStacktrace(sess *provisionersdk.Session) string {
315+
path := filepath.Clean(filepath.Join(sess.WorkDirectory, "../.coder/pprof"))
316+
sess.Logger.Info(sess.Context(), "attempting to get stack traces", slog.F("path", path))
317+
c := http.Client{
318+
Transport: &http.Transport{
319+
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
320+
d := net.Dialer{}
321+
return d.DialContext(ctx, "unix", path)
322+
},
323+
},
324+
}
325+
req, err := http.NewRequestWithContext(sess.Context(), http.MethodGet,
326+
"http://localhost/debug/pprof/goroutine?debug=2", nil)
327+
if err != nil {
328+
sess.Logger.Error(sess.Context(), "error creating GET request", slog.Error(err))
329+
return ""
330+
}
331+
resp, err := c.Do(req)
332+
if err != nil {
333+
// Only log at Info here, since we only added the pprof endpoint to terraform-provider-coder
334+
// in v1.0.4
335+
sess.Logger.Info(sess.Context(), "could not GET stack traces", slog.Error(err))
336+
return ""
337+
}
338+
defer resp.Body.Close()
339+
stacktraces, err := io.ReadAll(resp.Body)
340+
if err != nil {
341+
sess.Logger.Error(sess.Context(), "could not read stack traces", slog.Error(err))
342+
}
343+
return string(stacktraces)
344+
}

provisioner/terraform/provision_test.go

Lines changed: 89 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"encoding/json"
1010
"errors"
1111
"fmt"
12+
"net"
13+
"net/http"
1214
"os"
1315
"path/filepath"
1416
"runtime"
@@ -31,14 +33,22 @@ import (
3133
type provisionerServeOptions struct {
3234
binaryPath string
3335
exitTimeout time.Duration
36+
workDir string
37+
logger *slog.Logger
3438
}
3539

3640
func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
3741
if opts == nil {
3842
opts = &provisionerServeOptions{}
3943
}
4044
cachePath := t.TempDir()
41-
workDir := t.TempDir()
45+
if opts.workDir == "" {
46+
opts.workDir = t.TempDir()
47+
}
48+
if opts.logger == nil {
49+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
50+
opts.logger = &logger
51+
}
4252
client, server := drpc.MemTransportPipe()
4353
ctx, cancelFunc := context.WithCancel(context.Background())
4454
serverErr := make(chan error, 1)
@@ -55,8 +65,8 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
5565
serverErr <- terraform.Serve(ctx, &terraform.ServeOptions{
5666
ServeOptions: &provisionersdk.ServeOptions{
5767
Listener: server,
58-
Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug),
59-
WorkDirectory: workDir,
68+
Logger: *opts.logger,
69+
WorkDirectory: opts.workDir,
6070
},
6171
BinaryPath: opts.binaryPath,
6272
CachePath: cachePath,
@@ -236,7 +246,7 @@ func TestProvision_CancelTimeout(t *testing.T) {
236246
dir := t.TempDir()
237247
binPath := filepath.Join(dir, "terraform")
238248

239-
// Example: exec /path/to/terrafork_fake_cancel.sh 1.2.1 apply "$@"
249+
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
240250
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
241251
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
242252
require.NoError(t, err)
@@ -282,6 +292,81 @@ func TestProvision_CancelTimeout(t *testing.T) {
282292
}
283293
}
284294

295+
// below we exec fake_text_file_busy.sh, which causes the kernel to execute it, and if more than
296+
// one process tries to do this, it can cause "text file busy" to be returned to us. In this test
297+
// we want to simulate "text file busy" getting logged by terraform, due to an issue with the
298+
// terraform-provider-coder
299+
// nolint: paralleltest
300+
func TestProvision_TextFileBusy(t *testing.T) {
301+
if runtime.GOOS == "windows" {
302+
t.Skip("This test uses unix sockets and is not supported on Windows")
303+
}
304+
305+
cwd, err := os.Getwd()
306+
require.NoError(t, err)
307+
fakeBin := filepath.Join(cwd, "testdata", "fake_text_file_busy.sh")
308+
309+
dir := t.TempDir()
310+
binPath := filepath.Join(dir, "terraform")
311+
312+
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
313+
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
314+
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
315+
require.NoError(t, err)
316+
317+
workDir := t.TempDir()
318+
319+
err = os.Mkdir(filepath.Join(workDir, ".coder"), 0o700)
320+
require.NoError(t, err)
321+
l, err := net.Listen("unix", filepath.Join(workDir, ".coder", "pprof"))
322+
require.NoError(t, err)
323+
defer l.Close()
324+
handlerCalled := 0
325+
// nolint: gosec
326+
srv := &http.Server{
327+
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
328+
assert.Equal(t, "/debug/pprof/goroutine", r.URL.Path)
329+
w.WriteHeader(http.StatusOK)
330+
_, err := w.Write([]byte("thestacks\n"))
331+
assert.NoError(t, err)
332+
handlerCalled++
333+
}),
334+
}
335+
srvErr := make(chan error, 1)
336+
go func() {
337+
srvErr <- srv.Serve(l)
338+
}()
339+
340+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
341+
ctx, api := setupProvisioner(t, &provisionerServeOptions{
342+
binaryPath: binPath,
343+
exitTimeout: time.Second,
344+
workDir: workDir,
345+
logger: &logger,
346+
})
347+
348+
sess := configure(ctx, t, api, &proto.Config{
349+
TemplateSourceArchive: makeTar(t, nil),
350+
})
351+
352+
err = sendPlan(sess, proto.WorkspaceTransition_START)
353+
require.NoError(t, err)
354+
355+
found := false
356+
for {
357+
msg, err := sess.Recv()
358+
require.NoError(t, err)
359+
360+
if c := msg.GetPlan(); c != nil {
361+
require.Contains(t, c.Error, "exit status 1")
362+
found = true
363+
break
364+
}
365+
}
366+
require.True(t, found)
367+
require.EqualValues(t, 1, handlerCalled)
368+
}
369+
285370
func TestProvision(t *testing.T) {
286371
t.Parallel()
287372

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/sh
2+
3+
VERSION=$1
4+
shift 1
5+
6+
json_print() {
7+
echo "{\"@level\":\"error\",\"@message\":\"$*\"}"
8+
}
9+
10+
case "$1" in
11+
version)
12+
cat <<-EOF
13+
{
14+
"terraform_version": "${VERSION}",
15+
"platform": "linux_amd64",
16+
"provider_selections": {},
17+
"terraform_outdated": false
18+
}
19+
EOF
20+
exit 0
21+
;;
22+
init)
23+
echo "init"
24+
echo >&2 "Error: Failed to install provider"
25+
echo >&2 " Error while installing coder/coder v1.0.4: open"
26+
echo >&2 " /home/coder/.cache/coder/provisioner-0/tf/registry.terraform.io/coder/coder/1.0.3/linux_amd64/terraform-provider-coder_v1.0.4:"
27+
echo >&2 " text file busy"
28+
exit 1
29+
;;
30+
plan)
31+
echo "plan not supported"
32+
exit 1
33+
;;
34+
apply)
35+
echo "apply not supported"
36+
exit 1
37+
;;
38+
esac
39+
40+
exit 10

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy