zoekt-archive-index: support gzip via content detection

Currently we only check the Content-Type header, and only when fetching an
archive via HTTP. This change instead detects the content type by sniffing
the first bytes of the archive itself. The benefit is that gzip-compressed
archives can now also be read from disk or stdin.

Change-Id: I5579da64107ce51ebc3a52c2e28ad3690d3567c1
diff --git a/cmd/zoekt-archive-index/archive.go b/cmd/zoekt-archive-index/archive.go
index 1230f8d..60e1c3c 100644
--- a/cmd/zoekt-archive-index/archive.go
+++ b/cmd/zoekt-archive-index/archive.go
@@ -2,6 +2,7 @@
 
 import (
 	"archive/tar"
+	"bytes"
 	"compress/gzip"
 	"fmt"
 	"io"
@@ -55,6 +56,19 @@
 	return nil
 }
 
+func detectContentType(r io.Reader) (string, io.Reader, error) {
+	var buf [512]byte // http.DetectContentType looks at no more than the first 512 bytes
+	n, err := io.ReadFull(r, buf[:])
+	if err != nil && err != io.ErrUnexpectedEOF { // a short, non-empty read is fine; io.EOF (empty input) is still an error
+		return "", nil, err
+	}
+
+	ct := http.DetectContentType(buf[:n])
+
+	// Return a new reader which replays the sniffed bytes before the rest of r.
+	return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil
+}
+
 // openArchive opens the tar at the URL or filepath u. Also supported is tgz
 // files over http.
 func openArchive(u string) (Archive, error) {
@@ -82,13 +96,6 @@
 		}
 		closer = resp.Body
 		r = resp.Body
-		if resp.Header.Get("Content-Type") == "application/x-gzip" {
-			r, err = gzip.NewReader(r)
-			if err != nil {
-				resp.Body.Close()
-				return nil, err
-			}
-		}
 	} else if u == "-" {
 		r = os.Stdin
 	} else {
@@ -100,6 +107,20 @@
 		r = f
 	}
 
+	ct, r, err := detectContentType(r)
+	if err != nil {
+		return nil, err
+	}
+	if ct == "application/x-gzip" {
+		r, err = gzip.NewReader(r)
+		if err != nil {
+			if closer != nil {
+				_ = closer.Close()
+			}
+			return nil, err
+		}
+	}
+
 	return &tarArchive{
 		tr:     tar.NewReader(r),
 		closer: closer,
diff --git a/cmd/zoekt-archive-index/e2e_test.go b/cmd/zoekt-archive-index/e2e_test.go
index e883f48..16705ea 100644
--- a/cmd/zoekt-archive-index/e2e_test.go
+++ b/cmd/zoekt-archive-index/e2e_test.go
@@ -2,6 +2,7 @@
 
 import (
 	"archive/tar"
+	"compress/gzip"
 	"context"
 	"fmt"
 	"io"
@@ -16,7 +17,19 @@
 	"github.com/google/zoekt/shards"
 )
 
-func writeArchive(w io.Writer, files map[string]string) error {
+func writeArchive(w io.Writer, format string, files map[string]string) (err error) {
+	if format == "tgz" {
+		gw := gzip.NewWriter(w)
+		defer func() {
+			err2 := gw.Close()
+			if err == nil {
+				err = err2
+			}
+		}()
+		w = gw
+		format = "tar"
+	}
+
 	tw := tar.NewWriter(w)
 
 	for name, body := range files {
@@ -45,6 +58,14 @@
 // -incremental=true option changing the options between indexes and ensuring
 // the results change as expected.
 func TestIndexIncrementally(t *testing.T) {
+	for _, format := range []string{"tar", "tgz"} {
+		t.Run(format, func(t *testing.T) {
+			testIndexIncrementally(t, format)
+		})
+	}
+}
+
+func testIndexIncrementally(t *testing.T, format string) {
 	indexdir, err := ioutil.TempDir("", "TestIndexArg-index")
 	if err != nil {
 		t.Fatalf("TempDir: %v", err)
@@ -64,7 +85,7 @@
 		files["F"+s] = strings.Repeat("a", fileSize)
 	}
 
-	err = writeArchive(archive, files)
+	err = writeArchive(archive, format, files)
 	if err != nil {
 		t.Fatalf("unable to create archive %v", err)
 	}