zoekt-archive-index: support reading zip files

Zip archives require random access, so we do not add support for
streaming them in (via http or stdin).

Change-Id: I1b8a02c674afd5b77373155f8a795f51677c5cac
diff --git a/cmd/zoekt-archive-index/archive.go b/cmd/zoekt-archive-index/archive.go
index 67b3db6..c1afe5b 100644
--- a/cmd/zoekt-archive-index/archive.go
+++ b/cmd/zoekt-archive-index/archive.go
@@ -2,6 +2,7 @@
 
 import (
 	"archive/tar"
+	"archive/zip"
 	"bytes"
 	"compress/gzip"
 	"fmt"
@@ -19,14 +20,14 @@
 }
 
 type File struct {
-	io.Reader
+	io.ReadCloser
 	Name string
 	Size int64
 }
 
 type tarArchive struct {
-	tr     *tar.Reader
-	closer io.Closer
+	io.Closer
+	tr *tar.Reader
 }
 
 func (a *tarArchive) Next() (*File, error) {
@@ -42,15 +43,69 @@
 		}
 
 		return &File{
-			Reader: a.tr,
-			Name:   hdr.Name,
-			Size:   hdr.Size,
+			ReadCloser: ioutil.NopCloser(a.tr),
+			Name:       hdr.Name,
+			Size:       hdr.Size,
 		}, nil
 	}
 }
 
-func (a *tarArchive) Close() error {
-	return a.closer.Close()
+type zipArchive struct {
+	io.Closer
+	files []*zip.File
+}
+
+func (a *zipArchive) Next() (*File, error) {
+	if len(a.files) == 0 {
+		return nil, io.EOF
+	}
+
+	f := a.files[0]
+	a.files = a.files[1:]
+
+	r, err := f.Open()
+	if err != nil {
+		return nil, err
+	}
+
+	return &File{
+		ReadCloser: r,
+		Name:       f.Name,
+		Size:       int64(f.UncompressedSize64),
+	}, nil
+}
+
+func newZipArchive(r io.Reader, closer io.Closer) (*zipArchive, error) {
+	f, ok := r.(interface {
+		io.ReaderAt
+		Stat() (os.FileInfo, error)
+	})
+	if !ok {
+		return nil, fmt.Errorf("streaming zip files not supported")
+	}
+
+	fi, err := f.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	zr, err := zip.NewReader(f, fi.Size())
+	if err != nil {
+		return nil, err
+	}
+
+	// Keep only regular files; skip directories and other non-file entries
+	files := zr.File[:0]
+	for _, f := range zr.File {
+		if f.Mode().IsRegular() {
+			files = append(files, f)
+		}
+	}
+
+	return &zipArchive{
+		Closer: closer,
+		files:  files,
+	}, nil
 }
 
 func detectContentType(r io.Reader) (string, io.Reader, error) {
@@ -62,7 +117,13 @@
 
 	ct := http.DetectContentType(buf[:n])
 
-	// Return a new reader which merges in the read bytes
+	// If we are a seeker, we can just undo our read
+	if s, ok := r.(io.Seeker); ok {
+		_, err := s.Seek(int64(-n), io.SeekCurrent)
+		return ct, r, err
+	}
+
+	// Otherwise return a new reader which merges in the read bytes
 	return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil
 }
 
@@ -109,15 +170,19 @@
 	if err != nil {
 		return nil, err
 	}
-	if ct == "application/x-gzip" {
+	switch ct {
+	case "application/x-gzip":
 		r, err = gzip.NewReader(r)
 		if err != nil {
 			return nil, err
 		}
+
+	case "application/zip":
+		return newZipArchive(r, readCloser)
 	}
 
 	return &tarArchive{
+		Closer: readCloser,
 		tr:     tar.NewReader(r),
-		closer: readCloser,
 	}, nil
 }
diff --git a/cmd/zoekt-archive-index/e2e_test.go b/cmd/zoekt-archive-index/e2e_test.go
index 16705ea..6456f41 100644
--- a/cmd/zoekt-archive-index/e2e_test.go
+++ b/cmd/zoekt-archive-index/e2e_test.go
@@ -2,8 +2,10 @@
 
 import (
 	"archive/tar"
+	"archive/zip"
 	"compress/gzip"
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -18,6 +20,20 @@
 )
 
 func writeArchive(w io.Writer, format string, files map[string]string) (err error) {
+	if format == "zip" {
+		zw := zip.NewWriter(w)
+		for name, body := range files {
+			f, err := zw.Create(name)
+			if err != nil {
+				return err
+			}
+			if _, err := f.Write([]byte(body)); err != nil {
+				return err
+			}
+		}
+		return zw.Close()
+	}
+
 	if format == "tgz" {
 		gw := gzip.NewWriter(w)
 		defer func() {
@@ -30,6 +46,10 @@
 		format = "tar"
 	}
 
+	if format != "tar" {
+		return errors.New("expected tar")
+	}
+
 	tw := tar.NewWriter(w)
 
 	for name, body := range files {
@@ -58,7 +78,7 @@
 // -incremental=true option changing the options between indexes and ensuring
 // the results change as expected.
 func TestIndexIncrementally(t *testing.T) {
-	for _, format := range []string{"tar", "tgz"} {
+	for _, format := range []string{"tar", "tgz", "zip"} {
 		t.Run(format, func(t *testing.T) {
 			testIndexIncrementally(t, format)
 		})
diff --git a/cmd/zoekt-archive-index/main.go b/cmd/zoekt-archive-index/main.go
index 2755088..da83e65 100644
--- a/cmd/zoekt-archive-index/main.go
+++ b/cmd/zoekt-archive-index/main.go
@@ -159,18 +159,13 @@
 	if err != nil {
 		return err
 	}
-	for {
-		f, err := a.Next()
-		if err == io.EOF {
-			break
-		}
-		if err != nil {
-			return err
-		}
+
+	add := func(f *File) error {
+		defer f.Close()
 
 		// We do not index large files
 		if f.Size > int64(bopts.SizeMax) && !bopts.IgnoreSizeMax(f.Name) {
-			continue
+			return nil
 		}
 
 		contents, err := ioutil.ReadAll(f)
@@ -180,17 +175,28 @@
 
 		name := stripComponents(f.Name, opts.Strip)
 		if name == "" {
-			continue
+			return nil
 		}
 
-		err = builder.Add(zoekt.Document{
+		return builder.Add(zoekt.Document{
 			Name:     name,
 			Content:  contents,
 			Branches: brs,
 		})
+	}
+
+	for {
+		f, err := a.Next()
+		if err == io.EOF {
+			break
+		}
 		if err != nil {
 			return err
 		}
+
+		if err := add(f); err != nil {
+			return err
+		}
 	}
 
 	return builder.Finish()