zoekt-archive-index: support reading zip files
The zip format requires random access to the archive, so zip files that are
streamed in (over HTTP or stdin) are not supported.
Change-Id: I1b8a02c674afd5b77373155f8a795f51677c5cac
diff --git a/cmd/zoekt-archive-index/archive.go b/cmd/zoekt-archive-index/archive.go
index 67b3db6..c1afe5b 100644
--- a/cmd/zoekt-archive-index/archive.go
+++ b/cmd/zoekt-archive-index/archive.go
@@ -2,6 +2,7 @@
import (
"archive/tar"
+ "archive/zip"
"bytes"
"compress/gzip"
"fmt"
@@ -19,14 +20,14 @@
}
type File struct {
- io.Reader
+ io.ReadCloser
Name string
Size int64
}
type tarArchive struct {
- tr *tar.Reader
- closer io.Closer
+ io.Closer
+ tr *tar.Reader
}
func (a *tarArchive) Next() (*File, error) {
@@ -42,15 +43,69 @@
}
return &File{
- Reader: a.tr,
- Name: hdr.Name,
- Size: hdr.Size,
+ ReadCloser: ioutil.NopCloser(a.tr),
+ Name: hdr.Name,
+ Size: hdr.Size,
}, nil
}
}
-func (a *tarArchive) Close() error {
- return a.closer.Close()
+type zipArchive struct {
+ io.Closer
+ files []*zip.File
+}
+
+func (a *zipArchive) Next() (*File, error) {
+ if len(a.files) == 0 {
+ return nil, io.EOF
+ }
+
+ f := a.files[0]
+ a.files = a.files[1:]
+
+ r, err := f.Open()
+ if err != nil {
+ return nil, err
+ }
+
+ return &File{
+ ReadCloser: r,
+ Name: f.Name,
+ Size: int64(f.UncompressedSize64),
+ }, nil
+}
+
+func newZipArchive(r io.Reader, closer io.Closer) (*zipArchive, error) {
+ f, ok := r.(interface {
+ io.ReaderAt
+ Stat() (os.FileInfo, error)
+ })
+ if !ok {
+ return nil, fmt.Errorf("streaming zip files not supported")
+ }
+
+ fi, err := f.Stat()
+ if err != nil {
+ return nil, err
+ }
+
+ zr, err := zip.NewReader(f, fi.Size())
+ if err != nil {
+ return nil, err
+ }
+
+ // Filter out non files
+ files := zr.File[:0]
+ for _, f := range zr.File {
+ if f.Mode().IsRegular() {
+ files = append(files, f)
+ }
+ }
+
+ return &zipArchive{
+ Closer: closer,
+ files: files,
+ }, nil
}
func detectContentType(r io.Reader) (string, io.Reader, error) {
@@ -62,7 +117,13 @@
ct := http.DetectContentType(buf[:n])
- // Return a new reader which merges in the read bytes
+ // If we are a seeker, we can just undo our read
+ if s, ok := r.(io.Seeker); ok {
+ _, err := s.Seek(int64(-n), io.SeekCurrent)
+ return ct, r, err
+ }
+
+ // Otherwise return a new reader which merges in the read bytes
return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil
}
@@ -109,15 +170,19 @@
if err != nil {
return nil, err
}
- if ct == "application/x-gzip" {
+ switch ct {
+ case "application/x-gzip":
r, err = gzip.NewReader(r)
if err != nil {
return nil, err
}
+
+ case "application/zip":
+ return newZipArchive(r, readCloser)
}
return &tarArchive{
+ Closer: readCloser,
tr: tar.NewReader(r),
- closer: readCloser,
}, nil
}
diff --git a/cmd/zoekt-archive-index/e2e_test.go b/cmd/zoekt-archive-index/e2e_test.go
index 16705ea..6456f41 100644
--- a/cmd/zoekt-archive-index/e2e_test.go
+++ b/cmd/zoekt-archive-index/e2e_test.go
@@ -2,8 +2,10 @@
import (
"archive/tar"
+ "archive/zip"
"compress/gzip"
"context"
+ "errors"
"fmt"
"io"
"io/ioutil"
@@ -18,6 +20,20 @@
)
func writeArchive(w io.Writer, format string, files map[string]string) (err error) {
+ if format == "zip" {
+ zw := zip.NewWriter(w)
+ for name, body := range files {
+ f, err := zw.Create(name)
+ if err != nil {
+ return err
+ }
+ if _, err := f.Write([]byte(body)); err != nil {
+ return err
+ }
+ }
+ return zw.Close()
+ }
+
if format == "tgz" {
gw := gzip.NewWriter(w)
defer func() {
@@ -30,6 +46,10 @@
format = "tar"
}
+ if format != "tar" {
+ return errors.New("expected tar")
+ }
+
tw := tar.NewWriter(w)
for name, body := range files {
@@ -58,7 +78,7 @@
// -incremental=true option changing the options between indexes and ensuring
// the results change as expected.
func TestIndexIncrementally(t *testing.T) {
- for _, format := range []string{"tar", "tgz"} {
+ for _, format := range []string{"tar", "tgz", "zip"} {
t.Run(format, func(t *testing.T) {
testIndexIncrementally(t, format)
})
diff --git a/cmd/zoekt-archive-index/main.go b/cmd/zoekt-archive-index/main.go
index 2755088..da83e65 100644
--- a/cmd/zoekt-archive-index/main.go
+++ b/cmd/zoekt-archive-index/main.go
@@ -159,18 +159,13 @@
if err != nil {
return err
}
- for {
- f, err := a.Next()
- if err == io.EOF {
- break
- }
- if err != nil {
- return err
- }
+
+ add := func(f *File) error {
+ defer f.Close()
// We do not index large files
if f.Size > int64(bopts.SizeMax) && !bopts.IgnoreSizeMax(f.Name) {
- continue
+ return nil
}
contents, err := ioutil.ReadAll(f)
@@ -180,17 +175,28 @@
name := stripComponents(f.Name, opts.Strip)
if name == "" {
- continue
+ return nil
}
- err = builder.Add(zoekt.Document{
+ return builder.Add(zoekt.Document{
Name: name,
Content: contents,
Branches: brs,
})
+ }
+
+ for {
+ f, err := a.Next()
+ if err == io.EOF {
+ break
+ }
if err != nil {
return err
}
+
+ if err := add(f); err != nil {
+ return err
+ }
}
return builder.Finish()