Add build option (LargeFiles / -large_file) to whitelist large files for indexing
Change-Id: I70e51197558e760432e820743284696a4c469132
diff --git a/build/builder.go b/build/builder.go
index 08db631..19b951d 100644
--- a/build/builder.go
+++ b/build/builder.go
@@ -73,6 +73,11 @@
// Write memory profiles to this file.
MemProfile string
+
+ // LargeFiles is a slice of glob patterns where matching file
+ // paths should be indexed regardless of their size. The pattern syntax
+ // can be found here: https://golang.org/pkg/path/filepath/#Match.
+ LargeFiles []string
}
// Builder manages (parallel) creation of uniformly sized shards. The
@@ -179,6 +184,19 @@
return repo.Branches
}
+// IgnoreSizeMax reports whether name matches one of the LargeFiles glob patterns and is therefore exempt from SizeMax.
+func (o *Options) IgnoreSizeMax(name string) bool {
+ for _, pattern := range o.LargeFiles {
+ pattern = strings.TrimSpace(pattern)
+ m, _ := filepath.Match(pattern, name)
+ if m {
+ return true
+ }
+ }
+
+ return false
+}
+
// NewBuilder creates a new Builder instance.
func NewBuilder(opts Options) (*Builder, error) {
opts.SetDefaults()
@@ -221,7 +239,7 @@
// we pass through a part of the source tree with binary/large
// files, the corresponding shard would be mostly empty, so
// insert a reason here too.
- if len(doc.Content) > b.opts.SizeMax {
+ if len(doc.Content) > b.opts.SizeMax && !b.opts.IgnoreSizeMax(doc.Name) {
doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
} else if err := zoekt.CheckText(doc.Content); err != nil {
doc.SkipReason = err.Error()
diff --git a/build/e2e_test.go b/build/e2e_test.go
index a68c9bb..d6a7fe7 100644
--- a/build/e2e_test.go
+++ b/build/e2e_test.go
@@ -88,6 +88,60 @@
defer ss.Close()
}
+func TestLargeFileOption(t *testing.T) {
+ dir, err := ioutil.TempDir("", "large_files_test")
+ if err != nil {
+ t.Fatalf("TempDir: %v", err)
+ }
+ defer func() {
+ os.RemoveAll(dir)
+ }()
+
+ sizeMax := 1000
+ opts := Options{
+ IndexDir: dir,
+ LargeFiles: []string{"F0", "F2"},
+ RepositoryDescription: zoekt.Repository{
+ Name: "repo",
+ },
+ SizeMax: sizeMax,
+ }
+
+ b, err := NewBuilder(opts)
+ if err != nil {
+ t.Fatalf("NewBuilder: %v", err)
+ }
+
+ for i := 0; i < 4; i++ {
+ s := fmt.Sprintf("%d", i)
+ b.AddFile("F"+s, []byte(strings.Repeat("a", sizeMax+1)))
+ }
+
+ b.Finish()
+
+ ss, err := shards.NewDirectorySearcher(dir)
+ if err != nil {
+ t.Fatalf("NewDirectorySearcher(%s): %v", dir, err)
+ }
+
+ q, err := query.Parse("aaa")
+ if err != nil {
+ t.Fatalf("Parse(aaa): %v", err)
+ }
+
+ var sOpts zoekt.SearchOptions
+ ctx := context.Background()
+ result, err := ss.Search(ctx, q, &sOpts)
+ if err != nil {
+ t.Fatalf("Search(%v): %v", q, err)
+ }
+
+ if len(result.Files) != 2 {
+ t.Errorf("got %v files, want 2 files.", len(result.Files))
+ }
+ defer ss.Close()
+}
+
func TestUpdate(t *testing.T) {
dir, err := ioutil.TempDir("", "")
if err != nil {
diff --git a/cmd/flags.go b/cmd/flags.go
index 20c64a6..51580a1 100644
--- a/cmd/flags.go
+++ b/cmd/flags.go
@@ -19,11 +19,24 @@
"fmt"
"os"
"path/filepath"
+ "strings"
"github.com/google/zoekt"
"github.com/google/zoekt/build"
)
+type largeFilesFlag []string
+
+func (f *largeFilesFlag) String() string {
+ s := append([]string{""}, *f...)
+ return strings.Join(s, "-large_file ")
+}
+
+func (f *largeFilesFlag) Set(value string) error {
+ *f = append(*f, value)
+ return nil
+}
+
var (
sizeMax = flag.Int("file_limit", 128*1024, "maximum file size")
shardLimit = flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard")
@@ -31,8 +44,13 @@
indexDir = flag.String("index", build.DefaultDir, "directory for search indices")
version = flag.Bool("version", false, "Print version number")
ctags = flag.Bool("require_ctags", false, "If set, ctags calls must succeed.")
+ largeFiles = largeFilesFlag{}
)
+func init() {
+ flag.Var(&largeFiles, "large_file", "A glob pattern where matching files are to be indexed regardless of their size. You can add multiple patterns by setting this more than once.")
+}
+
func OptionsFromFlags() *build.Options {
if *version {
name := filepath.Base(os.Args[0])
diff --git a/cmd/zoekt-archive-index/main.go b/cmd/zoekt-archive-index/main.go
index 2efcfe3..1685c62 100644
--- a/cmd/zoekt-archive-index/main.go
+++ b/cmd/zoekt-archive-index/main.go
@@ -168,7 +168,7 @@
}
// We do not index large files
- if f.Size > int64(bopts.SizeMax) {
+ if f.Size > int64(bopts.SizeMax) && !bopts.IgnoreSizeMax(f.Name) {
continue
}
diff --git a/cmd/zoekt-index/main.go b/cmd/zoekt-index/main.go
index 54ff4ff..3ac304a 100644
--- a/cmd/zoekt-index/main.go
+++ b/cmd/zoekt-index/main.go
@@ -60,7 +60,6 @@
func main() {
var cpuProfile = flag.String("cpu_profile", "", "write cpu profile to file")
-
ignoreDirs := flag.String("ignore_dirs", ".git,.hg,.svn", "comma separated list of directories to ignore.")
flag.Parse()
@@ -120,7 +119,7 @@
for f := range comm {
displayName := strings.TrimPrefix(f.name, dir+"/")
- if f.size > int64(opts.SizeMax) {
+ if f.size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(displayName) {
builder.Add(zoekt.Document{
Name: displayName,
SkipReason: fmt.Sprintf("document size %d larger than limit %d", f.size, opts.SizeMax),
diff --git a/gitindex/index.go b/gitindex/index.go
index a94d49b..1301ac7 100644
--- a/gitindex/index.go
+++ b/gitindex/index.go
@@ -481,7 +481,7 @@
return err
}
- if blob.Size > int64(opts.BuildOptions.SizeMax) {
+ if blob.Size > int64(opts.BuildOptions.SizeMax) && !opts.BuildOptions.IgnoreSizeMax(key.FullPath()) {
if err := builder.Add(zoekt.Document{
SkipReason: fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.BuildOptions.SizeMax),
Name: key.FullPath(),