add build option for large file whitelist

Change-Id: I70e51197558e760432e820743284696a4c469132
diff --git a/build/builder.go b/build/builder.go
index 08db631..19b951d 100644
--- a/build/builder.go
+++ b/build/builder.go
@@ -73,6 +73,11 @@
 
 	// Write memory profiles to this file.
 	MemProfile string
+
+	// LargeFiles is a slice of glob patterns where matching file
+	// paths should be indexed regardless of their size. The pattern syntax
+	// can be found here: https://golang.org/pkg/path/filepath/#Match.
+	LargeFiles []string
 }
 
 // Builder manages (parallel) creation of uniformly sized shards. The
@@ -179,6 +184,19 @@
 	return repo.Branches
 }
 
+// IgnoreSizeMax reports whether name matches one of the LargeFiles glob
+// patterns, in which case the SizeMax limit does not apply to that file.
+func (o *Options) IgnoreSizeMax(name string) bool {
+	for _, glob := range o.LargeFiles {
+		// Match only fails with ErrBadPattern; a malformed glob is treated as a non-match.
+		if ok, _ := filepath.Match(strings.TrimSpace(glob), name); ok {
+			return true
+		}
+	}
+
+	return false
+}
+
 // NewBuilder creates a new Builder instance.
 func NewBuilder(opts Options) (*Builder, error) {
 	opts.SetDefaults()
@@ -221,7 +239,7 @@
 	// we pass through a part of the source tree with binary/large
 	// files, the corresponding shard would be mostly empty, so
 	// insert a reason here too.
-	if len(doc.Content) > b.opts.SizeMax {
+	if len(doc.Content) > b.opts.SizeMax && !b.opts.IgnoreSizeMax(doc.Name) {
 		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
 	} else if err := zoekt.CheckText(doc.Content); err != nil {
 		doc.SkipReason = err.Error()
diff --git a/build/e2e_test.go b/build/e2e_test.go
index a68c9bb..d6a7fe7 100644
--- a/build/e2e_test.go
+++ b/build/e2e_test.go
@@ -88,6 +88,60 @@
 	defer ss.Close()
 }
 
+// TestLargeFileOption checks that files larger than SizeMax are indexed
+// only when their name matches one of the LargeFiles patterns.
+func TestLargeFileOption(t *testing.T) {
+	dir, err := ioutil.TempDir("", "large_files_test")
+	if err != nil {
+		t.Fatalf("TempDir: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	sizeMax := 1000
+	opts := Options{
+		IndexDir:   dir,
+		LargeFiles: []string{"F0", "F2"},
+		RepositoryDescription: zoekt.Repository{
+			Name: "repo",
+		},
+		SizeMax: sizeMax,
+	}
+
+	b, err := NewBuilder(opts)
+	if err != nil {
+		t.Fatalf("NewBuilder: %v", err)
+	}
+
+	for i := 0; i < 4; i++ {
+		b.AddFile(fmt.Sprintf("F%d", i), []byte(strings.Repeat("a", sizeMax+1)))
+	}
+
+	b.Finish()
+
+	ss, err := shards.NewDirectorySearcher(dir)
+	if err != nil {
+		t.Fatalf("NewDirectorySearcher(%s): %v", dir, err)
+	}
+	// Registered before any later t.Fatalf so the searcher is closed on every path.
+	defer ss.Close()
+
+	q, err := query.Parse("aaa")
+	if err != nil {
+		t.Fatalf("Parse(aaa): %v", err)
+	}
+
+	var sOpts zoekt.SearchOptions
+	ctx := context.Background()
+	result, err := ss.Search(ctx, q, &sOpts)
+	if err != nil {
+		t.Fatalf("Search(%v): %v", q, err)
+	}
+
+	if len(result.Files) != 2 {
+		t.Errorf("got %v files, want 2 files.", len(result.Files))
+	}
+}
+
 func TestUpdate(t *testing.T) {
 	dir, err := ioutil.TempDir("", "")
 	if err != nil {
diff --git a/cmd/flags.go b/cmd/flags.go
index 20c64a6..51580a1 100644
--- a/cmd/flags.go
+++ b/cmd/flags.go
@@ -19,11 +19,24 @@
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"github.com/google/zoekt"
 	"github.com/google/zoekt/build"
 )
 
+type largeFilesFlag []string
+
+func (f *largeFilesFlag) String() string {
+	s := append([]string{""}, *f...)
+	return strings.TrimPrefix(strings.Join(s, " -large_file "), " ") // keeps entries space-separated
+}
+
+func (f *largeFilesFlag) Set(value string) error {
+	*f = append(*f, value)
+	return nil
+}
+
 var (
 	sizeMax     = flag.Int("file_limit", 128*1024, "maximum file size")
 	shardLimit  = flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard")
@@ -31,8 +44,13 @@
 	indexDir    = flag.String("index", build.DefaultDir, "directory for search indices")
 	version     = flag.Bool("version", false, "Print version number")
 	ctags       = flag.Bool("require_ctags", false, "If set, ctags calls must succeed.")
+	largeFiles  = largeFilesFlag{}
 )
 
+func init() {
+	flag.Var(&largeFiles, "large_file", "A glob pattern where matching files are to be indexed regardless of their size. You can add multiple patterns by setting this more than once.")
+}
+
 func OptionsFromFlags() *build.Options {
 	if *version {
 		name := filepath.Base(os.Args[0])
diff --git a/cmd/zoekt-archive-index/main.go b/cmd/zoekt-archive-index/main.go
index 2efcfe3..1685c62 100644
--- a/cmd/zoekt-archive-index/main.go
+++ b/cmd/zoekt-archive-index/main.go
@@ -168,7 +168,7 @@
 		}
 
 		// We do not index large files
-		if f.Size > int64(bopts.SizeMax) {
+		if f.Size > int64(bopts.SizeMax) && !bopts.IgnoreSizeMax(f.Name) {
 			continue
 		}
 
diff --git a/cmd/zoekt-index/main.go b/cmd/zoekt-index/main.go
index 54ff4ff..3ac304a 100644
--- a/cmd/zoekt-index/main.go
+++ b/cmd/zoekt-index/main.go
@@ -60,7 +60,6 @@
 
 func main() {
 	var cpuProfile = flag.String("cpu_profile", "", "write cpu profile to file")
-
 	ignoreDirs := flag.String("ignore_dirs", ".git,.hg,.svn", "comma separated list of directories to ignore.")
 	flag.Parse()
 
@@ -120,7 +119,7 @@
 
 	for f := range comm {
 		displayName := strings.TrimPrefix(f.name, dir+"/")
-		if f.size > int64(opts.SizeMax) {
+		if f.size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(displayName) {
 			builder.Add(zoekt.Document{
 				Name:       displayName,
 				SkipReason: fmt.Sprintf("document size %d larger than limit %d", f.size, opts.SizeMax),
diff --git a/gitindex/index.go b/gitindex/index.go
index a94d49b..1301ac7 100644
--- a/gitindex/index.go
+++ b/gitindex/index.go
@@ -481,7 +481,7 @@
 				return err
 			}
 
-			if blob.Size > int64(opts.BuildOptions.SizeMax) {
+			if blob.Size > int64(opts.BuildOptions.SizeMax) && !opts.BuildOptions.IgnoreSizeMax(key.FullPath()) {
 				if err := builder.Add(zoekt.Document{
 					SkipReason:        fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.BuildOptions.SizeMax),
 					Name:              key.FullPath(),