| // Copyright 2016 Google Inc. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
// Package build implements a more convenient interface for building
// zoekt indices.
| package build |
| |
| import ( |
| "crypto/sha1" |
| "flag" |
| "fmt" |
| "io" |
| "io/ioutil" |
| "log" |
| "net/url" |
| "os" |
| "os/exec" |
| "path/filepath" |
| "reflect" |
| "regexp" |
| "runtime" |
| "runtime/pprof" |
| "sort" |
| "strings" |
| "sync" |
| |
| "github.com/google/zoekt" |
| "github.com/google/zoekt/ctags" |
| ) |
| |
// DefaultDir is the default directory for storing *.zoekt index shards.
var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
| |
| // Branch describes a single branch version. |
| type Branch struct { |
| Name string |
| Version string |
| } |
| |
| // Options sets options for the index building. |
| type Options struct { |
| // IndexDir is a directory that holds *.zoekt index files. |
| IndexDir string |
| |
	// SizeMax is the maximum file size, in bytes. Larger files are
	// recorded with a skip reason unless they match LargeFiles.
| SizeMax int |
| |
	// Parallelism is the maximum number of shards to index in parallel.
| Parallelism int |
| |
	// ShardMax sets the maximum corpus size, in bytes, for a single shard.
| ShardMax int |
| |
| // TrigramMax sets the maximum number of distinct trigrams per document. |
| TrigramMax int |
| |
| // RepositoryDescription holds names and URLs for the repository. |
| RepositoryDescription zoekt.Repository |
| |
| // SubRepositories is a path => sub repository map. |
| SubRepositories map[string]*zoekt.Repository |
| |
	// CTags is the path to the ctags binary to run (universal-ctags or
	// exuberant ctags).
| CTags string |
| |
| // If set, ctags must succeed. |
| CTagsMustSucceed bool |
| |
	// MemProfile, if set, is the file to write memory profiles to.
| MemProfile string |
| |
| // LargeFiles is a slice of glob patterns where matching file |
| // paths should be indexed regardless of their size. The pattern syntax |
| // can be found here: https://golang.org/pkg/path/filepath/#Match. |
| LargeFiles []string |
| } |
| |
| // HashOptions creates a hash of the options that affect an index. |
| func (o *Options) HashOptions() string { |
| hasher := sha1.New() |
| |
| hasher.Write([]byte(o.CTags)) |
| hasher.Write([]byte(fmt.Sprintf("%t", o.CTagsMustSucceed))) |
| hasher.Write([]byte(fmt.Sprintf("%d", o.SizeMax))) |
| hasher.Write([]byte(fmt.Sprintf("%q", o.LargeFiles))) |
| |
| return fmt.Sprintf("%x", hasher.Sum(nil)) |
| } |
| |
| type largeFilesFlag struct{ *Options } |
| |
| func (f largeFilesFlag) String() string { |
| // From flag.Value documentation: |
| // |
| // The flag package may call the String method with a zero-valued receiver, |
| // such as a nil pointer. |
| if f.Options == nil { |
| return "" |
| } |
	// Render the patterns in command-line form; the leading empty
	// element ensures the first pattern is also prefixed with the
	// flag name.
	s := append([]string{""}, f.LargeFiles...)
	return strings.Join(s, "-large_file ")
| } |
| |
| func (f largeFilesFlag) Set(value string) error { |
| f.LargeFiles = append(f.LargeFiles, value) |
| return nil |
| } |
| |
| // Flags adds flags for build options to fs. |
| func (o *Options) Flags(fs *flag.FlagSet) { |
| x := *o |
| x.SetDefaults() |
| fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") |
| fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") |
| fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") |
| fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") |
| fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") |
| fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") |
	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be indexed regardless of their size. You can add multiple patterns by setting this more than once.")
| } |
| |
// Builder manages (parallel) creation of uniformly sized shards. The
// builder buffers up documents until it has collected enough, then
// builds a shard and writes it out.
| type Builder struct { |
| opts Options |
| throttle chan int |
| |
| nextShardNum int |
| todo []*zoekt.Document |
| size int |
| |
| parser ctags.Parser |
| |
| building sync.WaitGroup |
| |
| errMu sync.Mutex |
| buildError error |
| |
	// temp name => final name for finished shards. We only rename
	// them once all shards succeed, to avoid Frankenstein corpuses.
| finishedShards map[string]string |
| } |
| |
| type finishedShard struct { |
| temp, final string |
| } |
| |
| // SetDefaults sets reasonable default options. |
| func (o *Options) SetDefaults() { |
	// Prefer universal-ctags, falling back to exuberant ctags.
	if o.CTags == "" {
| ctags, err := exec.LookPath("universal-ctags") |
| if err == nil { |
| o.CTags = ctags |
| } |
| } |
| |
| if o.CTags == "" { |
| ctags, err := exec.LookPath("ctags-exuberant") |
| if err == nil { |
| o.CTags = ctags |
| } |
| } |
| if o.Parallelism == 0 { |
| o.Parallelism = 4 |
| } |
| if o.SizeMax == 0 { |
| o.SizeMax = 2 << 20 |
| } |
| if o.ShardMax == 0 { |
| o.ShardMax = 100 << 20 |
| } |
| if o.TrigramMax == 0 { |
| o.TrigramMax = 20000 |
| } |
| |
| if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { |
| parsed, _ := url.Parse(o.RepositoryDescription.URL) |
| if parsed != nil { |
| o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) |
| } |
| } |
| } |
| |
| func hashString(s string) string { |
| h := sha1.New() |
| io.WriteString(h, s) |
| return fmt.Sprintf("%x", h.Sum(nil)) |
| } |
| |
// shardName returns the path of the given index shard.
| func (o *Options) shardName(n int) string { |
| abs := url.QueryEscape(o.RepositoryDescription.Name) |
| if len(abs) > 200 { |
| abs = abs[:200] + hashString(abs)[:8] |
| } |
| return filepath.Join(o.IndexDir, |
| fmt.Sprintf("%s_v%d.%05d.zoekt", abs, zoekt.IndexFormatVersion, n)) |
| } |
| |
| // IncrementalSkipIndexing returns true if the index present on disk matches |
| // the build options. |
| func (o *Options) IncrementalSkipIndexing() bool { |
| fn := o.shardName(0) |
| |
| f, err := os.Open(fn) |
| if err != nil { |
| return false |
| } |
| |
	iFile, err := zoekt.NewIndexFile(f)
	if err != nil {
		f.Close()
		return false
	}
| defer iFile.Close() |
| |
| repo, index, err := zoekt.ReadMetadata(iFile) |
| if err != nil { |
| return false |
| } |
| |
| if index.IndexFeatureVersion != zoekt.FeatureVersion { |
| return false |
| } |
| |
| if repo.IndexOptions != o.HashOptions() { |
| return false |
| } |
| |
| return reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) |
| } |
| |
// IgnoreSizeMax reports whether the maximum file size should be ignored
// for the named file.
| func (o *Options) IgnoreSizeMax(name string) bool { |
| for _, pattern := range o.LargeFiles { |
| pattern = strings.TrimSpace(pattern) |
| m, _ := filepath.Match(pattern, name) |
| if m { |
| return true |
| } |
| } |
| |
| return false |
| } |
| |
| // NewBuilder creates a new Builder instance. |
| func NewBuilder(opts Options) (*Builder, error) { |
| opts.SetDefaults() |
| if opts.RepositoryDescription.Name == "" { |
| return nil, fmt.Errorf("builder: must set Name") |
| } |
| |
| b := &Builder{ |
| opts: opts, |
| throttle: make(chan int, opts.Parallelism), |
| finishedShards: map[string]string{}, |
| } |
| |
| if b.opts.CTags == "" && b.opts.CTagsMustSucceed { |
| return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set") |
| } |
| |
| if strings.Contains(opts.CTags, "universal-ctags") { |
| parser, err := ctags.NewParser(opts.CTags) |
| if err != nil && opts.CTagsMustSucceed { |
| return nil, fmt.Errorf("ctags.NewParser: %v", err) |
| } |
| |
| b.parser = parser |
| } |
| if _, err := b.newShardBuilder(); err != nil { |
| return nil, err |
| } |
| |
| return b, nil |
| } |
| |
// AddFile is a convenience wrapper for the Add method.
| func (b *Builder) AddFile(name string, content []byte) error { |
| return b.Add(zoekt.Document{Name: name, Content: content}) |
| } |
| |
// Add queues a document for indexing, flushing a shard once enough
// data has accumulated.
func (b *Builder) Add(doc zoekt.Document) error {
	// We could pass oversized and binary documents to the
	// shardbuilder as-is, but if a part of the source tree consists
	// of such files, the corresponding shard would be mostly empty.
	// Instead, keep the document and record why it was skipped.
| if len(doc.Content) > b.opts.SizeMax && !b.opts.IgnoreSizeMax(doc.Name) { |
| doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) |
| } else if err := zoekt.CheckText(doc.Content, b.opts.TrigramMax); err != nil { |
| doc.SkipReason = err.Error() |
| doc.Language = "binary" |
| } |
| |
| b.todo = append(b.todo, &doc) |
| b.size += len(doc.Name) + len(doc.Content) |
| if b.size > b.opts.ShardMax { |
| return b.flush() |
| } |
| |
| return nil |
| } |
| |
// Finish creates a last shard from the buffered documents, and clears
// stale shards from previous runs. It should always be called, even in
// failure cases, to ensure cleanup.
| func (b *Builder) Finish() error { |
| b.flush() |
| b.building.Wait() |
| |
| if b.buildError != nil { |
| for tmp := range b.finishedShards { |
| os.Remove(tmp) |
| } |
| b.finishedShards = map[string]string{} |
| return b.buildError |
| } |
| |
| for tmp, final := range b.finishedShards { |
| if err := os.Rename(tmp, final); err != nil { |
| b.buildError = err |
| } |
| } |
| b.finishedShards = map[string]string{} |
| |
| if b.nextShardNum > 0 { |
| b.deleteRemainingShards() |
| } |
| return b.buildError |
| } |
| |
| func (b *Builder) deleteRemainingShards() { |
| for { |
| shard := b.nextShardNum |
| b.nextShardNum++ |
| name := b.opts.shardName(shard) |
| if err := os.Remove(name); os.IsNotExist(err) { |
| break |
| } |
| } |
| } |
| |
| func (b *Builder) flush() error { |
| todo := b.todo |
| b.todo = nil |
| b.size = 0 |
| b.errMu.Lock() |
| defer b.errMu.Unlock() |
| if b.buildError != nil { |
| return b.buildError |
| } |
| |
	// An empty flush only needs to build a shard if none exists yet:
	// even an empty repository should get one shard carrying its
	// metadata.
	hasShard := b.nextShardNum > 0
	if len(todo) == 0 && hasShard {
		return nil
	}
| |
| shard := b.nextShardNum |
| b.nextShardNum++ |
| |
| if b.opts.Parallelism > 1 { |
| b.building.Add(1) |
| go func() { |
| b.throttle <- 1 |
| done, err := b.buildShard(todo, shard) |
| <-b.throttle |
| |
| b.errMu.Lock() |
| defer b.errMu.Unlock() |
| if err != nil && b.buildError == nil { |
| b.buildError = err |
| } |
| if err == nil { |
| b.finishedShards[done.temp] = done.final |
| } |
| b.building.Done() |
| }() |
| } else { |
| // No goroutines when we're not parallel. This |
| // simplifies memory profiling. |
| done, err := b.buildShard(todo, shard) |
| b.buildError = err |
| if err == nil { |
| b.finishedShards[done.temp] = done.final |
| } |
| if b.opts.MemProfile != "" { |
| // drop memory, and profile. |
| todo = nil |
| b.writeMemProfile(b.opts.MemProfile) |
| } |
| |
| return b.buildError |
| } |
| |
| return nil |
| } |
| |
| var profileNumber int |
| |
| func (b *Builder) writeMemProfile(name string) { |
| nm := fmt.Sprintf("%s.%d", name, profileNumber) |
| profileNumber++ |
| f, err := os.Create(nm) |
| if err != nil { |
| log.Fatal("could not create memory profile: ", err) |
| } |
| runtime.GC() // get up-to-date statistics |
| if err := pprof.WriteHeapProfile(f); err != nil { |
| log.Fatal("could not write memory profile: ", err) |
| } |
| f.Close() |
| log.Printf("wrote mem profile %q", nm) |
| } |
| |
// squashRange maps [0,inf) to [0,1) monotonically.
| func squashRange(j int) float64 { |
| x := float64(j) |
| return x / (1 + x) |
| } |
| |
| var testRe = regexp.MustCompile("test") |
| |
| type rankedDoc struct { |
| *zoekt.Document |
| rank []float64 |
| } |
| |
| func rank(d *zoekt.Document, origIdx int) []float64 { |
| test := 0.0 |
| if testRe.MatchString(d.Name) { |
| test = 1.0 |
| } |
| |
| // Smaller is earlier (=better). |
| return []float64{ |
| // Prefer docs that are not tests |
| test, |
| |
| // With many symbols |
| 1.0 - squashRange(len(d.Symbols)), |
| |
| // With short content |
| squashRange(len(d.Content)), |
| |
| // With short names |
| squashRange(len(d.Name)), |
| |
		// That are present in as many branches as possible
| 1.0 - squashRange(len(d.Branches)), |
| |
| // Preserve original ordering. |
| squashRange(origIdx), |
| } |
| } |
| |
| func sortDocuments(todo []*zoekt.Document) { |
| rs := make([]rankedDoc, 0, len(todo)) |
| for i, t := range todo { |
| rd := rankedDoc{t, rank(t, i)} |
| rs = append(rs, rd) |
| } |
| sort.Slice(rs, func(i, j int) bool { |
| r1 := rs[i].rank |
| r2 := rs[j].rank |
| for i := range r1 { |
| if r1[i] < r2[i] { |
| return true |
| } |
| if r1[i] > r2[i] { |
| return false |
| } |
| } |
| |
| return false |
| }) |
| for i := range todo { |
| todo[i] = rs[i].Document |
| } |
| } |
| |
| func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { |
| if b.opts.CTags != "" { |
| err := ctagsAddSymbols(todo, b.parser, b.opts.CTags) |
| if b.opts.CTagsMustSucceed && err != nil { |
| return nil, err |
| } |
| if err != nil { |
| log.Printf("ignoring %s error: %v", b.opts.CTags, err) |
| } |
| } |
| |
| name := b.opts.shardName(nextShardNum) |
| |
| shardBuilder, err := b.newShardBuilder() |
| if err != nil { |
| return nil, err |
| } |
| sortDocuments(todo) |
| for _, t := range todo { |
| if err := shardBuilder.Add(*t); err != nil { |
| return nil, err |
| } |
| } |
| |
| return b.writeShard(name, shardBuilder) |
| } |
| |
| func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { |
| desc := b.opts.RepositoryDescription |
| desc.SubRepoMap = b.opts.SubRepositories |
| desc.IndexOptions = b.opts.HashOptions() |
| |
| shardBuilder, err := zoekt.NewIndexBuilder(&desc) |
| if err != nil { |
| return nil, err |
| } |
| return shardBuilder, nil |
| } |
| |
| func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { |
| dir := filepath.Dir(fn) |
| if err := os.MkdirAll(dir, 0700); err != nil { |
| return nil, err |
| } |
| |
	f, err := ioutil.TempFile(dir, filepath.Base(fn)+".*.tmp")
| if err != nil { |
| return nil, err |
| } |
| if runtime.GOOS != "windows" { |
| if err := f.Chmod(0666 &^ umask); err != nil { |
| return nil, err |
| } |
| } |
| |
	// The deferred Close covers early error returns; the explicit
	// Close below verifies the final flush succeeded.
	defer f.Close()
| if err := ib.Write(f); err != nil { |
| return nil, err |
| } |
| fi, err := f.Stat() |
| if err != nil { |
| return nil, err |
| } |
| if err := f.Close(); err != nil { |
| return nil, err |
| } |
| |
| log.Printf("finished %s: %d index bytes (overhead %3.1f)", fn, fi.Size(), |
| float64(fi.Size())/float64(ib.ContentSize()+1)) |
| |
| return &finishedShard{f.Name(), fn}, nil |
| } |
| |
// umask holds the umask of the current process. It is populated during
// initialization in platform-specific code (Windows has no umask).
var umask os.FileMode