blob: 77ee6118c61eb21aa19e45fa10d0c1316fef2a45 [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// package build implements a more convenient interface for building
// zoekt indices.
package build
import (
"fmt"
"io/ioutil"
"log"
"os"
"os/exec"
"path/filepath"
"runtime"
"runtime/pprof"
"strings"
"sync"
"github.com/google/zoekt"
)
var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
// Branch describes a single branch version.
type Branch struct {
Name string
Version string
}
// Options sets options for the index building.
type Options struct {
// IndexDir is a directory that holds *.zoekt index files.
IndexDir string
// SizeMax is the maximum file size
SizeMax int
// Parallelism is the maximum number of shards to index in parallel
Parallelism int
// ShardMax sets the maximum corpus size for a single shard
ShardMax int
// RepositoryDescription holds names and URLs for the repository.
RepositoryDescription zoekt.Repository
// SubRepositories is a path => sub repository map.
SubRepositories map[string]*zoekt.Repository
// RepoDir is the path to the repository on the indexing machine.
RepoDir string
// Path to exuberant ctags binary to run
CTags string
// If set, ctags must succeed.
CTagsMustSucceed bool
// Path to namespace-sandbox from Bazel
NamespaceSandbox string
// Write memory profiles to this file.
MemProfile string
}
// Builder manages (parallel) creation of uniformly sized shards.
type Builder struct {
opts Options
throttle chan int
nextShardNum int
todo []*zoekt.Document
size int
building sync.WaitGroup
errMu sync.Mutex
buildError error
// temp name => final name for finished shards. We only rename
// them once all shards succeed to avoid Frankstein corpuses.
finishedShards map[string]string
}
// SetDefaults sets reasonable default options.
func (o *Options) SetDefaults() {
if o.CTags == "" {
ctags, err := exec.LookPath("ctags-universal")
if err == nil {
o.CTags = ctags
}
}
if o.CTags == "" {
ctags, err := exec.LookPath("ctags-exuberant")
if err == nil {
o.CTags = ctags
}
}
if o.NamespaceSandbox == "" {
ns, err := exec.LookPath("zoekt-sandbox")
if err == nil {
o.NamespaceSandbox = ns
}
}
if o.Parallelism == 0 {
o.Parallelism = 1
}
if o.SizeMax == 0 {
o.SizeMax = 128 << 10
}
if o.ShardMax == 0 {
o.ShardMax = 128 << 20
}
}
// ShardName returns the name the given index shard.
func (o *Options) shardName(n int) (string, error) {
abs, err := filepath.Abs(o.RepoDir)
if err != nil {
return "", err
}
return filepath.Join(o.IndexDir,
fmt.Sprintf("%s_v%d.%05d.zoekt", strings.Replace(abs, "/", "_", -1), zoekt.IndexFormatVersion, n)), nil
}
// IndexVersions returns the versions as present in the index, for
// implementing incremental indexing.
func (o *Options) IndexVersions() []zoekt.RepositoryBranch {
fn, err := o.shardName(0)
if err != nil {
return nil
}
f, err := os.Open(fn)
if err != nil {
return nil
}
iFile, err := zoekt.NewIndexFile(f)
if err != nil {
return nil
}
defer iFile.Close()
repo, index, err := zoekt.ReadMetadata(iFile)
if err != nil {
return nil
}
if index.IndexFeatureVersion != zoekt.FeatureVersion {
return nil
}
return repo.Branches
}
// NewBuilder creates a new Builder instance.
func NewBuilder(opt Options) (*Builder, error) {
if opt.RepoDir == "" {
return nil, fmt.Errorf("must set options.RepoDir")
}
b := &Builder{
opts: opt,
throttle: make(chan int, opt.Parallelism),
finishedShards: map[string]string{},
}
if _, err := b.newShardBuilder(); err != nil {
return nil, err
}
return b, nil
}
func (b *Builder) AddFile(name string, content []byte) {
b.Add(zoekt.Document{Name: name, Content: content})
}
func (b *Builder) Add(doc zoekt.Document) error {
if len(doc.Content) > b.opts.SizeMax {
return nil
}
if !zoekt.IsText(doc.Content) {
return nil
}
b.todo = append(b.todo, &doc)
b.size += len(doc.Name) + len(doc.Content)
if b.size > b.opts.ShardMax {
return b.flush()
}
return nil
}
func (b *Builder) Finish() error {
b.flush()
b.building.Wait()
if b.buildError != nil {
for tmp := range b.finishedShards {
os.Remove(tmp)
}
return b.buildError
}
for tmp, final := range b.finishedShards {
if err := os.Rename(tmp, final); err != nil {
b.buildError = err
}
}
if b.nextShardNum > 0 {
b.deleteRemainingShards()
}
return b.buildError
}
func (b *Builder) deleteRemainingShards() {
for {
shard := b.nextShardNum
b.nextShardNum++
name, err := b.opts.shardName(shard)
if err != nil {
break
}
if err := os.Remove(name); os.IsNotExist(err) {
break
}
}
}
func (b *Builder) flush() error {
todo := b.todo
b.todo = nil
b.size = 0
b.errMu.Lock()
defer b.errMu.Unlock()
if b.buildError != nil {
return b.buildError
}
if len(todo) == 0 {
return nil
}
shard := b.nextShardNum
b.nextShardNum++
if b.opts.Parallelism > 1 {
b.building.Add(1)
go func() {
b.throttle <- 1
err := b.buildShard(todo, shard)
<-b.throttle
b.errMu.Lock()
defer b.errMu.Unlock()
if err != nil && b.buildError == nil {
b.buildError = err
}
b.building.Done()
}()
} else {
// No goroutines when we're not parallel. This
// simplifies memory profiling.
b.buildError = b.buildShard(todo, shard)
if b.opts.MemProfile != "" {
// drop memory, and profile.
todo = nil
b.writeMemProfile(b.opts.MemProfile)
}
return b.buildError
}
return nil
}
var profileNumber int
func (b *Builder) writeMemProfile(name string) {
nm := fmt.Sprintf("%s.%d", name, profileNumber)
profileNumber++
f, err := os.Create(nm)
if err != nil {
log.Fatal("could not create memory profile: ", err)
}
runtime.GC() // get up-to-date statistics
if err := pprof.WriteHeapProfile(f); err != nil {
log.Fatal("could not write memory profile: ", err)
}
f.Close()
log.Printf("wrote mem profile %q", nm)
}
func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) error {
if b.opts.CTags == "" && b.opts.CTagsMustSucceed {
return fmt.Errorf("ctags binary not found, but CTagsMustSucceed set.")
}
if b.opts.CTags != "" {
err := ctagsAddSymbols(todo, b.opts.CTags, b.opts.NamespaceSandbox)
if b.opts.CTagsMustSucceed && err != nil {
return err
}
if err != nil {
log.Printf("ignoring %s error: %v", b.opts.CTags, err)
}
}
name, err := b.opts.shardName(nextShardNum)
if err != nil {
return err
}
shardBuilder, err := b.newShardBuilder()
if err != nil {
return err
}
for _, t := range todo {
shardBuilder.Add(*t)
}
if err := b.writeShard(name, shardBuilder); err != nil {
return err
}
return nil
}
func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
desc := b.opts.RepositoryDescription
desc.SubRepoMap = b.opts.SubRepositories
shardBuilder, err := zoekt.NewIndexBuilder(&desc)
if err != nil {
return nil, err
}
return shardBuilder, nil
}
func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) error {
dir := filepath.Dir(fn)
if err := os.MkdirAll(dir, 0700); err != nil {
return err
}
f, err := ioutil.TempFile(dir, filepath.Base(fn))
if err != nil {
return err
}
defer f.Close()
if err := ib.Write(f); err != nil {
return err
}
fi, err := f.Stat()
if err != nil {
return err
}
if err := f.Close(); err != nil {
return err
}
b.finishedShards[f.Name()] = fn
log.Printf("finished %s: %d index bytes (overhead %3.1f)", fn, fi.Size(),
float64(fi.Size())/float64(ib.ContentSize()+1))
return nil
}