blob: d1fc5f348f2560c18e52dc8c58adfc05b58459c3 [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package build implements a more convenient interface for building
// zoekt indices.
package build
import (
"crypto/sha1"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"net/url"
"os"
"os/exec"
"path/filepath"
"reflect"
"regexp"
"runtime"
"runtime/pprof"
"sort"
"strings"
"sync"
"github.com/google/zoekt"
"github.com/google/zoekt/ctags"
)
// DefaultDir is the ".zoekt" directory under the path named by the HOME
// environment variable, computed once at package initialization. If HOME is
// unset, the result is the relative path ".zoekt".
// NOTE(review): presumably meant as the default for Options.IndexDir —
// nothing in this file assigns it; confirm with callers.
var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
// Branch describes a single branch version.
type Branch struct {
// Name is the name of the branch.
Name string
// Version identifies the state of the branch.
// NOTE(review): presumably a commit hash or similar revision id — confirm.
Version string
}
// Options sets options for the index building. SetDefaults fills in
// defaults for several fields that are left at their zero value.
type Options struct {
// IndexDir is a directory that holds *.zoekt index files.
IndexDir string
// SizeMax is the maximum file size, measured as the length of the
// document content. Builder.Add marks larger documents as skipped unless
// their name matches a LargeFiles pattern. SetDefaults uses 2 << 20 when
// this is zero.
SizeMax int
// Parallelism is the maximum number of shards to index in parallel.
// SetDefaults uses 4 when this is zero; shards are built synchronously
// when the value is 1 or less.
Parallelism int
// ShardMax sets the maximum corpus size for a single shard. The Builder
// flushes once the summed name and content lengths of buffered documents
// exceed it. SetDefaults uses 100 << 20 when this is zero.
ShardMax int
// TrigramMax sets the maximum number of distinct trigrams per document.
// It is passed to zoekt.CheckText. SetDefaults uses 20000 when this is
// zero.
TrigramMax int
// RepositoryDescription holds names and URLs for the repository.
// NewBuilder requires a non-empty Name; SetDefaults derives Name from
// URL (host joined with path) when Name is empty.
RepositoryDescription zoekt.Repository
// SubRepositories is a path => sub repository map.
SubRepositories map[string]*zoekt.Repository
// Path to exuberant ctags binary to run. When empty, SetDefaults looks
// up "universal-ctags" and then "ctags-exuberant" via exec.LookPath.
CTags string
// If set, ctags must succeed.
CTagsMustSucceed bool
// Write memory profiles to this file. A heap profile named
// "<MemProfile>.<n>" is written after each shard, but only on the
// non-parallel build path.
MemProfile string
// LargeFiles is a slice of glob patterns where matching file
// paths should be indexed regardless of their size. The pattern syntax
// can be found here: https://golang.org/pkg/path/filepath/#Match.
LargeFiles []string
}
// HashOptions returns the hex-encoded SHA-1 digest of the options that
// affect the contents of an index: CTags, CTagsMustSucceed, SizeMax and
// LargeFiles. The fields are fed to the hash back to back, in that order,
// so the digest stays comparable with values stored by earlier builds.
func (o *Options) HashOptions() string {
	digest := sha1.New()
	for _, field := range []string{
		o.CTags,
		fmt.Sprintf("%t", o.CTagsMustSucceed),
		fmt.Sprintf("%d", o.SizeMax),
		fmt.Sprintf("%q", o.LargeFiles),
	} {
		digest.Write([]byte(field))
	}
	return fmt.Sprintf("%x", digest.Sum(nil))
}
// largeFilesFlag wraps *Options so that its String and Set methods can
// expose Options.LargeFiles as a repeatable command-line flag (see Flags).
type largeFilesFlag struct{ *Options }
// String renders the configured patterns as they would appear on a command
// line: one "-large_file <pattern>" per pattern, separated by single
// spaces. It returns the empty string when no patterns are set.
func (f largeFilesFlag) String() string {
	// From flag.Value documentation:
	//
	// The flag package may call the String method with a zero-valued receiver,
	// such as a nil pointer.
	if f.Options == nil {
		return ""
	}

	// Build each occurrence separately so that consecutive occurrences are
	// separated by a space instead of running into one another.
	parts := make([]string, 0, len(f.LargeFiles))
	for _, pattern := range f.LargeFiles {
		parts = append(parts, "-large_file "+pattern)
	}
	return strings.Join(parts, " ")
}
// Set appends value to the LargeFiles patterns of the wrapped Options. It
// is called once per occurrence of the flag and always returns nil.
func (f largeFilesFlag) Set(value string) error {
	patterns := append(f.Options.LargeFiles, value)
	f.Options.LargeFiles = patterns
	return nil
}
// Flags registers the build options as flags on fs. Default values are
// computed by running SetDefaults on a copy of o; registering a flag stores
// its default in the corresponding field of o, while fields that have no
// flag are left untouched.
func (o *Options) Flags(fs *flag.FlagSet) {
	defaults := *o
	defaults.SetDefaults()

	intFlags := []struct {
		target *int
		name   string
		value  int
		usage  string
	}{
		{&o.SizeMax, "file_limit", defaults.SizeMax, "maximum file size"},
		{&o.TrigramMax, "max_trigram_count", defaults.TrigramMax, "maximum number of trigrams per document"},
		{&o.ShardMax, "shard_limit", defaults.ShardMax, "maximum corpus size for a shard"},
		{&o.Parallelism, "parallelism", defaults.Parallelism, "maximum number of parallel indexing processes."},
	}
	for _, def := range intFlags {
		fs.IntVar(def.target, def.name, def.value, def.usage)
	}

	fs.StringVar(&o.IndexDir, "index", defaults.IndexDir, "directory for search indices")
	fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", defaults.CTagsMustSucceed, "If set, ctags calls must succeed.")
	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
}
// Builder manages (parallel) creation of uniformly sized shards. The
// builder buffers up documents until it collects enough documents and
// then builds a shard and writes.
type Builder struct {
// opts holds the build options after NewBuilder applied SetDefaults.
opts Options
// throttle is a buffered channel with capacity opts.Parallelism. Shard
// goroutines send on it before building and receive afterwards, which
// bounds the number of concurrent shard builds.
throttle chan int
// nextShardNum is the number that the next shard will receive.
nextShardNum int
// todo buffers the documents added since the last flush.
todo []*zoekt.Document
// size is the summed name and content length of the documents in todo.
size int
// parser is the ctags parser NewBuilder creates when the CTags path
// contains "universal-ctags"; it is left unset otherwise.
parser ctags.Parser
// building tracks the shard goroutines that are still running.
building sync.WaitGroup
// errMu is held while flush and the shard goroutines update buildError
// and finishedShards.
errMu sync.Mutex
// buildError is the error recorded from a failed shard build, or from a
// failed rename in Finish.
buildError error
// temp name => final name for finished shards. We only rename
// them once all shards succeed to avoid Frankenstein corpuses.
finishedShards map[string]string
}
// finishedShard pairs the temporary file a shard was written to (temp) with
// the name it should be renamed to once the whole build succeeds (final).
type finishedShard struct {
temp, final string
}
// SetDefaults fills in reasonable values for options that were left unset:
// the ctags binary (first of "universal-ctags" and "ctags-exuberant" that
// exec.LookPath finds), Parallelism, SizeMax, ShardMax, TrigramMax, and a
// repository name derived from the repository URL.
func (o *Options) SetDefaults() {
	for _, candidate := range []string{"universal-ctags", "ctags-exuberant"} {
		if o.CTags != "" {
			break
		}
		if found, err := exec.LookPath(candidate); err == nil {
			o.CTags = found
		}
	}

	// Only a zero value counts as "unset" for the numeric limits.
	setIfZero := func(field *int, value int) {
		if *field == 0 {
			*field = value
		}
	}
	setIfZero(&o.Parallelism, 4)
	setIfZero(&o.SizeMax, 2<<20)
	setIfZero(&o.ShardMax, 100<<20)
	setIfZero(&o.TrigramMax, 20000)

	desc := &o.RepositoryDescription
	if desc.Name != "" || desc.URL == "" {
		return
	}
	// An unparsable URL simply leaves the name empty.
	if parsed, _ := url.Parse(desc.URL); parsed != nil {
		desc.Name = filepath.Join(parsed.Host, parsed.Path)
	}
}
// hashString returns the SHA-1 digest of s as a lowercase hex string.
func hashString(s string) string {
	digest := sha1.New()
	// Writing to a hash.Hash never returns an error, so the result is
	// deliberately discarded.
	_, _ = io.WriteString(digest, s)
	sum := digest.Sum(nil)
	return fmt.Sprintf("%x", sum)
}
// shardName returns the file name of the n-th index shard inside IndexDir.
// The name starts with the query-escaped repository name; an escaped name
// longer than 200 bytes is cut to 200 bytes and followed by the first 8 hex
// characters of its hash.
func (o *Options) shardName(n int) string {
	const maxPrefixLen = 200

	prefix := url.QueryEscape(o.RepositoryDescription.Name)
	if len(prefix) > maxPrefixLen {
		digest := hashString(prefix)
		prefix = prefix[:maxPrefixLen] + digest[:8]
	}

	base := fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, zoekt.IndexFormatVersion, n)
	return filepath.Join(o.IndexDir, base)
}
// IncrementalSkipIndexing reports whether the first shard on disk was built
// with the current feature version, the same option hash and the same
// branches, in which case indexing can be skipped. Any failure to open or
// read the shard yields false.
func (o *Options) IncrementalSkipIndexing() bool {
	f, err := os.Open(o.shardName(0))
	if err != nil {
		return false
	}

	indexFile, err := zoekt.NewIndexFile(f)
	if err != nil {
		return false
	}
	defer indexFile.Close()

	repo, meta, err := zoekt.ReadMetadata(indexFile)
	switch {
	case err != nil:
		return false
	case meta.IndexFeatureVersion != zoekt.FeatureVersion:
		return false
	case repo.IndexOptions != o.HashOptions():
		return false
	}

	return reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches)
}
// IgnoreSizeMax reports whether name matches one of the LargeFiles glob
// patterns, meaning the file should be indexed regardless of SizeMax.
// Patterns are trimmed of surrounding whitespace before matching.
func (o *Options) IgnoreSizeMax(name string) bool {
	for _, raw := range o.LargeFiles {
		// A malformed pattern makes filepath.Match return an error
		// together with false; it is treated as a plain non-match.
		if matched, _ := filepath.Match(strings.TrimSpace(raw), name); matched {
			return true
		}
	}
	return false
}
// NewBuilder creates a new Builder instance from opts after applying
// SetDefaults. It returns an error when no repository name is available,
// when ctags is required but no binary was found, when a required
// universal-ctags parser cannot be started, or when a trial shard builder
// cannot be created from the repository description.
func NewBuilder(opts Options) (*Builder, error) {
	opts.SetDefaults()
	if opts.RepositoryDescription.Name == "" {
		return nil, fmt.Errorf("builder: must set Name")
	}

	b := new(Builder)
	b.opts = opts
	b.throttle = make(chan int, opts.Parallelism)
	b.finishedShards = make(map[string]string)

	if b.opts.CTagsMustSucceed && b.opts.CTags == "" {
		return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set")
	}

	if strings.Contains(opts.CTags, "universal-ctags") {
		parser, err := ctags.NewParser(opts.CTags)
		if opts.CTagsMustSucceed && err != nil {
			return nil, fmt.Errorf("ctags.NewParser: %v", err)
		}
		// When ctags is optional the parser is stored even if starting
		// it reported an error.
		b.parser = parser
	}

	// Create and discard a shard builder so that an invalid repository
	// description is reported here.
	if _, err := b.newShardBuilder(); err != nil {
		return nil, err
	}
	return b, nil
}
// AddFile wraps name and content in a zoekt.Document and passes it to Add.
func (b *Builder) AddFile(name string, content []byte) error {
	doc := zoekt.Document{Name: name, Content: content}
	return b.Add(doc)
}
// Add buffers doc for the next shard. Documents larger than SizeMax (unless
// matched by a LargeFiles pattern) and documents rejected by zoekt.CheckText
// are still buffered, but carry a SkipReason. When the buffered size exceeds
// ShardMax the buffer is flushed and the result of that flush is returned.
func (b *Builder) Add(doc zoekt.Document) error {
	// The shard builder could make this decision itself, but a stretch of
	// the source tree full of binary/large files would then produce a
	// mostly empty shard, so the skip reason is recorded here as well.
	contentLen := len(doc.Content)
	oversized := contentLen > b.opts.SizeMax && !b.opts.IgnoreSizeMax(doc.Name)
	if oversized {
		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", contentLen, b.opts.SizeMax)
	} else if err := zoekt.CheckText(doc.Content, b.opts.TrigramMax); err != nil {
		doc.SkipReason = err.Error()
		doc.Language = "binary"
	}

	b.todo = append(b.todo, &doc)
	b.size += len(doc.Name) + len(doc.Content)
	if b.size <= b.opts.ShardMax {
		return nil
	}
	return b.flush()
}
// Finish builds a last shard from the buffered documents, waits for all
// shard builds to complete, and then either discards the temporary shard
// files (if any build failed) or renames them to their final names and
// removes stale shards left over from previous runs. It should always be
// called, also in failure cases, to ensure cleanup.
func (b *Builder) Finish() error {
	// The result of flush is not inspected here: every error it returns is
	// also stored in b.buildError, which is checked below.
	b.flush()
	b.building.Wait()

	// All shard goroutines are done, so the map can be taken over.
	shards := b.finishedShards
	b.finishedShards = map[string]string{}

	if b.buildError != nil {
		for tmp := range shards {
			os.Remove(tmp)
		}
		return b.buildError
	}

	for tmp, final := range shards {
		if err := os.Rename(tmp, final); err != nil {
			b.buildError = err
		}
	}

	if b.nextShardNum > 0 {
		b.deleteRemainingShards()
	}
	return b.buildError
}
// deleteRemainingShards removes shard files numbered nextShardNum and up,
// advancing nextShardNum as it goes, and stops at the first shard number
// whose file does not exist.
func (b *Builder) deleteRemainingShards() {
	for {
		name := b.opts.shardName(b.nextShardNum)
		b.nextShardNum++
		if err := os.Remove(name); os.IsNotExist(err) {
			return
		}
	}
}
// flush turns the buffered documents into a new shard and resets the
// buffer. With Parallelism > 1 the shard is built in a goroutine and flush
// returns nil right away; otherwise the shard is built synchronously and the
// build error, if any, is returned. Once a build error has been recorded,
// flush drops the buffered documents and returns that error.
func (b *Builder) flush() error {
// Take ownership of the buffered documents and reset the buffer.
todo := b.todo
b.todo = nil
b.size = 0
b.errMu.Lock()
defer b.errMu.Unlock()
if b.buildError != nil {
return b.buildError
}
// An empty buffer is only skipped once at least one shard number has been
// handed out, so the first flush builds a shard even without documents.
hasShard := b.nextShardNum > 0
if len(todo) == 0 && hasShard {
return nil
}
shard := b.nextShardNum
b.nextShardNum++
if b.opts.Parallelism > 1 {
b.building.Add(1)
go func() {
// The throttle channel has capacity Parallelism, so this send blocks
// while that many shard builds are already running.
b.throttle <- 1
done, err := b.buildShard(todo, shard)
<-b.throttle
b.errMu.Lock()
defer b.errMu.Unlock()
// Keep only the first error reported by any shard build.
if err != nil && b.buildError == nil {
b.buildError = err
}
if err == nil {
b.finishedShards[done.temp] = done.final
}
b.building.Done()
}()
} else {
// No goroutines when we're not parallel. This
// simplifies memory profiling.
done, err := b.buildShard(todo, shard)
b.buildError = err
if err == nil {
b.finishedShards[done.temp] = done.final
}
if b.opts.MemProfile != "" {
// drop memory, and profile.
todo = nil
b.writeMemProfile(b.opts.MemProfile)
}
return b.buildError
}
return nil
}
// profileNumber is the suffix of the next memory profile file; it is
// incremented by writeMemProfile on every call.
var profileNumber int
// writeMemProfile writes a heap profile to "<name>.<profileNumber>" and
// increments profileNumber. Failure to create or write the file terminates
// the process via log.Fatal.
func (b *Builder) writeMemProfile(name string) {
	path := fmt.Sprintf("%s.%d", name, profileNumber)
	profileNumber++

	out, err := os.Create(path)
	if err != nil {
		log.Fatal("could not create memory profile: ", err)
	}

	// Collect garbage first so the profile reflects up-to-date statistics.
	runtime.GC()
	if err = pprof.WriteHeapProfile(out); err != nil {
		log.Fatal("could not write memory profile: ", err)
	}
	out.Close()

	log.Printf("wrote mem profile %q", path)
}
// squashRange maps [0,inf) to [0,1) monotonically, computing j/(1+j) in
// floating point.
func squashRange(j int) float64 {
	value := float64(j)
	denominator := 1 + value
	return value / denominator
}
// testRe matches any string that contains "test"; rank applies it to
// document names to place test files after other documents.
var testRe = regexp.MustCompile("test")
// rankedDoc pairs a document with the rank vector computed by rank, so that
// sortDocuments can order documents without recomputing ranks.
type rankedDoc struct {
*zoekt.Document
// rank is compared element by element; smaller values sort earlier.
rank []float64
}
// rank computes the sort key for d as a vector of floats that is compared
// element by element, where smaller means earlier (= better). origIdx is the
// document's position before sorting and serves as the final tie-breaker.
func rank(d *zoekt.Document, origIdx int) []float64 {
	// Documents whose name looks like a test go last.
	var testPenalty float64
	if testRe.MatchString(d.Name) {
		testPenalty = 1.0
	}

	// More symbols is better.
	fewSymbols := 1.0 - squashRange(len(d.Symbols))
	// Shorter content is better.
	longContent := squashRange(len(d.Content))
	// Shorter names are better.
	longName := squashRange(len(d.Name))
	// Being present in more branches is better.
	fewBranches := 1.0 - squashRange(len(d.Branches))
	// Otherwise keep the original order.
	position := squashRange(origIdx)

	return []float64{testPenalty, fewSymbols, longContent, longName, fewBranches, position}
}
// sortDocuments reorders todo in place by ascending rank vector (see rank),
// comparing the vectors element by element.
func sortDocuments(todo []*zoekt.Document) {
	ranked := make([]rankedDoc, len(todo))
	for idx, doc := range todo {
		ranked[idx] = rankedDoc{doc, rank(doc, idx)}
	}

	sort.Slice(ranked, func(a, b int) bool {
		left, right := ranked[a].rank, ranked[b].rank
		for k := range left {
			switch {
			case left[k] < right[k]:
				return true
			case left[k] > right[k]:
				return false
			}
		}
		return false
	})

	for idx, rd := range ranked {
		todo[idx] = rd.Document
	}
}
// buildShard builds shard number nextShardNum from todo: it adds ctags
// symbols when a ctags binary is configured, sorts the documents by rank,
// feeds them to a fresh shard builder and writes the result to a temporary
// file. A ctags failure is fatal only when CTagsMustSucceed is set;
// otherwise it is logged and ignored.
func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
	if b.opts.CTags != "" {
		if err := ctagsAddSymbols(todo, b.parser, b.opts.CTags); err != nil {
			if b.opts.CTagsMustSucceed {
				return nil, err
			}
			log.Printf("ignoring %s error: %v", b.opts.CTags, err)
		}
	}

	target := b.opts.shardName(nextShardNum)

	ib, err := b.newShardBuilder()
	if err != nil {
		return nil, err
	}

	sortDocuments(todo)
	for _, doc := range todo {
		if err := ib.Add(*doc); err != nil {
			return nil, err
		}
	}

	return b.writeShard(target, ib)
}
// newShardBuilder creates a zoekt.IndexBuilder for a copy of the repository
// description, with the sub-repository map and the option hash filled in.
func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
	repo := b.opts.RepositoryDescription
	repo.SubRepoMap = b.opts.SubRepositories
	repo.IndexOptions = b.opts.HashOptions()

	ib, err := zoekt.NewIndexBuilder(&repo)
	if err != nil {
		return nil, err
	}
	return ib, nil
}
// writeShard serializes ib into a temporary file next to fn, creating the
// directory if needed, and returns the temporary and final names so the
// caller can rename the file once all shards have been built.
//
// On any failure after the temporary file has been created, the file is
// closed and removed, so that neither a descriptor nor a partial shard is
// left behind.
func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
	dir := filepath.Dir(fn)
	if err := os.MkdirAll(dir, 0700); err != nil {
		return nil, err
	}

	f, err := ioutil.TempFile(dir, filepath.Base(fn)+".*.tmp")
	if err != nil {
		return nil, err
	}

	// Registered before the first fallible step so that every error path,
	// including a failing Chmod, releases the descriptor and the file.
	written := false
	defer func() {
		if !written {
			f.Close() // a second Close after a failed Close is harmless
			os.Remove(f.Name())
		}
	}()

	if runtime.GOOS != "windows" {
		// TempFile creates the file with mode 0600; apply the usual
		// 0666 restricted by the process umask instead.
		if err := f.Chmod(0666 &^ umask); err != nil {
			return nil, err
		}
	}

	if err := ib.Write(f); err != nil {
		return nil, err
	}
	fi, err := f.Stat()
	if err != nil {
		return nil, err
	}
	if err := f.Close(); err != nil {
		return nil, err
	}
	written = true

	log.Printf("finished %s: %d index bytes (overhead %3.1f)", fn, fi.Size(),
		float64(fi.Size())/float64(ib.ContentSize()+1))

	return &finishedShard{f.Name(), fn}, nil
}
// umask holds the Umask of the current process. writeShard clears these
// bits from 0666 when setting the mode of new shard files.
// NOTE(review): nothing in this file assigns it — presumably initialized in
// a platform-specific file; confirm.
var umask os.FileMode