// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zoekt

import (
	"fmt"
	"log"
	"os"
	"path/filepath"
	"runtime"
	"runtime/debug"
	"sort"
	"time"

	"golang.org/x/net/context"

	"github.com/fsnotify/fsnotify"
	"github.com/google/zoekt/query"
)
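
// searchShard wraps a Searcher with the modification time of its
// backing index file, so scan can detect shards that have been
// rewritten on disk.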
type searchShard struct {
	Searcher
	mtime time.Time
}
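
// shardWatcher loads the *.zoekt shards in a directory and keeps the
// loaded set in sync with the files on disk as they are added,
// updated, or removed.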
type shardWatcher struct {
	dir string

	// Limit the number of parallel queries. Since searching is
	// CPU bound, we can't do better than #CPU queries in
	// parallel. If we do so, we just create more memory
	// pressure.
	throttle chan struct{}

	shards map[string]*searchShard
	quit   chan struct{}
}
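
// loadShard opens the shard file fn and returns a Searcher for it,
// together with the file's modification time.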
func loadShard(fn string) (*searchShard, error) {
	f, err := os.Open(fn)
	if err != nil {
		return nil, err
	}

	fi, err := f.Stat()
	if err != nil {
		f.Close()
		return nil, err
	}

	iFile, err := NewIndexFile(f)
	if err != nil {
		f.Close()
		return nil, err
	}
	s, err := NewSearcher(iFile)
	if err != nil {
		iFile.Close()
		return nil, fmt.Errorf("NewSearcher(%s): %v", fn, err)
	}

	return &searchShard{
		mtime:    fi.ModTime(),
		Searcher: s,
	}, nil
}

func (s *shardWatcher) String() string {
	return fmt.Sprintf("shardWatcher(%s)", s.dir)
}
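
// scan reconciles the loaded shards with the *.zoekt files currently
// on disk: new or updated files are (re)loaded, and shards whose
// files have disappeared are closed and dropped.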
func (s *shardWatcher) scan() error {
	fs, err := filepath.Glob(filepath.Join(s.dir, "*.zoekt"))
	if err != nil {
		return err
	}

	if len(fs) == 0 {
		return fmt.Errorf("directory %s is empty", s.dir)
	}

	ts := map[string]time.Time{}
	for _, fn := range fs {
		key := filepath.Base(fn)
		fi, err := os.Lstat(fn)
		if err != nil {
			continue
		}

		ts[key] = fi.ModTime()
	}

	s.lock()
	var toLoad []string
	for k, mtime := range ts {
		if s.shards[k] == nil || s.shards[k].mtime != mtime {
			toLoad = append(toLoad, k)
		}
	}

	var toDrop []string
	// Unload deleted shards.
	for k := range s.shards {
		if _, ok := ts[k]; !ok {
			toDrop = append(toDrop, k)
		}
	}
	s.unlock()

	for _, t := range toDrop {
		log.Printf("unloading: %s", t)
		s.replace(t, nil)
	}

	for _, t := range toLoad {
		shard, err := loadShard(filepath.Join(s.dir, t))
		log.Printf("reloading: %s, err %v", t, err)
		if err != nil {
			continue
		}
		s.replace(t, shard)
	}

	return nil
}
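
// The throttle channel doubles as a reader/writer lock: rlock takes a
// single slot, bounding concurrent queries by the channel's capacity,
// while lock fills every slot and so excludes all readers.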
func (s *shardWatcher) rlock() {
	s.throttle <- struct{}{}
}

// getShards returns the currently loaded shards. The shards must only
// be accessed while rlock is held.
func (s *shardWatcher) getShards() []Searcher {
	var res []Searcher
	for _, sh := range s.shards {
		res = append(res, sh)
	}
	return res
}

func (s *shardWatcher) runlock() {
	<-s.throttle
}
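
// lock acquires every throttle slot, blocking out all readers; it is
// used while the shard map itself is mutated.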
func (s *shardWatcher) lock() {
	n := cap(s.throttle)
	for n > 0 {
		s.throttle <- struct{}{}
		n--
	}
}

func (s *shardWatcher) unlock() {
	n := cap(s.throttle)
	for n > 0 {
		<-s.throttle
		n--
	}
}
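
// replace installs shard under key, closing any previously loaded
// shard for that key; a nil shard removes the entry.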
func (s *shardWatcher) replace(key string, shard *searchShard) {
	s.lock()
	defer s.unlock()
	old := s.shards[key]
	if old != nil {
		old.Close()
	}

	if shard != nil {
		s.shards[key] = shard
	} else {
		delete(s.shards, key)
	}
}
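
// watch starts a goroutine that rescans the directory whenever
// fsnotify reports a change, until Close is called.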
func (s *shardWatcher) watch() error {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		return err
	}
	if err := watcher.Add(s.dir); err != nil {
		return err
	}

	go func() {
		for {
			select {
			case <-watcher.Events:
				s.scan()
			case err := <-watcher.Errors:
				if err != nil {
					log.Println("watcher error:", err)
				}
			case <-s.quit:
				watcher.Close()
				return
			}
		}
	}()
	return nil
}

// NewShardedSearcher returns a searcher instance that loads all
// shards corresponding to a glob into memory.
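//
// A rough usage sketch (the index directory and query string are
// illustrative only; query.Parse is one way to construct the query.Q):
//
//	searcher, err := NewShardedSearcher("/data/index")
//	if err != nil {
//		log.Fatal(err)
//	}
//	q, err := query.Parse("needle")
//	if err != nil {
//		log.Fatal(err)
//	}
//	result, err := searcher.Search(context.Background(), q, &SearchOptions{})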
func NewShardedSearcher(dir string) (Searcher, error) {
	ss := shardWatcher{
		dir:      dir,
		shards:   make(map[string]*searchShard),
		quit:     make(chan struct{}, 1),
		throttle: make(chan struct{}, runtime.NumCPU()),
	}

	if err := ss.scan(); err != nil {
		return nil, err
	}

	if err := ss.watch(); err != nil {
		return nil, err
	}

	return &shardedSearcher{&ss}, nil
}

// Close closes references to open files. It may be called only once.
func (ss *shardWatcher) Close() {
	close(ss.quit)
	ss.lock()
	defer ss.unlock()
	for _, s := range ss.shards {
		s.Close()
	}
}
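
// shardLoader is the subset of shardWatcher that shardedSearcher
// needs: read-locked access to the currently loaded shards.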
type shardLoader interface {
	Close()
	getShards() []Searcher
	rlock()
	runlock()
	String() string
}

type shardedSearcher struct {
	shardLoader
}
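
// Search runs the query on every loaded shard in parallel and merges
// the per-shard results into a single SearchResult.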
func (ss *shardedSearcher) Search(ctx context.Context, pat query.Q, opts *SearchOptions) (*SearchResult, error) {
	start := time.Now()
	type res struct {
		sr  *SearchResult
		err error
	}

	aggregate := SearchResult{
		RepoURLs:      map[string]string{},
		LineFragments: map[string]string{},
	}

	// This critical section is large, but we don't want to deal with
	// searches on shards that have just been closed.
	ss.shardLoader.rlock()
	defer ss.shardLoader.runlock()
	aggregate.Wait = time.Now().Sub(start)
	start = time.Now()

	// TODO - allow for canceling the query.
	shards := ss.getShards()
	all := make(chan res, len(shards))

	var childCtx context.Context
	var cancel context.CancelFunc
	if opts.MaxWallTime == 0 {
		childCtx, cancel = context.WithCancel(ctx)
	} else {
		childCtx, cancel = context.WithTimeout(ctx, opts.MaxWallTime)
	}
	defer cancel()

	// For each query, throttle the number of parallel
	// actions. Since searching is mostly CPU bound, we limit the
	// number of parallel searches. This reduces the peak working
	// set, which hopefully stops https://cs.bazel.build from crashing
	// when looking for the string "com".
	throttle := make(chan int, 10*runtime.NumCPU())
	for _, s := range shards {
		go func(s Searcher) {
			throttle <- 1
			defer func() {
				<-throttle
				if r := recover(); r != nil {
					log.Printf("crashed shard: %s: %v, %s", s.String(), r, debug.Stack())

					var sr SearchResult
					sr.Stats.Crashes = 1
					all <- res{&sr, nil}
				}
			}()

			ms, err := s.Search(childCtx, pat, opts)
			all <- res{ms, err}
		}(s)
	}

	for range shards {
		r := <-all
		if r.err != nil {
			return nil, r.err
		}
		aggregate.Files = append(aggregate.Files, r.sr.Files...)
		aggregate.Stats.Add(r.sr.Stats)

		if len(r.sr.Files) > 0 {
			for k, v := range r.sr.RepoURLs {
				aggregate.RepoURLs[k] = v
			}
			for k, v := range r.sr.LineFragments {
				aggregate.LineFragments[k] = v
			}
		}

		if cancel != nil && aggregate.Stats.MatchCount > opts.TotalMaxMatchCount {
			cancel()
			cancel = nil
		}
	}

	sortFilesByScore(aggregate.Files)
	aggregate.Duration = time.Now().Sub(start)
	return &aggregate, nil
}
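
// List merges the repository lists of all loaded shards, adding up
// the statistics of repositories that appear in more than one shard.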
func (ss *shardedSearcher) List(ctx context.Context, r query.Q) (*RepoList, error) {
	type res struct {
		rl  *RepoList
		err error
	}

	ss.rlock()
	defer ss.runlock()

	shards := ss.getShards()
	shardCount := len(shards)
	all := make(chan res, shardCount)
	for _, s := range shards {
		go func(s Searcher) {
			defer func() {
				if r := recover(); r != nil {
					all <- res{
						&RepoList{Crashes: 1}, nil,
					}
				}
			}()
			ms, err := s.List(ctx, r)
			all <- res{ms, err}
		}(s)
	}

	crashes := 0
	uniq := map[string]*RepoListEntry{}
	var names []string
	for i := 0; i < shardCount; i++ {
		r := <-all
		if r.err != nil {
			return nil, r.err
		}
		crashes += r.rl.Crashes
		for _, r := range r.rl.Repos {
			prev, ok := uniq[r.Repository.Name]
			if !ok {
				cp := *r
				uniq[r.Repository.Name] = &cp
				names = append(names, r.Repository.Name)
			} else {
				prev.Stats.Add(&r.Stats)
			}
		}
	}
	sort.Strings(names)

	aggregate := make([]*RepoListEntry, 0, len(names))
	for _, k := range names {
		aggregate = append(aggregate, uniq[k])
	}
	return &RepoList{
		Repos:   aggregate,
		Crashes: crashes,
	}, nil
}