shards/shards.go - zoekt - Git at Google

 // Copyright 2016 Google Inc. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package shards

 import (
 	"context"
 	"log"
 	"runtime"
 	"runtime/debug"
 	"sort"
 	"time"

 	"golang.org/x/sync/semaphore"

 	"github.com/google/zoekt"
 	"github.com/google/zoekt/query"
 )

 type shardedSearcher struct {
 	// Limit the number of parallel queries. Since searching is
 	// CPU bound, we can't do better than #CPU queries in
 	// parallel.  If we do so, we just create more memory
 	// pressure.
 	throttle *semaphore.Weighted
 	capacity int64

 	shards map[string]zoekt.Searcher
 }

 func newShardedSearcher(n int64) *shardedSearcher {
 	ss := &shardedSearcher{
 		shards:   make(map[string]zoekt.Searcher),
 		throttle: semaphore.NewWeighted(n),
 		capacity: n,
 	}
 	return ss
 }

 // NewDirectorySearcher returns a searcher instance that loads all
 // shards corresponding to a glob into memory.
 func NewDirectorySearcher(dir string) (zoekt.Searcher, error) {
 	ss := newShardedSearcher(int64(runtime.NumCPU()))
 	_, err := NewDirectoryWatcher(dir, ss)
 	if err != nil {
 		return nil, err
 	}

 	return ss, nil
 }

 func (ss *shardedSearcher) String() string {
 	return "shardedSearcher"
 }

 // Close closes references to open files. It may be called only once.
 func (ss *shardedSearcher) Close() {
 	ss.lock(context.Background())
 	defer ss.unlock()
 	for _, s := range ss.shards {
 		s.Close()
 	}
 }

 func (ss *shardedSearcher) Search(ctx context.Context, pat query.Q, opts *zoekt.SearchOptions) (*zoekt.SearchResult, error) {
 	start := time.Now()
 	type res struct {
 		sr  *zoekt.SearchResult
 		err error
 	}

 	aggregate := &zoekt.SearchResult{
 		RepoURLs:      map[string]string{},
 		LineFragments: map[string]string{},
 	}

 	// This critical section is large, but we don't want to deal with
 	// searches on shards that have just been closed.
 	if err := ss.rlock(ctx); err != nil {
 		return aggregate, err
 	}
 	defer ss.runlock()
 	aggregate.Wait = time.Now().Sub(start)
 	start = time.Now()

 	// TODO - allow for canceling the query.
 	shards := ss.getShards()
 	all := make(chan res, len(shards))

 	var childCtx context.Context
 	var cancel context.CancelFunc
 	if opts.MaxWallTime == 0 {
 		childCtx, cancel = context.WithCancel(ctx)
 	} else {
 		childCtx, cancel = context.WithTimeout(ctx, opts.MaxWallTime)
 	}

 	defer cancel()

 	// For each query, throttle the number of parallel
 	// actions. Since searching is mostly CPU bound, we limit the
 	// number of parallel searches. This reduces the peak working
 	// set, which hopefully stops https://cs.bazel.build from crashing
 	// when looking for the string "com".
 	throttle := make(chan int, 10*runtime.NumCPU())
 	for _, s := range shards {
 		go func(s zoekt.Searcher) {
 			throttle <- 1
 			defer func() {
 				<-throttle
 				if r := recover(); r != nil {
 					log.Printf("crashed shard: %s: %s, %s", s.String(), r, debug.Stack())

 					var r zoekt.SearchResult
 					r.Stats.Crashes = 1
 					all <- res{&r, nil}
 				}
 			}()

 			ms, err := s.Search(childCtx, pat, opts)
 			all <- res{ms, err}
 		}(s)
 	}

 	for range shards {
 		r := <-all
 		if r.err != nil {
 			return nil, r.err
 		}
 		aggregate.Files = append(aggregate.Files, r.sr.Files...)
 		aggregate.Stats.Add(r.sr.Stats)

 		if len(r.sr.Files) > 0 {
 			for k, v := range r.sr.RepoURLs {
 				aggregate.RepoURLs[k] = v
 			}
 			for k, v := range r.sr.LineFragments {
 				aggregate.LineFragments[k] = v
 			}
 		}

 		if cancel != nil && aggregate.Stats.MatchCount > opts.TotalMaxMatchCount {
 			cancel()
 			cancel = nil
 		}
 	}

 	zoekt.SortFilesByScore(aggregate.Files)
 	aggregate.Duration = time.Now().Sub(start)
 	return aggregate, nil
 }

 func (ss *shardedSearcher) List(ctx context.Context, r query.Q) (*zoekt.RepoList, error) {
 	type res struct {
 		rl  *zoekt.RepoList
 		err error
 	}

 	if err := ss.rlock(ctx); err != nil {
 		return nil, err
 	}
 	defer ss.runlock()

 	shards := ss.getShards()
 	shardCount := len(shards)
 	all := make(chan res, shardCount)

 	for _, s := range shards {
 		go func(s zoekt.Searcher) {
 			defer func() {
 				if r := recover(); r != nil {
 					all <- res{
 						&zoekt.RepoList{Crashes: 1}, nil,
 					}
 				}
 			}()
 			ms, err := s.List(ctx, r)
 			all <- res{ms, err}
 		}(s)
 	}

 	crashes := 0
 	uniq := map[string]*zoekt.RepoListEntry{}

 	var names []string
 	for i := 0; i < shardCount; i++ {
 		r := <-all
 		if r.err != nil {
 			return nil, r.err
 		}
 		crashes += r.rl.Crashes
 		for _, r := range r.rl.Repos {
 			prev, ok := uniq[r.Repository.Name]
 			if !ok {
 				cp := *r
 				uniq[r.Repository.Name] = &cp
 				names = append(names, r.Repository.Name)
 			} else {
 				prev.Stats.Add(&r.Stats)
 			}
 		}
 	}
 	sort.Strings(names)

 	aggregate := make([]*zoekt.RepoListEntry, 0, len(names))
 	for _, k := range names {
 		aggregate = append(aggregate, uniq[k])
 	}
 	return &zoekt.RepoList{
 		Repos:   aggregate,
 		Crashes: crashes,
 	}, nil
 }

 func (s *shardedSearcher) rlock(ctx context.Context) error {
 	return s.throttle.Acquire(ctx, 1)
 }

 // getShards returns the currently loaded shards. The shards must be
 // accessed under a rlock call.
 func (s *shardedSearcher) getShards() []zoekt.Searcher {
 	var res []zoekt.Searcher
 	for _, sh := range s.shards {
 		res = append(res, sh)
 	}
 	return res
 }

 func (s *shardedSearcher) runlock() {
 	s.throttle.Release(1)
 }

 func (s *shardedSearcher) lock(ctx context.Context) error {
 	return s.throttle.Acquire(ctx, s.capacity)
 }

 func (s *shardedSearcher) unlock() {
 	s.throttle.Release(s.capacity)
 }

 func (s *shardedSearcher) load(key string) {
 	shard, err := loadShard(key)
 	log.Printf("reloading: %s, err %v ", key, err)
 	if err != nil {
 		return
 	}

 	s.replace(key, shard)
 }

 func (s *shardedSearcher) drop(key string) {
 	s.replace(key, nil)
 }

 func (s *shardedSearcher) replace(key string, shard zoekt.Searcher) {
 	s.lock(context.Background())
 	defer s.unlock()
 	old := s.shards[key]
 	if old != nil {
 		old.Close()
 	}

 	if shard == nil {
 		delete(s.shards, key)
 	} else {
 		s.shards[key] = shard
 	}
 }
	// Copyright 2016 Google Inc. All rights reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package shards

	import (
	"context"
	"log"
	"runtime"
	"runtime/debug"
	"sort"
	"time"

	"golang.org/x/sync/semaphore"

	"github.com/google/zoekt"
	"github.com/google/zoekt/query"
	)

	type shardedSearcher struct {
	// Limit the number of parallel queries. Since searching is
	// CPU bound, we can't do better than #CPU queries in
	// parallel. If we do so, we just create more memory
	// pressure.
	throttle *semaphore.Weighted
	capacity int64

	shards map[string]zoekt.Searcher
	}

	func newShardedSearcher(n int64) *shardedSearcher {
	ss := &shardedSearcher{
	shards: make(map[string]zoekt.Searcher),
	throttle: semaphore.NewWeighted(n),
	capacity: n,
	}
	return ss
	}

	// NewDirectorySearcher returns a searcher instance that loads all
	// shards corresponding to a glob into memory.
	func NewDirectorySearcher(dir string) (zoekt.Searcher, error) {
	ss := newShardedSearcher(int64(runtime.NumCPU()))
	_, err := NewDirectoryWatcher(dir, ss)
	if err != nil {
	return nil, err
	}

	return ss, nil
	}

	func (ss *shardedSearcher) String() string {
	return "shardedSearcher"
	}

	// Close closes references to open files. It may be called only once.
	func (ss *shardedSearcher) Close() {
	ss.lock(context.Background())
	defer ss.unlock()
	for _, s := range ss.shards {
	s.Close()
	}
	}

	func (ss shardedSearcher) Search(ctx context.Context, pat query.Q, opts zoekt.SearchOptions) (*zoekt.SearchResult, error) {
	start := time.Now()
	type res struct {
	sr *zoekt.SearchResult
	err error
	}

	aggregate := &zoekt.SearchResult{
	RepoURLs: map[string]string{},
	LineFragments: map[string]string{},
	}

	// This critical section is large, but we don't want to deal with
	// searches on shards that have just been closed.
	if err := ss.rlock(ctx); err != nil {
	return aggregate, err
	}
	defer ss.runlock()
	aggregate.Wait = time.Now().Sub(start)
	start = time.Now()

	// TODO - allow for canceling the query.
	shards := ss.getShards()
	all := make(chan res, len(shards))

	var childCtx context.Context
	var cancel context.CancelFunc
	if opts.MaxWallTime == 0 {
	childCtx, cancel = context.WithCancel(ctx)
	} else {
	childCtx, cancel = context.WithTimeout(ctx, opts.MaxWallTime)
	}

	defer cancel()

	// For each query, throttle the number of parallel
	// actions. Since searching is mostly CPU bound, we limit the
	// number of parallel searches. This reduces the peak working
	// set, which hopefully stops https://cs.bazel.build from crashing
	// when looking for the string "com".
	throttle := make(chan int, 10*runtime.NumCPU())
	for _, s := range shards {
	go func(s zoekt.Searcher) {
	throttle <- 1
	defer func() {
	<-throttle
	if r := recover(); r != nil {
	log.Printf("crashed shard: %s: %s, %s", s.String(), r, debug.Stack())

	var r zoekt.SearchResult
	r.Stats.Crashes = 1
	all <- res{&r, nil}
	}
	}()

	ms, err := s.Search(childCtx, pat, opts)
	all <- res{ms, err}
	}(s)
	}

	for range shards {
	r := <-all
	if r.err != nil {
	return nil, r.err
	}
	aggregate.Files = append(aggregate.Files, r.sr.Files...)
	aggregate.Stats.Add(r.sr.Stats)

	if len(r.sr.Files) > 0 {
	for k, v := range r.sr.RepoURLs {
	aggregate.RepoURLs[k] = v
	}
	for k, v := range r.sr.LineFragments {
	aggregate.LineFragments[k] = v
	}
	}

	if cancel != nil && aggregate.Stats.MatchCount > opts.TotalMaxMatchCount {
	cancel()
	cancel = nil
	}
	}

	zoekt.SortFilesByScore(aggregate.Files)
	aggregate.Duration = time.Now().Sub(start)
	return aggregate, nil
	}

	func (ss shardedSearcher) List(ctx context.Context, r query.Q) (zoekt.RepoList, error) {
	type res struct {
	rl *zoekt.RepoList
	err error
	}

	if err := ss.rlock(ctx); err != nil {
	return nil, err
	}
	defer ss.runlock()

	shards := ss.getShards()
	shardCount := len(shards)
	all := make(chan res, shardCount)

	for _, s := range shards {
	go func(s zoekt.Searcher) {
	defer func() {
	if r := recover(); r != nil {
	all <- res{
	&zoekt.RepoList{Crashes: 1}, nil,
	}
	}
	}()
	ms, err := s.List(ctx, r)
	all <- res{ms, err}
	}(s)
	}

	crashes := 0
	uniq := map[string]*zoekt.RepoListEntry{}

	var names []string
	for i := 0; i < shardCount; i++ {
	r := <-all
	if r.err != nil {
	return nil, r.err
	}
	crashes += r.rl.Crashes
	for _, r := range r.rl.Repos {
	prev, ok := uniq[r.Repository.Name]
	if !ok {
	cp := *r
	uniq[r.Repository.Name] = &cp
	names = append(names, r.Repository.Name)
	} else {
	prev.Stats.Add(&r.Stats)
	}
	}
	}
	sort.Strings(names)

	aggregate := make([]*zoekt.RepoListEntry, 0, len(names))
	for _, k := range names {
	aggregate = append(aggregate, uniq[k])
	}
	return &zoekt.RepoList{
	Repos: aggregate,
	Crashes: crashes,
	}, nil
	}

	func (s *shardedSearcher) rlock(ctx context.Context) error {
	return s.throttle.Acquire(ctx, 1)
	}

	// getShards returns the currently loaded shards. The shards must be
	// accessed under a rlock call.
	func (s *shardedSearcher) getShards() []zoekt.Searcher {
	var res []zoekt.Searcher
	for _, sh := range s.shards {
	res = append(res, sh)
	}
	return res
	}

	func (s *shardedSearcher) runlock() {
	s.throttle.Release(1)
	}

	func (s *shardedSearcher) lock(ctx context.Context) error {
	return s.throttle.Acquire(ctx, s.capacity)
	}

	func (s *shardedSearcher) unlock() {
	s.throttle.Release(s.capacity)
	}

	func (s *shardedSearcher) load(key string) {
	shard, err := loadShard(key)
	log.Printf("reloading: %s, err %v ", key, err)
	if err != nil {
	return
	}

	s.replace(key, shard)
	}

	func (s *shardedSearcher) drop(key string) {
	s.replace(key, nil)
	}

	func (s *shardedSearcher) replace(key string, shard zoekt.Searcher) {
	s.lock(context.Background())
	defer s.unlock()
	old := s.shards[key]
	if old != nil {
	old.Close()
	}

	if shard == nil {
	delete(s.shards, key)
	} else {
	s.shards[key] = shard
	}
	}