blob: 9d3cda9b01803477801909bbd56587d13fc80588 [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zoekt
import (
"fmt"
"log"
"regexp"
"sort"
"strings"
"unicode/utf8"
"golang.org/x/net/context"
"github.com/google/zoekt/query"
)
// An expression tree coupled with matches
type matchTree interface {
// provide the next document where we can may find something
// interesting.
nextDoc() uint32
// clears any per-document state of the matchTree, and
// prepares for evaluating the given doc. The argument is
// strictly increasing over time.
prepare(nextDoc uint32)
// returns whether this matches, and if we are sure.
matches(known map[matchTree]bool) (match bool, sure bool)
String() string
}
type bruteForceMatchTree struct {
// mutable
firstDone bool
docID uint32
}
type andMatchTree struct {
children []matchTree
}
type orMatchTree struct {
children []matchTree
}
type notMatchTree struct {
child matchTree
}
type regexpMatchTree struct {
regexp *regexp.Regexp
child matchTree
fileName bool
// mutable
reEvaluated bool
found []*candidateMatch
}
type substrMatchTree struct {
query *query.Substring
cands []*candidateMatch
coversContent bool
caseSensitive bool
fileName bool
// mutable
current []*candidateMatch
contEvaluated bool
}
type branchQueryMatchTree struct {
fileMasks []uint64
mask uint64
// mutable
firstDone bool
docID uint32
}
// all prepare methods
func (t *bruteForceMatchTree) prepare(doc uint32) {
t.docID = doc
t.firstDone = true
}
func (t *andMatchTree) prepare(doc uint32) {
for _, c := range t.children {
c.prepare(doc)
}
}
func (t *regexpMatchTree) prepare(doc uint32) {
t.found = t.found[:0]
t.reEvaluated = false
t.child.prepare(doc)
}
func (t *orMatchTree) prepare(doc uint32) {
for _, c := range t.children {
c.prepare(doc)
}
}
func (t *notMatchTree) prepare(doc uint32) {
t.child.prepare(doc)
}
func (t *substrMatchTree) prepare(nextDoc uint32) {
for len(t.cands) > 0 && t.cands[0].file < nextDoc {
t.cands = t.cands[1:]
}
i := 0
for ; i < len(t.cands) && t.cands[i].file == nextDoc; i++ {
}
t.current = t.cands[:i]
t.cands = t.cands[i:]
t.contEvaluated = false
}
func (t *branchQueryMatchTree) prepare(doc uint32) {
t.firstDone = true
t.docID = doc
}
// nextDoc
func (t *bruteForceMatchTree) nextDoc() uint32 {
if !t.firstDone {
return 0
}
return t.docID + 1
}
func (t *andMatchTree) nextDoc() uint32 {
var max uint32
for _, c := range t.children {
m := c.nextDoc()
if m > max {
max = m
}
}
return max
}
func (t *regexpMatchTree) nextDoc() uint32 {
return t.child.nextDoc()
}
func (t *orMatchTree) nextDoc() uint32 {
min := uint32(maxUInt32)
for _, c := range t.children {
m := c.nextDoc()
if m < min {
min = m
}
}
return min
}
func (t *notMatchTree) nextDoc() uint32 {
return 0
}
func (t *substrMatchTree) nextDoc() uint32 {
if len(t.cands) > 0 {
return t.cands[0].file
}
return maxUInt32
}
func (t *branchQueryMatchTree) nextDoc() uint32 {
var start uint32
if t.firstDone {
start = t.docID + 1
}
for i := start; i < uint32(len(t.fileMasks)); i++ {
if (t.mask & t.fileMasks[i]) != 0 {
return i
}
}
return maxUInt32
}
// all String methods
func (t *bruteForceMatchTree) String() string {
return "all"
}
func (t *andMatchTree) String() string {
return fmt.Sprintf("and%v", t.children)
}
func (t *regexpMatchTree) String() string {
return fmt.Sprintf("re(%s,%s)", t.regexp, t.child)
}
func (t *orMatchTree) String() string {
return fmt.Sprintf("or%v", t.children)
}
func (t *notMatchTree) String() string {
return fmt.Sprintf("not(%v)", t.child)
}
func (t *substrMatchTree) String() string {
f := ""
if t.fileName {
f = "f"
}
return fmt.Sprintf("%ssubstr(%q,%v)", f, t.query.Pattern, t.current)
}
func (t *branchQueryMatchTree) String() string {
return fmt.Sprintf("branch(%x)", t.mask)
}
func collectAtoms(t matchTree, f func(matchTree)) {
switch s := t.(type) {
case *andMatchTree:
for _, ch := range s.children {
collectAtoms(ch, f)
}
case *orMatchTree:
for _, ch := range s.children {
collectAtoms(ch, f)
}
case *notMatchTree:
collectAtoms(s.child, f)
default:
f(t)
}
}
func collectPositiveSubstrings(t matchTree, f func(*substrMatchTree)) {
switch s := t.(type) {
case *andMatchTree:
for _, ch := range s.children {
collectPositiveSubstrings(ch, f)
}
case *orMatchTree:
for _, ch := range s.children {
collectPositiveSubstrings(ch, f)
}
case *regexpMatchTree:
collectPositiveSubstrings(s.child, f)
case *notMatchTree:
case *substrMatchTree:
f(s)
}
}
func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
switch s := t.(type) {
case *andMatchTree:
for _, ch := range s.children {
if known[ch] {
visitMatches(ch, known, f)
}
}
case *orMatchTree:
for _, ch := range s.children {
if known[ch] {
visitMatches(ch, known, f)
}
}
case *notMatchTree:
// don't collect into negative trees.
default:
f(s)
}
}
func visitSubtreeMatches(t matchTree, known map[matchTree]bool, f func(*substrMatchTree)) {
visitMatches(t, known, func(mt matchTree) {
st, ok := mt.(*substrMatchTree)
if ok {
f(st)
}
})
}
func visitRegexMatches(t matchTree, known map[matchTree]bool, f func(*regexpMatchTree)) {
visitMatches(t, known, func(mt matchTree) {
st, ok := mt.(*regexpMatchTree)
if ok {
f(st)
}
})
}
func (p *contentProvider) evalContentMatches(s *substrMatchTree) {
if !s.coversContent {
pruned := s.current[:0]
for _, m := range s.current {
if p.matchContent(m) {
pruned = append(pruned, m)
}
}
s.current = pruned
} else {
// TODO - this side effect is kind of hidden and surprising.
for _, cm := range s.current {
cm.byteOffset = p.findOffset(cm.fileName, cm.runeOffset)
}
}
s.contEvaluated = true
}
func (p *contentProvider) evalRegexpMatches(s *regexpMatchTree) {
idxs := s.regexp.FindAllIndex(p.data(s.fileName), -1)
s.found = make([]*candidateMatch, 0, len(idxs))
for _, idx := range idxs {
s.found = append(s.found, &candidateMatch{
byteOffset: uint32(idx[0]),
byteMatchSz: uint32(idx[1] - idx[0]),
fileName: s.fileName,
})
}
s.reEvaluated = true
}
// all matches() methods.
func (t *bruteForceMatchTree) matches(known map[matchTree]bool) (bool, bool) {
return true, true
}
func (t *andMatchTree) matches(known map[matchTree]bool) (bool, bool) {
sure := true
for _, ch := range t.children {
v, ok := evalMatchTree(known, ch)
if ok && !v {
return false, true
}
if !ok {
sure = false
}
}
return true, sure
}
func (t *orMatchTree) matches(known map[matchTree]bool) (bool, bool) {
matches := false
sure := true
for _, ch := range t.children {
v, ok := evalMatchTree(known, ch)
if ok {
// we could short-circuit, but we want to use
// the other possibilities as a ranking
// signal.
matches = matches || v
} else {
sure = false
}
}
return matches, sure
}
func (t *branchQueryMatchTree) matches(known map[matchTree]bool) (bool, bool) {
return t.fileMasks[t.docID]&t.mask != 0, true
}
func (t *regexpMatchTree) matches(known map[matchTree]bool) (bool, bool) {
v, ok := evalMatchTree(known, t.child)
if ok && !v {
return false, true
}
if !t.reEvaluated {
return false, false
}
return len(t.found) > 0, true
}
func evalMatchTree(known map[matchTree]bool, mt matchTree) (bool, bool) {
if v, ok := known[mt]; ok {
return v, true
}
v, ok := mt.matches(known)
if ok {
known[mt] = v
}
return v, ok
}
func (t *notMatchTree) matches(known map[matchTree]bool) (bool, bool) {
v, ok := evalMatchTree(known, t.child)
return !v, ok
}
func (t *substrMatchTree) matches(known map[matchTree]bool) (bool, bool) {
if len(t.current) == 0 {
return false, true
}
sure := (t.coversContent || t.contEvaluated)
return true, sure
}
func (d *indexData) newMatchTree(q query.Q, stats *Stats) (matchTree, error) {
switch s := q.(type) {
case *query.Regexp:
subQ := query.RegexpToQuery(s.Regexp, ngramSize)
subQ = query.Map(subQ, func(q query.Q) query.Q {
if sub, ok := q.(*query.Substring); ok {
sub.FileName = s.FileName
sub.CaseSensitive = s.CaseSensitive
}
return q
})
subMT, err := d.newMatchTree(subQ, stats)
if err != nil {
return nil, err
}
prefix := ""
if !s.CaseSensitive {
prefix = "(?i)"
}
tr := &regexpMatchTree{
regexp: regexp.MustCompile(prefix + s.Regexp.String()),
child: subMT,
fileName: s.FileName,
}
return tr, nil
case *query.And:
var r []matchTree
for _, ch := range s.Children {
ct, err := d.newMatchTree(ch, stats)
if err != nil {
return nil, err
}
r = append(r, ct)
}
return &andMatchTree{r}, nil
case *query.Or:
var r []matchTree
for _, ch := range s.Children {
ct, err := d.newMatchTree(ch, stats)
if err != nil {
return nil, err
}
r = append(r, ct)
}
return &orMatchTree{r}, nil
case *query.Not:
ct, err := d.newMatchTree(s.Child, stats)
return &notMatchTree{
child: ct,
}, err
case *query.Substring:
return d.newSubstringMatchTree(s, stats)
case *query.Branch:
mask := uint64(0)
if s.Pattern == "HEAD" {
mask = 1
} else {
for nm, m := range d.branchIDs {
if strings.Contains(nm, s.Pattern) {
mask |= uint64(m)
}
}
}
return &branchQueryMatchTree{
mask: mask,
fileMasks: d.fileBranchMasks,
}, nil
case *query.Const:
if s.Value {
return &bruteForceMatchTree{}, nil
} else {
return &substrMatchTree{
query: &query.Substring{Pattern: "FALSE"},
}, nil
}
}
log.Panicf("type %T", q)
return nil, nil
}
func (d *indexData) newSubstringMatchTree(s *query.Substring, stats *Stats) (matchTree, error) {
st := &substrMatchTree{
query: s,
caseSensitive: s.CaseSensitive,
fileName: s.FileName,
}
if utf8.RuneCountInString(s.Pattern) < ngramSize {
if !s.FileName {
return nil, fmt.Errorf("pattern %q less than %d characters", s.Pattern, ngramSize)
}
st.cands = d.bruteForceMatchFilenames(s)
st.coversContent = true
return st, nil
}
result, err := d.iterateNgrams(s)
if err != nil {
return nil, err
}
st.coversContent = result.coversContent
st.cands = result.cands
stats.IndexBytesLoaded += int64(result.bytesRead)
return st, nil
}
func (d *indexData) simplify(in query.Q) query.Q {
eval := query.Map(in, func(q query.Q) query.Q {
if r, ok := q.(*query.Repo); ok {
return &query.Const{strings.Contains(d.repoMetaData.Name, r.Pattern)}
}
return q
})
return query.Simplify(eval)
}
func (o *SearchOptions) SetDefaults() {
if o.ShardMaxMatchCount == 0 {
// We cap the total number of matches, so overly broad
// searches don't crash the machine.
o.ShardMaxMatchCount = 100000
}
if o.TotalMaxMatchCount == 0 {
o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
}
if o.ShardMaxImportantMatch == 0 {
o.ShardMaxImportantMatch = 10
}
if o.TotalMaxImportantMatch == 0 {
o.TotalMaxImportantMatch = 10 * o.ShardMaxImportantMatch
}
}
func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) {
copyOpts := *opts
opts = &copyOpts
opts.SetDefaults()
importantMatchCount := 0
var res SearchResult
if len(d.fileNameIndex) == 0 {
return &res, nil
}
q = d.simplify(q)
if c, ok := q.(*query.Const); ok && !c.Value {
return &res, nil
}
if opts.EstimateDocCount {
res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
return &res, nil
}
q = query.Map(q, query.ExpandFileContent)
mt, err := d.newMatchTree(q, &res.Stats)
if err != nil {
return nil, err
}
totalAtomCount := 0
var substrAtoms, fileAtoms []*substrMatchTree
var regexpAtoms []*regexpMatchTree
collectAtoms(mt, func(t matchTree) {
totalAtomCount++
if st, ok := t.(*substrMatchTree); ok {
res.Stats.NgramMatches += len(st.cands)
if st.fileName {
fileAtoms = append(fileAtoms, st)
} else {
substrAtoms = append(substrAtoms, st)
}
}
if re, ok := t.(*regexpMatchTree); ok {
regexpAtoms = append(regexpAtoms, re)
}
})
cp := contentProvider{
id: d,
stats: &res.Stats,
}
docCount := uint32(len(d.fileBranchMasks))
canceled := false
lastDoc := int(-1)
nextFileMatch:
for {
if !canceled {
select {
case <-ctx.Done():
canceled = true
default:
}
}
nextDoc := mt.nextDoc()
if nextDoc >= docCount {
break
}
if int(nextDoc) <= lastDoc {
log.Panicf("lastDoc %d, nextDoc %d", lastDoc, nextDoc)
}
lastDoc = int(nextDoc)
res.Stats.FilesConsidered++
mt.prepare(nextDoc)
if canceled || res.Stats.MatchCount >= opts.ShardMaxMatchCount ||
importantMatchCount >= opts.ShardMaxImportantMatch {
res.Stats.FilesSkipped++
continue
}
cp.setDocument(nextDoc)
known := make(map[matchTree]bool)
if v, ok := evalMatchTree(known, mt); ok && !v {
continue nextFileMatch
}
// Files are cheap to match. Do them first.
if len(fileAtoms) > 0 {
for _, st := range fileAtoms {
cp.evalContentMatches(st)
}
if v, ok := evalMatchTree(known, mt); ok && !v {
continue nextFileMatch
}
}
for _, st := range substrAtoms {
// TODO - this may evaluate too much.
cp.evalContentMatches(st)
}
if len(regexpAtoms) > 0 {
if v, ok := evalMatchTree(known, mt); ok && !v {
continue nextFileMatch
}
for _, re := range regexpAtoms {
cp.evalRegexpMatches(re)
}
}
if v, ok := evalMatchTree(known, mt); !ok {
log.Panicf("did not decide. Repo %s, doc %d, known %v",
d.repoMetaData.Name, nextDoc, known)
} else if !v {
continue nextFileMatch
}
fileMatch := FileMatch{
Repository: d.repoMetaData.Name,
FileName: string(d.fileName(nextDoc)),
// Maintain ordering of input files. This
// strictly dominates the in-file ordering of
// the matches.
Score: 10 * float64(nextDoc) / float64(len(d.boundaries)),
Checksum: d.getChecksum(nextDoc),
}
if s := d.subRepos[nextDoc]; s > 0 {
if s >= uint32(len(d.subRepoPaths)) {
log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
}
path := d.subRepoPaths[s]
fileMatch.SubRepositoryPath = path
sr := d.repoMetaData.SubRepoMap[path]
fileMatch.SubRepositoryName = sr.Name
if idx := d.branchIndex(nextDoc); idx >= 0 {
fileMatch.Version = sr.Branches[idx].Version
}
} else {
idx := d.branchIndex(nextDoc)
if idx >= 0 {
fileMatch.Version = d.repoMetaData.Branches[idx].Version
}
}
atomMatchCount := 0
visitMatches(mt, known, func(mt matchTree) {
atomMatchCount++
})
fileMatch.Score += float64(atomMatchCount) / float64(totalAtomCount) * scoreFactorAtomMatch
finalCands := gatherMatches(mt, known)
if len(finalCands) == 0 {
nm := d.fileName(nextDoc)
finalCands = append(finalCands,
&candidateMatch{
caseSensitive: false,
fileName: true,
substrBytes: nm,
substrLowered: nm,
file: nextDoc,
runeOffset: 0,
byteOffset: 0,
byteMatchSz: uint32(len(nm)),
})
}
fileMatch.LineMatches = cp.fillMatches(finalCands)
maxFileScore := 0.0
for i := range fileMatch.LineMatches {
if maxFileScore < fileMatch.LineMatches[i].Score {
maxFileScore = fileMatch.LineMatches[i].Score
}
// Order by ordering in file.
fileMatch.LineMatches[i].Score += 1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))
}
fileMatch.Score += maxFileScore
if fileMatch.Score > scoreImportantThreshold {
importantMatchCount++
}
fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
sortMatchesByScore(fileMatch.LineMatches)
if opts.Whole {
fileMatch.Content = cp.data(false)
}
res.Files = append(res.Files, fileMatch)
res.Stats.MatchCount += len(fileMatch.LineMatches)
res.Stats.FileCount++
}
SortFilesByScore(res.Files)
addRepo(&res, &d.repoMetaData)
for _, v := range d.repoMetaData.SubRepoMap {
addRepo(&res, v)
}
return &res, nil
}
func addRepo(res *SearchResult, repo *Repository) {
if res.RepoURLs == nil {
res.RepoURLs = map[string]string{}
}
res.RepoURLs[repo.Name] = repo.FileURLTemplate
if res.LineFragments == nil {
res.LineFragments = map[string]string{}
}
res.LineFragments[repo.Name] = repo.LineFragmentTemplate
}
func extractSubstringQueries(q query.Q) []*query.Substring {
var r []*query.Substring
switch s := q.(type) {
case *query.And:
for _, ch := range s.Children {
r = append(r, extractSubstringQueries(ch)...)
}
case *query.Or:
for _, ch := range s.Children {
r = append(r, extractSubstringQueries(ch)...)
}
case *query.Not:
r = append(r, extractSubstringQueries(s.Child)...)
case *query.Substring:
r = append(r, s)
}
return r
}
type sortByOffsetSlice []*candidateMatch
func (m sortByOffsetSlice) Len() int { return len(m) }
func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m sortByOffsetSlice) Less(i, j int) bool {
return m[i].byteOffset < m[j].byteOffset
}
// Gather matches from this document. This never returns a mixture of
// filename/content matches: if there are content matches, all
// filename matches are trimmed from the result. The matches are
// returned in document order and are non-overlapping.
func gatherMatches(mt matchTree, known map[matchTree]bool) []*candidateMatch {
var cands []*candidateMatch
visitMatches(mt, known, func(mt matchTree) {
if smt, ok := mt.(*substrMatchTree); ok {
cands = append(cands, smt.current...)
}
if rmt, ok := mt.(*regexpMatchTree); ok {
cands = append(cands, rmt.found...)
}
})
foundContentMatch := false
for _, c := range cands {
if !c.fileName {
foundContentMatch = true
break
}
}
res := cands[:0]
for _, c := range cands {
if !foundContentMatch || !c.fileName {
res = append(res, c)
}
}
cands = res
// Merge adjacent candidates. This guarantees that the matches
// are non-overlapping.
sort.Sort((sortByOffsetSlice)(cands))
res = cands[:0]
for i, c := range cands {
if i == 0 {
res = append(res, c)
continue
}
last := res[len(res)-1]
lastEnd := last.byteOffset + last.byteMatchSz
end := c.byteOffset + c.byteMatchSz
if lastEnd >= c.byteOffset {
if end > lastEnd {
last.byteMatchSz = end - last.byteOffset
}
continue
}
res = append(res, c)
}
return res
}
func (d *indexData) branchIndex(docID uint32) int {
mask := d.fileBranchMasks[docID]
idx := 0
for mask != 0 {
if mask&0x1 != 0 {
return idx
}
idx++
mask >>= 1
}
return -1
}
// gatherBranches returns a list of branch names.
func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
foundBranchQuery := false
var branches []string
visitMatches(mt, known, func(mt matchTree) {
bq, ok := mt.(*branchQueryMatchTree)
if ok {
foundBranchQuery = true
branches = append(branches,
d.branchNames[uint(bq.mask)])
}
})
if !foundBranchQuery {
mask := d.fileBranchMasks[docID]
id := uint32(1)
for mask != 0 {
if mask&0x1 != 0 {
branches = append(branches, d.branchNames[uint(id)])
}
id <<= 1
mask >>= 1
}
}
return branches
}
func (d *indexData) List(ctx context.Context, q query.Q) (*RepoList, error) {
q = d.simplify(q)
c, ok := q.(*query.Const)
if !ok {
return nil, fmt.Errorf("List should receive Repo-only query.")
}
l := &RepoList{}
if c.Value {
l.Repos = append(l.Repos, &d.repoListEntry)
}
return l, nil
}