| // Copyright 2016 Google Inc. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package zoekt |
| |
| import ( |
| "context" |
| "fmt" |
| "log" |
| "regexp/syntax" |
| "sort" |
| "strings" |
| |
| "golang.org/x/net/trace" |
| |
| "github.com/google/zoekt/query" |
| ) |
| |
| const maxUInt16 = 0xffff |
| |
| // DebugScore controls whether we collect data on match scores are |
| // constructed. Intended for use in tests. |
| var DebugScore = false |
| |
| func (m *FileMatch) addScore(what string, s float64) { |
| if DebugScore { |
| m.Debug += fmt.Sprintf("%s:%f, ", what, s) |
| } |
| m.Score += s |
| } |
| |
| func (d *indexData) simplify(in query.Q) query.Q { |
| eval := query.Map(in, func(q query.Q) query.Q { |
| if r, ok := q.(*query.Repo); ok { |
| return &query.Const{Value: strings.Contains(d.repoMetaData.Name, r.Pattern)} |
| } |
| if l, ok := q.(*query.Language); ok { |
| _, has := d.metaData.LanguageMap[l.Language] |
| if !has { |
| return &query.Const{Value: false} |
| } |
| } |
| return q |
| }) |
| return query.Simplify(eval) |
| } |
| |
| func (o *SearchOptions) SetDefaults() { |
| if o.ShardMaxMatchCount == 0 { |
| // We cap the total number of matches, so overly broad |
| // searches don't crash the machine. |
| o.ShardMaxMatchCount = 100000 |
| } |
| if o.TotalMaxMatchCount == 0 { |
| o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount |
| } |
| if o.ShardMaxImportantMatch == 0 { |
| o.ShardMaxImportantMatch = 10 |
| } |
| if o.TotalMaxImportantMatch == 0 { |
| o.TotalMaxImportantMatch = 10 * o.ShardMaxImportantMatch |
| } |
| } |
| |
| func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) { |
| copyOpts := *opts |
| opts = ©Opts |
| opts.SetDefaults() |
| importantMatchCount := 0 |
| |
| var res SearchResult |
| if len(d.fileNameIndex) == 0 { |
| return &res, nil |
| } |
| |
| select { |
| case <-ctx.Done(): |
| res.Stats.ShardsSkipped++ |
| return &res, nil |
| default: |
| } |
| |
| tr := trace.New("indexData.Search", d.file.Name()) |
| tr.LazyPrintf("opts: %+v", opts) |
| defer func() { |
| if sr != nil { |
| tr.LazyPrintf("num files: %d", len(sr.Files)) |
| tr.LazyPrintf("stats: %+v", sr.Stats) |
| } |
| if err != nil { |
| tr.LazyPrintf("error: %v", err) |
| tr.SetError() |
| } |
| tr.Finish() |
| }() |
| |
| q = d.simplify(q) |
| tr.LazyLog(q, true) |
| if c, ok := q.(*query.Const); ok && !c.Value { |
| return &res, nil |
| } |
| |
| if opts.EstimateDocCount { |
| res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) |
| return &res, nil |
| } |
| |
| q = query.Map(q, query.ExpandFileContent) |
| |
| mt, err := d.newMatchTree(q) |
| if err != nil { |
| return nil, err |
| } |
| |
| totalAtomCount := 0 |
| visitMatchTree(mt, func(t matchTree) { |
| totalAtomCount++ |
| }) |
| |
| cp := &contentProvider{ |
| id: d, |
| stats: &res.Stats, |
| } |
| |
| docCount := uint32(len(d.fileBranchMasks)) |
| lastDoc := int(-1) |
| |
| nextFileMatch: |
| for { |
| canceled := false |
| select { |
| case <-ctx.Done(): |
| canceled = true |
| default: |
| } |
| |
| nextDoc := mt.nextDoc() |
| if int(nextDoc) <= lastDoc { |
| nextDoc = uint32(lastDoc + 1) |
| } |
| if nextDoc >= docCount { |
| break |
| } |
| lastDoc = int(nextDoc) |
| |
| if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) || |
| (opts.ShardMaxImportantMatch > 0 && importantMatchCount >= opts.ShardMaxImportantMatch) { |
| res.Stats.FilesSkipped += d.repoListEntry.Stats.Documents - lastDoc |
| break |
| } |
| |
| res.Stats.FilesConsidered++ |
| mt.prepare(nextDoc) |
| |
| cp.setDocument(nextDoc) |
| |
| known := make(map[matchTree]bool) |
| for cost := costMin; cost <= costMax; cost++ { |
| v, ok := mt.matches(cp, cost, known) |
| if ok && !v { |
| continue nextFileMatch |
| } |
| |
| if cost == costMax && !ok { |
| log.Panicf("did not decide. Repo %s, doc %d, known %v", |
| d.repoMetaData.Name, nextDoc, known) |
| } |
| } |
| |
| fileMatch := FileMatch{ |
| Repository: d.repoMetaData.Name, |
| FileName: string(d.fileName(nextDoc)), |
| Checksum: d.getChecksum(nextDoc), |
| Language: d.languageMap[d.languages[nextDoc]], |
| } |
| |
| if s := d.subRepos[nextDoc]; s > 0 { |
| if s >= uint32(len(d.subRepoPaths)) { |
| log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) |
| } |
| path := d.subRepoPaths[s] |
| fileMatch.SubRepositoryPath = path |
| sr := d.repoMetaData.SubRepoMap[path] |
| fileMatch.SubRepositoryName = sr.Name |
| if idx := d.branchIndex(nextDoc); idx >= 0 { |
| fileMatch.Version = sr.Branches[idx].Version |
| } |
| } else { |
| idx := d.branchIndex(nextDoc) |
| if idx >= 0 { |
| fileMatch.Version = d.repoMetaData.Branches[idx].Version |
| } |
| } |
| |
| atomMatchCount := 0 |
| visitMatches(mt, known, func(mt matchTree) { |
| atomMatchCount++ |
| }) |
| finalCands := gatherMatches(mt, known) |
| |
| if len(finalCands) == 0 { |
| nm := d.fileName(nextDoc) |
| finalCands = append(finalCands, |
| &candidateMatch{ |
| caseSensitive: false, |
| fileName: true, |
| substrBytes: nm, |
| substrLowered: nm, |
| file: nextDoc, |
| runeOffset: 0, |
| byteOffset: 0, |
| byteMatchSz: uint32(len(nm)), |
| }) |
| } |
| fileMatch.LineMatches = cp.fillMatches(finalCands) |
| |
| maxFileScore := 0.0 |
| for i := range fileMatch.LineMatches { |
| if maxFileScore < fileMatch.LineMatches[i].Score { |
| maxFileScore = fileMatch.LineMatches[i].Score |
| } |
| |
| // Order by ordering in file. |
| fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) |
| } |
| |
| // Maintain ordering of input files. This |
| // strictly dominates the in-file ordering of |
| // the matches. |
| fileMatch.addScore("fragment", maxFileScore) |
| fileMatch.addScore("atom", float64(atomMatchCount)/float64(totalAtomCount)*scoreFactorAtomMatch) |
| |
| // Prefer earlier docs. |
| fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(nextDoc)/float64(len(d.boundaries)))) |
| fileMatch.addScore("shard-order", scoreShardRankFactor*float64(d.repoMetaData.Rank)/maxUInt16) |
| |
| if fileMatch.Score > scoreImportantThreshold { |
| importantMatchCount++ |
| } |
| fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) |
| sortMatchesByScore(fileMatch.LineMatches) |
| if opts.Whole { |
| fileMatch.Content = cp.data(false) |
| } |
| |
| res.Files = append(res.Files, fileMatch) |
| res.Stats.MatchCount += len(fileMatch.LineMatches) |
| res.Stats.FileCount++ |
| } |
| SortFilesByScore(res.Files) |
| |
| addRepo(&res, &d.repoMetaData) |
| for _, v := range d.repoMetaData.SubRepoMap { |
| addRepo(&res, v) |
| } |
| |
| visitMatchTree(mt, func(mt matchTree) { |
| if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { |
| atom.updateStats(&res.Stats) |
| } |
| }) |
| return &res, nil |
| } |
| |
| func addRepo(res *SearchResult, repo *Repository) { |
| if res.RepoURLs == nil { |
| res.RepoURLs = map[string]string{} |
| } |
| res.RepoURLs[repo.Name] = repo.FileURLTemplate |
| |
| if res.LineFragments == nil { |
| res.LineFragments = map[string]string{} |
| } |
| res.LineFragments[repo.Name] = repo.LineFragmentTemplate |
| } |
| |
| type sortByOffsetSlice []*candidateMatch |
| |
| func (m sortByOffsetSlice) Len() int { return len(m) } |
| func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } |
| func (m sortByOffsetSlice) Less(i, j int) bool { |
| return m[i].byteOffset < m[j].byteOffset |
| } |
| |
| // Gather matches from this document. This never returns a mixture of |
| // filename/content matches: if there are content matches, all |
| // filename matches are trimmed from the result. The matches are |
| // returned in document order and are non-overlapping. |
| func gatherMatches(mt matchTree, known map[matchTree]bool) []*candidateMatch { |
| var cands []*candidateMatch |
| visitMatches(mt, known, func(mt matchTree) { |
| if smt, ok := mt.(*substrMatchTree); ok { |
| cands = append(cands, smt.current...) |
| } |
| if rmt, ok := mt.(*regexpMatchTree); ok { |
| cands = append(cands, rmt.found...) |
| } |
| }) |
| |
| foundContentMatch := false |
| for _, c := range cands { |
| if !c.fileName { |
| foundContentMatch = true |
| break |
| } |
| } |
| |
| res := cands[:0] |
| for _, c := range cands { |
| if !foundContentMatch || !c.fileName { |
| res = append(res, c) |
| } |
| } |
| cands = res |
| |
| // Merge adjacent candidates. This guarantees that the matches |
| // are non-overlapping. |
| sort.Sort((sortByOffsetSlice)(cands)) |
| res = cands[:0] |
| for i, c := range cands { |
| if i == 0 { |
| res = append(res, c) |
| continue |
| } |
| last := res[len(res)-1] |
| lastEnd := last.byteOffset + last.byteMatchSz |
| end := c.byteOffset + c.byteMatchSz |
| if lastEnd >= c.byteOffset { |
| if end > lastEnd { |
| last.byteMatchSz = end - last.byteOffset |
| } |
| continue |
| } |
| |
| res = append(res, c) |
| } |
| |
| return res |
| } |
| |
| func (d *indexData) branchIndex(docID uint32) int { |
| mask := d.fileBranchMasks[docID] |
| idx := 0 |
| for mask != 0 { |
| if mask&0x1 != 0 { |
| return idx |
| } |
| idx++ |
| mask >>= 1 |
| } |
| return -1 |
| } |
| |
| // gatherBranches returns a list of branch names. |
| func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { |
| foundBranchQuery := false |
| var branches []string |
| |
| visitMatches(mt, known, func(mt matchTree) { |
| bq, ok := mt.(*branchQueryMatchTree) |
| if ok { |
| foundBranchQuery = true |
| branches = append(branches, |
| d.branchNames[uint(bq.mask)]) |
| } |
| }) |
| |
| if !foundBranchQuery { |
| mask := d.fileBranchMasks[docID] |
| id := uint32(1) |
| for mask != 0 { |
| if mask&0x1 != 0 { |
| branches = append(branches, d.branchNames[uint(id)]) |
| } |
| id <<= 1 |
| mask >>= 1 |
| } |
| } |
| return branches |
| } |
| |
| func (d *indexData) List(ctx context.Context, q query.Q) (rl *RepoList, err error) { |
| tr := trace.New("indexData.List", d.file.Name()) |
| defer func() { |
| if rl != nil { |
| tr.LazyPrintf("repos size: %d", len(rl.Repos)) |
| tr.LazyPrintf("crashes: %d", rl.Crashes) |
| } |
| if err != nil { |
| tr.LazyPrintf("error: %v", err) |
| tr.SetError() |
| } |
| tr.Finish() |
| }() |
| |
| q = d.simplify(q) |
| tr.LazyLog(q, true) |
| c, ok := q.(*query.Const) |
| |
| if !ok { |
| return nil, fmt.Errorf("List should receive Repo-only query") |
| } |
| |
| l := &RepoList{} |
| if c.Value { |
| l.Repos = append(l.Repos, &d.repoListEntry) |
| } |
| return l, nil |
| } |
| |
| // regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If |
| // mt is equivalent to the input r, isEqual = true and the matchTree can be used |
| // in place of the regex r. If singleLine = true, then the matchTree and all |
| // its children only match terms on the same line. singleLine is used during |
| // recursion to decide whether to return an andLineMatchTree (singleLine = true) |
| // or a andMatchTree (singleLine = false). |
| func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { |
| // TODO - we could perhaps transform Begin/EndText in '\n'? |
| // TODO - we could perhaps transform CharClass in (OrQuery ) |
| // if there are just a few runes, and part of a OpConcat? |
| switch r.Op { |
| case syntax.OpLiteral: |
| s := string(r.Rune) |
| if len(s) >= minTextSize { |
| mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive}) |
| return mt, true, !strings.Contains(s, "\n"), err |
| } |
| case syntax.OpCapture: |
| return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) |
| |
| case syntax.OpPlus: |
| return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) |
| |
| case syntax.OpRepeat: |
| if r.Min >= 1 { |
| return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) |
| } |
| |
| case syntax.OpConcat, syntax.OpAlternate: |
| var qs []matchTree |
| isEq := true |
| singleLine = true |
| for _, sr := range r.Sub { |
| if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { |
| if err != nil { |
| return nil, false, false, err |
| } |
| isEq = isEq && subIsEq |
| singleLine = singleLine && subSingleLine |
| qs = append(qs, sq) |
| } |
| } |
| if r.Op == syntax.OpConcat { |
| if len(qs) > 1 { |
| isEq = false |
| } |
| newQs := make([]matchTree, 0, len(qs)) |
| for _, q := range qs { |
| if _, ok := q.(*bruteForceMatchTree); ok { |
| continue |
| } |
| newQs = append(newQs, q) |
| } |
| if len(newQs) == 1 { |
| return newQs[0], isEq, singleLine, nil |
| } |
| if len(newQs) == 0 { |
| return &bruteForceMatchTree{}, isEq, singleLine, nil |
| } |
| if singleLine { |
| return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil |
| } |
| return &andMatchTree{newQs}, isEq, singleLine, nil |
| } |
| for _, q := range qs { |
| if _, ok := q.(*bruteForceMatchTree); ok { |
| return q, isEq, false, nil |
| } |
| } |
| if len(qs) == 0 { |
| return &noMatchTree{"const"}, isEq, false, nil |
| } |
| return &orMatchTree{qs}, isEq, false, nil |
| case syntax.OpStar: |
| if r.Sub[0].Op == syntax.OpAnyCharNotNL { |
| return &bruteForceMatchTree{}, false, true, nil |
| } |
| } |
| return &bruteForceMatchTree{}, false, false, nil |
| } |