Delete gerrit/zoekt. The source-code is hosted on https://github.com/sourcegraph/zoekt Change-Id: Icb6e8dd6d9479fe8f78b6868e90f28e508bd2e2b
diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 7cd8594..0000000 --- a/.gitignore +++ /dev/null
@@ -1,6 +0,0 @@ -*~ -cmd/zoekt-index/zoekt-index -cmd/zoekt-webserver/zoekt-webserver -cmd/zoekt-mirror-github/zoekt-mirror-github -cmd/zoekt-server/zoekt-server -cmd/zoekt-git-index/zoekt-git-index
diff --git a/CONTRIBUTING b/CONTRIBUTING deleted file mode 100644 index 61b75c4..0000000 --- a/CONTRIBUTING +++ /dev/null
@@ -1,4 +0,0 @@ -**NOTICE:** -[github.com/sourcegraph/zoekt](https://github.com/sourcegraph/zoekt) is the -active main repository for Zoekt development. Please contribute pull requests -to that repository.
diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/LICENSE +++ /dev/null
@@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. 
For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.
diff --git a/README.md b/README.md index 6050602..3ab4d09 100644 --- a/README.md +++ b/README.md
@@ -1,8 +1,3 @@ - - "Zoekt, en gij zult spinazie eten" - Jan Eertink - - ("seek, and ye shall eat spinach" - My primary school teacher) - This is a fast text search engine, intended for use with source code. (Pronunciation: roughly as you would pronounce "zooked" in English)
diff --git a/all.bash b/all.bash deleted file mode 100755 index 4b5ae45..0000000 --- a/all.bash +++ /dev/null
@@ -1,4 +0,0 @@ -#!/bin/sh -set -eux -go test github.com/google/zoekt/... -go install github.com/google/zoekt/cmd/...
diff --git a/api.go b/api.go deleted file mode 100644 index 2bd8a88..0000000 --- a/api.go +++ /dev/null
@@ -1,312 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "context" - "fmt" - "time" - - "github.com/google/zoekt/query" -) - -// FileMatch contains all the matches within a file. -type FileMatch struct { - // Ranking; the higher, the better. - Score float64 // TODO - hide this field? - - // For debugging. Needs DebugScore set, but public so tests in - // other packages can print some diagnostics. - Debug string - - FileName string - - // Repository is the globally unique name of the repo of the - // match - Repository string - Branches []string - LineMatches []LineMatch - - // Only set if requested - Content []byte - - // Checksum of the content. - Checksum []byte - - // Detected language of the result. - Language string - - // SubRepositoryName is the globally unique name of the repo, - // if it came from a subrepository - SubRepositoryName string - - // SubRepositoryPath holds the prefix where the subrepository - // was mounted. - SubRepositoryPath string - - // Commit SHA1 (hex) of the (sub)repo holding the file. - Version string -} - -// LineMatch holds the matches within a single line in a file. -type LineMatch struct { - // The line in which a match was found. - Line []byte - LineStart int - LineEnd int - LineNumber int - - // If set, this was a match on the filename. - FileName bool - - // The higher the better. 
Only ranks the quality of the match - // within the file, does not take rank of file into account - Score float64 - LineFragments []LineFragmentMatch -} - -// LineFragmentMatch a segment of matching text within a line. -type LineFragmentMatch struct { - // Offset within the line, in bytes. - LineOffset int - - // Offset from file start, in bytes. - Offset uint32 - - // Number bytes that match. - MatchLength int -} - -// Stats contains interesting numbers on the search -type Stats struct { - // Amount of I/O for reading contents. - ContentBytesLoaded int64 - - // Amount of I/O for reading from index. - IndexBytesLoaded int64 - - // Number of search shards that had a crash. - Crashes int - - // Wall clock time for this search - Duration time.Duration - - // Number of files containing a match. - FileCount int - - // Number of files in shards that we considered. - ShardFilesConsidered int - - // Files that we evaluated. Equivalent to files for which all - // atom matches (including negations) evaluated to true. - FilesConsidered int - - // Files for which we loaded file content to verify substring matches - FilesLoaded int - - // Candidate files whose contents weren't examined because we - // gathered enough matches. - FilesSkipped int - - // Shards that we did not process because a query was canceled. - ShardsSkipped int - - // Number of non-overlapping matches - MatchCount int - - // Number of candidate matches as a result of searching ngrams. - NgramMatches int - - // Wall clock time for queued search. - Wait time.Duration - - // Number of times regexp was called on files that we evaluated. 
- RegexpsConsidered int -} - -func (s *Stats) Add(o Stats) { - s.ContentBytesLoaded += o.ContentBytesLoaded - s.IndexBytesLoaded += o.IndexBytesLoaded - s.Crashes += o.Crashes - s.FileCount += o.FileCount - s.FilesConsidered += o.FilesConsidered - s.FilesLoaded += o.FilesLoaded - s.FilesSkipped += o.FilesSkipped - s.MatchCount += o.MatchCount - s.NgramMatches += o.NgramMatches - s.ShardFilesConsidered += o.ShardFilesConsidered - s.ShardsSkipped += o.ShardsSkipped -} - -// SearchResult contains search matches and extra data -type SearchResult struct { - Stats - Files []FileMatch - - // RepoURLs holds a repo => template string map. - RepoURLs map[string]string - - // FragmentNames holds a repo => template string map, for - // the line number fragment. - LineFragments map[string]string -} - -// RepositoryBranch describes an indexed branch, which is a name -// combined with a version. -type RepositoryBranch struct { - Name string - Version string -} - -// Repository holds repository metadata. -type Repository struct { - // The repository name - Name string - - // The repository URL. - URL string - - // The physical source where this repo came from, eg. full - // path to the zip filename or git repository directory. This - // will not be exposed in the UI, but can be used to detect - // orphaned index shards. - Source string - - // The branches indexed in this repo. - Branches []RepositoryBranch - - // Nil if this is not the super project. - SubRepoMap map[string]*Repository - - // URL template to link to the commit of a branch - CommitURLTemplate string - - // The repository URL for getting to a file. Has access to - // {{Branch}}, {{Path}} - FileURLTemplate string - - // The URL fragment to add to a file URL for line numbers. has - // access to {{LineNumber}}. The fragment should include the - // separator, generally '#' or ';'. - LineFragmentTemplate string - - // All zoekt.* configuration settings. 
- RawConfig map[string]string - - // Importance of the repository, bigger is more important - Rank uint16 - - // IndexOptions is a hash of the options used to create the index for the - // repo. - IndexOptions string -} - -// IndexMetadata holds metadata stored in the index file. It contains -// data generated by the core indexing library. -type IndexMetadata struct { - IndexFormatVersion int - IndexFeatureVersion int - IndexMinReaderVersion int - IndexTime time.Time - PlainASCII bool - LanguageMap map[string]byte - ZoektVersion string -} - -// Statistics of a (collection of) repositories. -type RepoStats struct { - // Repos is used for aggregrating the number of repositories. - Repos int - - // Shards is the total number of search shards. - Shards int - - // Documents holds the number of documents or files. - Documents int - - // IndexBytes is the amount of RAM used for index overhead. - IndexBytes int64 - - // ContentBytes is the amount of RAM used for raw content. - ContentBytes int64 -} - -func (s *RepoStats) Add(o *RepoStats) { - // can't update Repos, since one repo may have multiple - // shards. - s.Shards += o.Shards - s.IndexBytes += o.IndexBytes - s.Documents += o.Documents - s.ContentBytes += o.ContentBytes -} - -type RepoListEntry struct { - Repository Repository - IndexMetadata IndexMetadata - Stats RepoStats -} - -// RepoList holds a set of Repository metadata. -type RepoList struct { - Repos []*RepoListEntry - Crashes int -} - -type Searcher interface { - Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) - - // List lists repositories. The query `q` can only contain - // query.Repo atoms. - List(ctx context.Context, q query.Q) (*RepoList, error) - Close() - - // Describe the searcher for debug messages. - String() string -} - -type SearchOptions struct { - // Return an upper-bound estimate of eligible documents in - // stats.ShardFilesConsidered. - EstimateDocCount bool - - // Return the whole file. 
- Whole bool - - // Maximum number of matches: skip all processing an index - // shard after we found this many non-overlapping matches. - ShardMaxMatchCount int - - // Maximum number of matches: stop looking for more matches - // once we have this many matches across shards. - TotalMaxMatchCount int - - // Maximum number of important matches: skip processing - // shard after we found this many important matches. - ShardMaxImportantMatch int - - // Maximum number of important matches across shards. - TotalMaxImportantMatch int - - // Abort the search after this much time has passed. - MaxWallTime time.Duration - - // Trim the number of results after collating and sorting the - // results - MaxDocDisplayCount int -} - -func (s *SearchOptions) String() string { - return fmt.Sprintf("%#v", s) -}
diff --git a/bits.go b/bits.go deleted file mode 100644 index 62f6110..0000000 --- a/bits.go +++ /dev/null
@@ -1,268 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "encoding/binary" - "unicode" - "unicode/utf8" -) - -func generateCaseNgrams(g ngram) []ngram { - asRunes := ngramToRunes(g) - - variants := make([]ngram, 0, 8) - cur := asRunes - for { - for i := 0; i < 3; i++ { - next := unicode.SimpleFold(cur[i]) - cur[i] = next - if next != asRunes[i] { - break - } - } - - variants = append(variants, runesToNGram(cur)) - if cur == asRunes { - break - } - } - - return variants -} - -func toLower(in []byte) []byte { - out := make([]byte, 0, len(in)) - var buf [4]byte - for _, c := range string(in) { - i := utf8.EncodeRune(buf[:], unicode.ToLower(c)) - out = append(out, buf[:i]...) - } - return out -} - -// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may -// be larger than 'lower'. Returns whether there was a match, and if -// yes, the byte size of the match. 
-func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) { - matchTotal := 0 - for len(lower) > 0 && len(mixed) > 0 { - lr, lsz := utf8.DecodeRune(lower) - lower = lower[lsz:] - - mr, msz := utf8.DecodeRune(mixed) - mixed = mixed[msz:] - matchTotal += msz - - if lr != unicode.ToLower(mr) { - return 0, false - } - } - - return matchTotal, len(lower) == 0 -} - -type ngram uint64 - -func runesToNGram(b [ngramSize]rune) ngram { - return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2])) -} - -func bytesToNGram(b []byte) ngram { - return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])}) -} - -func stringToNGram(s string) ngram { - return bytesToNGram([]byte(s)) -} - -func ngramToBytes(n ngram) []byte { - rs := ngramToRunes(n) - return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])} -} - -const runeMask = 1<<21 - 1 - -func ngramToRunes(n ngram) [ngramSize]rune { - return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)} -} - -func (n ngram) String() string { - rs := ngramToRunes(n) - return string(rs[:]) -} - -type runeNgramOff struct { - ngram ngram - byteSize uint32 // size of ngram - byteOff uint32 - runeOff uint32 -} - -func splitNGrams(str []byte) []runeNgramOff { - var runeGram [3]rune - var off [3]uint32 - var runeCount int - - result := make([]runeNgramOff, 0, len(str)) - var i uint32 - - chars := -1 - for len(str) > 0 { - chars++ - r, sz := utf8.DecodeRune(str) - str = str[sz:] - runeGram[0] = runeGram[1] - off[0] = off[1] - runeGram[1] = runeGram[2] - off[1] = off[2] - runeGram[2] = r - off[2] = uint32(i) - i += uint32(sz) - runeCount++ - if runeCount < ngramSize { - continue - } - - ng := runesToNGram(runeGram) - result = append(result, runeNgramOff{ - ngram: ng, - byteSize: i - off[0], - byteOff: off[0], - runeOff: uint32(chars), - }) - } - return result -} - -const ( - _classChar = 0 - _classDigit = iota - _classPunct = iota - _classOther = iota - _classSpace = iota -) - -func byteClass(c 
byte) int { - if (c >= 'a' && c <= 'z') || c >= 'A' && c <= 'Z' { - return _classChar - } - if c >= '0' && c <= '9' { - return _classDigit - } - - switch c { - case ' ', '\n': - return _classSpace - case '.', ',', ';', '"', '\'': - return _classPunct - default: - return _classOther - } -} - -func marshalDocSections(secs []DocumentSection) []byte { - ints := make([]uint32, 0, len(secs)*2) - for _, s := range secs { - ints = append(ints, uint32(s.Start), uint32(s.End)) - } - - return toSizedDeltas(ints) -} - -func unmarshalDocSections(in []byte, buf []DocumentSection) (secs []DocumentSection) { - // TODO - ints is unnecessary garbage here. - ints := fromSizedDeltas(in, nil) - if cap(buf) >= len(ints)/2 { - buf = buf[:0] - } else { - buf = make([]DocumentSection, 0, len(ints)/2) - } - - for len(ints) > 0 { - buf = append(buf, DocumentSection{ints[0], ints[1]}) - ints = ints[2:] - } - return buf -} - -type ngramSlice []ngram - -func (p ngramSlice) Len() int { return len(p) } - -func (p ngramSlice) Less(i, j int) bool { - return p[i] < p[j] -} - -func (p ngramSlice) Swap(i, j int) { - p[i], p[j] = p[j], p[i] -} - -func toSizedDeltas(offsets []uint32) []byte { - var enc [8]byte - - deltas := make([]byte, 0, len(offsets)*2) - - m := binary.PutUvarint(enc[:], uint64(len(offsets))) - deltas = append(deltas, enc[:m]...) - - var last uint32 - for _, p := range offsets { - delta := p - last - last = p - - m := binary.PutUvarint(enc[:], uint64(delta)) - deltas = append(deltas, enc[:m]...) 
- } - return deltas -} - -func fromSizedDeltas(data []byte, ps []uint32) []uint32 { - sz, m := binary.Uvarint(data) - data = data[m:] - - if cap(ps) < int(sz) { - ps = make([]uint32, 0, sz) - } else { - ps = ps[:0] - } - - var last uint32 - for len(data) > 0 { - delta, m := binary.Uvarint(data) - offset := last + uint32(delta) - last = offset - data = data[m:] - ps = append(ps, offset) - } - return ps -} - -func fromDeltas(data []byte, buf []uint32) []uint32 { - buf = buf[:0] - if cap(buf) < len(data)/2 { - buf = make([]uint32, 0, len(data)/2) - } - - var last uint32 - for len(data) > 0 { - delta, m := binary.Uvarint(data) - offset := last + uint32(delta) - last = offset - data = data[m:] - buf = append(buf, offset) - } - return buf -}
diff --git a/bits_test.go b/bits_test.go deleted file mode 100644 index 41710a9..0000000 --- a/bits_test.go +++ /dev/null
@@ -1,198 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "encoding/binary" - "log" - "math/rand" - "reflect" - "sort" - "testing" - "testing/quick" - - "github.com/google/go-cmp/cmp" -) - -var _ = log.Println - -func TestNgram(t *testing.T) { - in := "abc" - n := stringToNGram(in) - if n.String() != "abc" { - t.Errorf("got %q, want %q", n, "abc") - } - - f := func(b ngramRunes) bool { - n := runesToNGram(b) - got := ngramRunes(ngramToRunes(n)) - if !reflect.DeepEqual(b, got) { - t.Log(cmp.Diff(b, got)) - return false - } - return true - } - if err := quick.Check(f, nil); err != nil { - t.Error(err) - } -} - -type ngramRunes [ngramSize]rune - -func (ngramRunes) Generate(rand *rand.Rand, size int) reflect.Value { - // Same implementation used by testing/quick to generate strings. But we - // force it to ngramSize runes. 
- var b ngramRunes - for i := range b { - b[i] = rune(rand.Intn(0x10ffff)) - } - return reflect.ValueOf(b) -} - -func TestDocSection(t *testing.T) { - in := []DocumentSection{{1, 2}, {3, 4}} - serialized := marshalDocSections(in) - roundtrip := unmarshalDocSections(serialized, nil) - if !reflect.DeepEqual(in, roundtrip) { - t.Errorf("got %v, want %v", roundtrip, in) - } -} - -func TestGenerateCaseNgrams(t *testing.T) { - ng := stringToNGram("aB1") - gotNG := generateCaseNgrams(ng) - - got := map[string]bool{} - for _, n := range gotNG { - got[string(ngramToBytes(n))] = true - } - - want := map[string]bool{ - "aB1": true, - "AB1": true, - "ab1": true, - "Ab1": true, - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v, want %v", got, want) - } -} - -func TestNextFileIndex(t *testing.T) { - for _, tc := range []struct { - off, curFile uint32 - ends []uint32 - want uint32 - }{ - {maxUInt32, 0, []uint32{34}, 1}, - {9, 0, []uint32{34}, 0}, - {450, 0, []uint32{100, 200, 300, 400, 500, 600}, 4}, - } { - got := nextFileIndex(tc.off, tc.curFile, tc.ends) - if got != tc.want { - t.Errorf("%v: got %d, want %d", tc, got, tc.want) - } - } -} - -func TestSizedDeltas(t *testing.T) { - encode := func(nums []uint32) []byte { - return toSizedDeltas(nums) - } - decode := func(data []byte) []uint32 { - if len(data) == 0 { - return nil - } - return fromSizedDeltas(data, nil) - } - testIncreasingIntCoder(t, encode, decode) -} - -func TestFromDeltas(t *testing.T) { - decode := func(data []byte) []uint32 { - if len(data) == 0 { - return nil - } - return fromDeltas(data, nil) - } - testIncreasingIntCoder(t, toDeltas, decode) -} - -func TestCompressedPostingIterator(t *testing.T) { - decode := func(data []byte) []uint32 { - if len(data) == 0 { - return nil - } - - var nums []uint32 - i := newCompressedPostingIterator(data, stringToNGram("abc")) - for i.first() != maxUInt32 { - nums = append(nums, i.first()) - i.next(i.first()) - } - return nums - } - testIncreasingIntCoder(t, 
toDeltas, decode) -} - -func toDeltas(offsets []uint32) []byte { - var enc [8]byte - - deltas := make([]byte, 0, len(offsets)*2) - - var last uint32 - for _, p := range offsets { - delta := p - last - last = p - - m := binary.PutUvarint(enc[:], uint64(delta)) - deltas = append(deltas, enc[:m]...) - } - return deltas -} - -func testIncreasingIntCoder(t *testing.T, encode func([]uint32) []byte, decode func([]byte) []uint32) { - f := func(nums []uint32) bool { - nums = sortedUnique(nums) - b := encode(nums) - got := decode(b) - if len(nums) == len(got) && len(nums) == 0 { - return true - } - if !reflect.DeepEqual(got, nums) { - t.Log(cmp.Diff(nums, got)) - return false - } - return true - } - if err := quick.Check(f, nil); err != nil { - t.Error(err) - } -} - -func sortedUnique(nums []uint32) []uint32 { - if len(nums) == 0 { - return nums - } - sort.Slice(nums, func(i, j int) bool { return nums[i] < nums[j] }) - filtered := nums[:1] - for _, n := range nums[1:] { - if filtered[len(filtered)-1] != n { - filtered = append(filtered, n) - } - } - return filtered -}
diff --git a/build-deploy.sh b/build-deploy.sh deleted file mode 100644 index 7fb442f..0000000 --- a/build-deploy.sh +++ /dev/null
@@ -1,51 +0,0 @@ -#!/bin/bash - -# this script packages up all the binaries, and a script (deploy.sh) -# to twiddle with the server and the binaries - -set -ex - -# Put the date first so we can sort. -if [[ -z "$VERSION" ]]; then - VERSION=$(date --iso-8601=minutes | tr -d ':' | sed 's|\+.*$||') - if [[ -d .git ]]; then - VERSION=${VERSION}-$(git show --pretty=format:%h -q) - fi -fi - -set -u - -out=zoekt-${VERSION} -mkdir -p ${out} - -for d in $(find cmd -maxdepth 1 -type d) -do - go build -tags netgo -ldflags "-X github.com/google/zoekt.Version=$VERSION" -o ${out}/$(basename $d) github.com/google/zoekt/$d -done - -cat <<EOF > ${out}/deploy.sh -#!/bin/bash - -echo "Set the following in the environment." -echo "" -echo ' export PATH="'$PWD'/bin:$PATH' -echo "" - -set -eux - -# Allow sandbox to create NS's -sudo sh -c 'echo 1 > /proc/sys/kernel/unprivileged_userns_clone' - -# we mmap the entire index, but typically only want the file contents. -sudo sh -c 'echo 1 >/proc/sys/vm/overcommit_memory' - -# allow bind to 80 and 443 -sudo setcap 'cap_net_bind_service=+ep' bin/zoekt-webserver - -EOF - -chmod 755 ${out}/* - -tar --owner=root --group=root -czf zoekt-deploy-${VERSION}.tar.gz ${out}/* - -rm -rf ${out}
diff --git a/build/builder.go b/build/builder.go deleted file mode 100644 index 5461828..0000000 --- a/build/builder.go +++ /dev/null
@@ -1,567 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// package build implements a more convenient interface for building -// zoekt indices. -package build - -import ( - "crypto/sha1" - "flag" - "fmt" - "io" - "io/ioutil" - "log" - "net/url" - "os" - "os/exec" - "path/filepath" - "reflect" - "regexp" - "runtime" - "runtime/pprof" - "sort" - "strings" - "sync" - - "github.com/bmatcuk/doublestar" - "github.com/google/zoekt" - "github.com/google/zoekt/ctags" -) - -var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") - -// Branch describes a single branch version. -type Branch struct { - Name string - Version string -} - -// Options sets options for the index building. -type Options struct { - // IndexDir is a directory that holds *.zoekt index files. - IndexDir string - - // SizeMax is the maximum file size - SizeMax int - - // Parallelism is the maximum number of shards to index in parallel - Parallelism int - - // ShardMax sets the maximum corpus size for a single shard - ShardMax int - - // TrigramMax sets the maximum number of distinct trigrams per document. - TrigramMax int - - // RepositoryDescription holds names and URLs for the repository. - RepositoryDescription zoekt.Repository - - // SubRepositories is a path => sub repository map. - SubRepositories map[string]*zoekt.Repository - - // Path to exuberant ctags binary to run - CTags string - - // If set, ctags must succeed. 
- CTagsMustSucceed bool - - // Write memory profiles to this file. - MemProfile string - - // LargeFiles is a slice of glob patterns, including ** for any number - // of directories, where matching file paths should be indexed - // regardless of their size. The full pattern syntax is here: - // https://github.com/bmatcuk/doublestar/tree/v1#patterns. - LargeFiles []string -} - -// HashOptions creates a hash of the options that affect an index. -func (o *Options) HashOptions() string { - hasher := sha1.New() - - hasher.Write([]byte(o.CTags)) - hasher.Write([]byte(fmt.Sprintf("%t", o.CTagsMustSucceed))) - hasher.Write([]byte(fmt.Sprintf("%d", o.SizeMax))) - hasher.Write([]byte(fmt.Sprintf("%q", o.LargeFiles))) - - return fmt.Sprintf("%x", hasher.Sum(nil)) -} - -type largeFilesFlag struct{ *Options } - -func (f largeFilesFlag) String() string { - // From flag.Value documentation: - // - // The flag package may call the String method with a zero-valued receiver, - // such as a nil pointer. - if f.Options == nil { - return "" - } - s := append([]string{""}, f.LargeFiles...) - return strings.Join(s, "-large_file ") -} - -func (f largeFilesFlag) Set(value string) error { - f.LargeFiles = append(f.LargeFiles, value) - return nil -} - -// Flags adds flags for build options to fs. 
-func (o *Options) Flags(fs *flag.FlagSet) { - x := *o - x.SetDefaults() - fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") - fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") - fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") - fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") - fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") - fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") - fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") -} - -// Builder manages (parallel) creation of uniformly sized shards. The -// builder buffers up documents until it collects enough documents and -// then builds a shard and writes. -type Builder struct { - opts Options - throttle chan int - - nextShardNum int - todo []*zoekt.Document - size int - - parser ctags.Parser - - building sync.WaitGroup - - errMu sync.Mutex - buildError error - - // temp name => final name for finished shards. We only rename - // them once all shards succeed to avoid Frankstein corpuses. - finishedShards map[string]string -} - -type finishedShard struct { - temp, final string -} - -// SetDefaults sets reasonable default options. 
-func (o *Options) SetDefaults() { - if o.CTags == "" { - ctags, err := exec.LookPath("universal-ctags") - if err == nil { - o.CTags = ctags - } - } - - if o.CTags == "" { - ctags, err := exec.LookPath("ctags-exuberant") - if err == nil { - o.CTags = ctags - } - } - if o.Parallelism == 0 { - o.Parallelism = 4 - } - if o.SizeMax == 0 { - o.SizeMax = 2 << 20 - } - if o.ShardMax == 0 { - o.ShardMax = 100 << 20 - } - if o.TrigramMax == 0 { - o.TrigramMax = 20000 - } - - if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { - parsed, _ := url.Parse(o.RepositoryDescription.URL) - if parsed != nil { - o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) - } - } -} - -func hashString(s string) string { - h := sha1.New() - io.WriteString(h, s) - return fmt.Sprintf("%x", h.Sum(nil)) -} - -// ShardName returns the name the given index shard. -func (o *Options) shardName(n int) string { - abs := url.QueryEscape(o.RepositoryDescription.Name) - if len(abs) > 200 { - abs = abs[:200] + hashString(abs)[:8] - } - return filepath.Join(o.IndexDir, - fmt.Sprintf("%s_v%d.%05d.zoekt", abs, zoekt.IndexFormatVersion, n)) -} - -// IncrementalSkipIndexing returns true if the index present on disk matches -// the build options. -func (o *Options) IncrementalSkipIndexing() bool { - fn := o.shardName(0) - - f, err := os.Open(fn) - if err != nil { - return false - } - - iFile, err := zoekt.NewIndexFile(f) - if err != nil { - return false - } - defer iFile.Close() - - repo, index, err := zoekt.ReadMetadata(iFile) - if err != nil { - return false - } - - if index.IndexFeatureVersion != zoekt.FeatureVersion { - return false - } - - if repo.IndexOptions != o.HashOptions() { - return false - } - - return reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) -} - -// IgnoreSizeMax determines whether the max size should be ignored. 
-func (o *Options) IgnoreSizeMax(name string) bool { - for _, pattern := range o.LargeFiles { - pattern = strings.TrimSpace(pattern) - m, _ := doublestar.PathMatch(pattern, name) - if m { - return true - } - } - - return false -} - -// NewBuilder creates a new Builder instance. -func NewBuilder(opts Options) (*Builder, error) { - opts.SetDefaults() - if opts.RepositoryDescription.Name == "" { - return nil, fmt.Errorf("builder: must set Name") - } - - b := &Builder{ - opts: opts, - throttle: make(chan int, opts.Parallelism), - finishedShards: map[string]string{}, - } - - if b.opts.CTags == "" && b.opts.CTagsMustSucceed { - return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set") - } - - if strings.Contains(opts.CTags, "universal-ctags") { - parser, err := ctags.NewParser(opts.CTags) - if err != nil && opts.CTagsMustSucceed { - return nil, fmt.Errorf("ctags.NewParser: %v", err) - } - - b.parser = parser - } - if _, err := b.newShardBuilder(); err != nil { - return nil, err - } - - return b, nil -} - -// AddFile is a convenience wrapper for the Add method -func (b *Builder) AddFile(name string, content []byte) error { - return b.Add(zoekt.Document{Name: name, Content: content}) -} - -func (b *Builder) Add(doc zoekt.Document) error { - // We could pass the document on to the shardbuilder, but if - // we pass through a part of the source tree with binary/large - // files, the corresponding shard would be mostly empty, so - // insert a reason here too. 
- if len(doc.Content) > b.opts.SizeMax && !b.opts.IgnoreSizeMax(doc.Name) { - doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) - } else if err := zoekt.CheckText(doc.Content, b.opts.TrigramMax); err != nil { - doc.SkipReason = err.Error() - doc.Language = "binary" - } - - b.todo = append(b.todo, &doc) - b.size += len(doc.Name) + len(doc.Content) - if b.size > b.opts.ShardMax { - return b.flush() - } - - return nil -} - -// Finish creates a last shard from the buffered documents, and clears -// stale shards from previous runs. This should always be called, also -// in failure cases, to ensure cleanup. -func (b *Builder) Finish() error { - b.flush() - b.building.Wait() - - if b.buildError != nil { - for tmp := range b.finishedShards { - os.Remove(tmp) - } - b.finishedShards = map[string]string{} - return b.buildError - } - - for tmp, final := range b.finishedShards { - if err := os.Rename(tmp, final); err != nil { - b.buildError = err - } - } - b.finishedShards = map[string]string{} - - if b.nextShardNum > 0 { - b.deleteRemainingShards() - } - return b.buildError -} - -func (b *Builder) deleteRemainingShards() { - for { - shard := b.nextShardNum - b.nextShardNum++ - name := b.opts.shardName(shard) - if err := os.Remove(name); os.IsNotExist(err) { - break - } - } -} - -func (b *Builder) flush() error { - todo := b.todo - b.todo = nil - b.size = 0 - b.errMu.Lock() - defer b.errMu.Unlock() - if b.buildError != nil { - return b.buildError - } - - hasShard := b.nextShardNum > 0 - if len(todo) == 0 && hasShard { - return nil - } - - shard := b.nextShardNum - b.nextShardNum++ - - if b.opts.Parallelism > 1 { - b.building.Add(1) - go func() { - b.throttle <- 1 - done, err := b.buildShard(todo, shard) - <-b.throttle - - b.errMu.Lock() - defer b.errMu.Unlock() - if err != nil && b.buildError == nil { - b.buildError = err - } - if err == nil { - b.finishedShards[done.temp] = done.final - } - b.building.Done() - }() - } else 
{ - // No goroutines when we're not parallel. This - // simplifies memory profiling. - done, err := b.buildShard(todo, shard) - b.buildError = err - if err == nil { - b.finishedShards[done.temp] = done.final - } - if b.opts.MemProfile != "" { - // drop memory, and profile. - todo = nil - b.writeMemProfile(b.opts.MemProfile) - } - - return b.buildError - } - - return nil -} - -var profileNumber int - -func (b *Builder) writeMemProfile(name string) { - nm := fmt.Sprintf("%s.%d", name, profileNumber) - profileNumber++ - f, err := os.Create(nm) - if err != nil { - log.Fatal("could not create memory profile: ", err) - } - runtime.GC() // get up-to-date statistics - if err := pprof.WriteHeapProfile(f); err != nil { - log.Fatal("could not write memory profile: ", err) - } - f.Close() - log.Printf("wrote mem profile %q", nm) -} - -// map [0,inf) to [0,1) monotonically -func squashRange(j int) float64 { - x := float64(j) - return x / (1 + x) -} - -var testRe = regexp.MustCompile("test") - -type rankedDoc struct { - *zoekt.Document - rank []float64 -} - -func rank(d *zoekt.Document, origIdx int) []float64 { - test := 0.0 - if testRe.MatchString(d.Name) { - test = 1.0 - } - - // Smaller is earlier (=better). - return []float64{ - // Prefer docs that are not tests - test, - - // With many symbols - 1.0 - squashRange(len(d.Symbols)), - - // With short content - squashRange(len(d.Content)), - - // With short names - squashRange(len(d.Name)), - - // That is present is as many branches as possible - 1.0 - squashRange(len(d.Branches)), - - // Preserve original ordering. 
- squashRange(origIdx), - } -} - -func sortDocuments(todo []*zoekt.Document) { - rs := make([]rankedDoc, 0, len(todo)) - for i, t := range todo { - rd := rankedDoc{t, rank(t, i)} - rs = append(rs, rd) - } - sort.Slice(rs, func(i, j int) bool { - r1 := rs[i].rank - r2 := rs[j].rank - for i := range r1 { - if r1[i] < r2[i] { - return true - } - if r1[i] > r2[i] { - return false - } - } - - return false - }) - for i := range todo { - todo[i] = rs[i].Document - } -} - -func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { - if b.opts.CTags != "" { - err := ctagsAddSymbols(todo, b.parser, b.opts.CTags) - if b.opts.CTagsMustSucceed && err != nil { - return nil, err - } - if err != nil { - log.Printf("ignoring %s error: %v", b.opts.CTags, err) - } - } - - name := b.opts.shardName(nextShardNum) - - shardBuilder, err := b.newShardBuilder() - if err != nil { - return nil, err - } - sortDocuments(todo) - for _, t := range todo { - if err := shardBuilder.Add(*t); err != nil { - return nil, err - } - } - - return b.writeShard(name, shardBuilder) -} - -func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { - desc := b.opts.RepositoryDescription - desc.SubRepoMap = b.opts.SubRepositories - desc.IndexOptions = b.opts.HashOptions() - - shardBuilder, err := zoekt.NewIndexBuilder(&desc) - if err != nil { - return nil, err - } - return shardBuilder, nil -} - -func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { - dir := filepath.Dir(fn) - if err := os.MkdirAll(dir, 0o700); err != nil { - return nil, err - } - - f, err := ioutil.TempFile(dir, filepath.Base(fn)+".*.tmp") - if err != nil { - return nil, err - } - if runtime.GOOS != "windows" { - if err := f.Chmod(0o666 &^ umask); err != nil { - return nil, err - } - } - - defer f.Close() - if err := ib.Write(f); err != nil { - return nil, err - } - fi, err := f.Stat() - if err != nil { - return nil, err - } - if err := f.Close(); err != nil { - 
return nil, err - } - - log.Printf("finished %s: %d index bytes (overhead %3.1f)", fn, fi.Size(), - float64(fi.Size())/float64(ib.ContentSize()+1)) - - return &finishedShard{f.Name(), fn}, nil -} - -// umask holds the Umask of the current process -var umask os.FileMode
diff --git a/build/builder_test.go b/build/builder_test.go deleted file mode 100644 index 133b39a..0000000 --- a/build/builder_test.go +++ /dev/null
@@ -1,51 +0,0 @@ -package build - -import ( - "flag" - "testing" - - "github.com/google/go-cmp/cmp" -) - -func TestFlags(t *testing.T) { - cases := []struct { - args []string - want Options - }{{ - // Defaults - args: []string{}, - want: Options{}, - }, { - args: []string{"-index", "/tmp"}, - want: Options{ - IndexDir: "/tmp", - }, - }, { - // single large file pattern - args: []string{"-large_file", "*.md"}, - want: Options{ - LargeFiles: []string{"*.md"}, - }, - }, { - // multiple large file pattern - args: []string{"-large_file", "*.md", "-large_file", "*.yaml"}, - want: Options{ - LargeFiles: []string{"*.md", "*.yaml"}, - }, - }} - - for _, c := range cases { - c.want.SetDefaults() - // depends on $PATH setting. - c.want.CTags = "" - - got := Options{} - fs := flag.NewFlagSet("", flag.ContinueOnError) - got.Flags(fs) - if err := fs.Parse(c.args); err != nil { - t.Errorf("failed to parse args %v: %v", c.args, err) - } else if !cmp.Equal(got, c.want) { - t.Errorf("mismatch for %v (-want +got):\n%s", c.args, cmp.Diff(c.want, got)) - } - } -}
diff --git a/build/builder_unix.go b/build/builder_unix.go deleted file mode 100644 index fde31d5..0000000 --- a/build/builder_unix.go +++ /dev/null
@@ -1,26 +0,0 @@ -// Copyright 2018 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// +build !windows - -package build - -import ( - "os" - "syscall" -) - -func init() { - umask = os.FileMode(syscall.Umask(0)) - syscall.Umask(int(umask)) -}
diff --git a/build/ctags.go b/build/ctags.go deleted file mode 100644 index 9c2ac5b..0000000 --- a/build/ctags.go +++ /dev/null
@@ -1,278 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build - -import ( - "bytes" - "fmt" - "io/ioutil" - "os" - "os/exec" - "path/filepath" - "strings" - "time" - - "github.com/google/zoekt" - "github.com/google/zoekt/ctags" -) - -func runCTags(bin string, inputs map[string][]byte) ([]*ctags.Entry, error) { - const debug = false - if len(inputs) == 0 { - return nil, nil - } - dir, err := ioutil.TempDir("", "ctags-input") - if err != nil { - return nil, err - } - if !debug { - defer os.RemoveAll(dir) - } - - // --sort shells out to sort(1). - args := []string{bin, "-n", "-f", "-", "--sort=no"} - - fileCount := 0 - for n, c := range inputs { - if len(c) == 0 { - continue - } - - full := filepath.Join(dir, n) - if err := os.MkdirAll(filepath.Dir(full), 0o700); err != nil { - return nil, err - } - err := ioutil.WriteFile(full, c, 0o600) - if err != nil { - return nil, err - } - args = append(args, n) - fileCount++ - } - if fileCount == 0 { - return nil, nil - } - - cmd := exec.Command(args[0], args[1:]...) 
- cmd.Dir = dir - - var errBuf, outBuf bytes.Buffer - cmd.Stderr = &errBuf - cmd.Stdout = &outBuf - - if err := cmd.Start(); err != nil { - return nil, err - } - - errChan := make(chan error, 1) - go func() { - err := cmd.Wait() - errChan <- err - }() - timeout := time.After(5 * time.Second) - select { - case <-timeout: - cmd.Process.Kill() - return nil, fmt.Errorf("timeout executing ctags") - case err := <-errChan: - if err != nil { - return nil, fmt.Errorf("exec(%s): %v, stderr: %s", cmd.Args, err, errBuf.String()) - } - } - - var entries []*ctags.Entry - for _, l := range bytes.Split(outBuf.Bytes(), []byte{'\n'}) { - if len(l) == 0 { - continue - } - e, err := ctags.Parse(string(l)) - if err != nil { - return nil, err - } - - if len(e.Sym) == 1 { - continue - } - entries = append(entries, e) - } - return entries, nil -} - -func runCTagsChunked(bin string, in map[string][]byte) ([]*ctags.Entry, error) { - var res []*ctags.Entry - - cur := map[string][]byte{} - sz := 0 - for k, v := range in { - cur[k] = v - sz += len(k) - - // 100k seems reasonable. - if sz > (100 << 10) { - r, err := runCTags(bin, cur) - if err != nil { - return nil, err - } - res = append(res, r...) - - cur = map[string][]byte{} - sz = 0 - } - } - r, err := runCTags(bin, cur) - if err != nil { - return nil, err - } - res = append(res, r...) 
- return res, nil -} - -func ctagsAddSymbolsParser(todo []*zoekt.Document, parser ctags.Parser) error { - for _, doc := range todo { - if doc.Symbols != nil { - continue - } - - es, err := parser.Parse(doc.Name, doc.Content) - if err != nil { - return err - } - if len(es) == 0 { - continue - } - doc.Language = strings.ToLower(es[0].Language) - - symOffsets, err := tagsToSections(doc.Content, es) - if err != nil { - return fmt.Errorf("%s: %v", doc.Name, err) - } - doc.Symbols = symOffsets - } - - return nil -} - -func ctagsAddSymbols(todo []*zoekt.Document, parser ctags.Parser, bin string) error { - if parser != nil { - return ctagsAddSymbolsParser(todo, parser) - } - - pathIndices := map[string]int{} - contents := map[string][]byte{} - for i, t := range todo { - if t.Symbols != nil { - continue - } - - _, ok := pathIndices[t.Name] - if ok { - continue - } - - pathIndices[t.Name] = i - contents[t.Name] = t.Content - } - - var err error - var entries []*ctags.Entry - entries, err = runCTagsChunked(bin, contents) - if err != nil { - return err - } - - fileTags := map[string][]*ctags.Entry{} - for _, e := range entries { - fileTags[e.Path] = append(fileTags[e.Path], e) - } - - for k, tags := range fileTags { - symOffsets, err := tagsToSections(contents[k], tags) - if err != nil { - return fmt.Errorf("%s: %v", k, err) - } - todo[pathIndices[k]].Symbols = symOffsets - if len(tags) > 0 { - todo[pathIndices[k]].Language = strings.ToLower(tags[0].Language) - } - } - return nil -} - -func tagsToSections(content []byte, tags []*ctags.Entry) ([]zoekt.DocumentSection, error) { - nls := newLinesIndices(content) - nls = append(nls, uint32(len(content))) - var symOffsets []zoekt.DocumentSection - var lastEnd uint32 - var lastLine int - var lastIntraEnd int - for _, t := range tags { - if t.Line <= 0 { - // Observed this with a .JS file. 
- continue - } - lineIdx := t.Line - 1 - if lineIdx >= len(nls) { - return nil, fmt.Errorf("linenum for entry out of range %v", t) - } - - lineOff := uint32(0) - if lineIdx > 0 { - lineOff = nls[lineIdx-1] + 1 - } - - end := nls[lineIdx] - line := content[lineOff:end] - if lastLine == lineIdx { - line = line[lastIntraEnd:] - } else { - lastIntraEnd = 0 - } - - intraOff := lastIntraEnd + bytes.Index(line, []byte(t.Sym)) - if intraOff < 0 { - // for Go code, this is very common, since - // ctags barfs on multi-line declarations - continue - } - start := lineOff + uint32(intraOff) - if start < lastEnd { - // This can happen if we have multiple tags on the same line. - // Give up. - continue - } - - endSym := start + uint32(len(t.Sym)) - - symOffsets = append(symOffsets, zoekt.DocumentSection{ - Start: start, - End: endSym, - }) - lastEnd = endSym - lastLine = lineIdx - lastIntraEnd = intraOff + len(t.Sym) - } - - return symOffsets, nil -} - -func newLinesIndices(in []byte) []uint32 { - out := make([]uint32, 0, len(in)/30) - for i, c := range in { - if c == '\n' { - out = append(out, uint32(i)) - } - } - return out -}
diff --git a/build/ctags_test.go b/build/ctags_test.go deleted file mode 100644 index c853b2e..0000000 --- a/build/ctags_test.go +++ /dev/null
@@ -1,94 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build - -import ( - "reflect" - "testing" - - "github.com/google/zoekt" - "github.com/google/zoekt/ctags" -) - -func TestTagsToSections(t *testing.T) { - c := []byte("package foo\nfunc bar(j int) {}\n//bla") - // ----------01234567890 1234567890123456789 012345 - - tags := []*ctags.Entry{ - { - Sym: "bar", - Line: 2, - }, - } - - secs, err := tagsToSections(c, tags) - if err != nil { - t.Fatal("tagsToSections", err) - } - - if len(secs) != 1 || secs[0].Start != 17 || secs[0].End != 20 { - t.Fatalf("got %#v, want 1 section (17,20)", secs) - } -} - -func TestTagsToSectionsMultiple(t *testing.T) { - c := []byte("class Foob { int x; int b; }") - // ----------012345678901234567890123456789 - - tags := []*ctags.Entry{ - { - Sym: "x", - Line: 1, - }, - { - Sym: "b", - Line: 1, - }, - } - - got, err := tagsToSections(c, tags) - if err != nil { - t.Fatal("tagsToSections", err) - } - - want := []zoekt.DocumentSection{ - {Start: 17, End: 18}, - {Start: 24, End: 25}, - } - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v, want %v", got, want) - } -} - -func TestTagsToSectionsEOF(t *testing.T) { - c := []byte("package foo\nfunc bar(j int) {}") - // ----------01234567890 1234567890123456789 012345 - - tags := []*ctags.Entry{ - { - Sym: "bar", - Line: 2, - }, - } - - secs, err := tagsToSections(c, tags) - if err != nil { - 
t.Fatal("tagsToSections", err) - } - - if len(secs) != 1 || secs[0].Start != 17 || secs[0].End != 20 { - t.Fatalf("got %#v, want 1 section (17,20)", secs) - } -}
diff --git a/build/e2e_test.go b/build/e2e_test.go deleted file mode 100644 index d72cda4..0000000 --- a/build/e2e_test.go +++ /dev/null
@@ -1,491 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build - -import ( - "fmt" - "io/ioutil" - "log" - "os" - "path/filepath" - "reflect" - "strings" - "testing" - "time" - - "golang.org/x/net/context" - - "github.com/google/zoekt" - "github.com/google/zoekt/query" - "github.com/google/zoekt/shards" -) - -func TestBasic(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - - opts := Options{ - IndexDir: dir, - ShardMax: 1024, - RepositoryDescription: zoekt.Repository{ - Name: "repo", - }, - Parallelism: 2, - SizeMax: 1 << 20, - } - - b, err := NewBuilder(opts) - if err != nil { - t.Fatalf("NewBuilder: %v", err) - } - - for i := 0; i < 4; i++ { - s := fmt.Sprintf("%d", i) - b.AddFile("F"+s, []byte(strings.Repeat(s, 1000))) - } - - if err := b.Finish(); err != nil { - t.Errorf("Finish: %v", err) - } - - fs, _ := filepath.Glob(dir + "/*") - if len(fs) <= 1 { - t.Fatalf("want multiple shards, got %v", fs) - } - - ss, err := shards.NewDirectorySearcher(dir) - if err != nil { - t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) - } - - q, err := query.Parse("111") - if err != nil { - t.Fatalf("Parse(111): %v", err) - } - - var sOpts zoekt.SearchOptions - ctx := context.Background() - result, err := ss.Search(ctx, q, &sOpts) - if err != nil { - t.Fatalf("Search(%v): %v", q, err) - } - - if len(result.Files) != 1 || 
result.Files[0].FileName != "F1" { - t.Errorf("got %v, want 1 file.", result.Files) - } - defer ss.Close() -} - -func TestLargeFileOption(t *testing.T) { - dir, err := ioutil.TempDir("", "large_files_test") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - sizeMax := 1000 - opts := Options{ - IndexDir: dir, - LargeFiles: []string{"F0", "F2"}, - RepositoryDescription: zoekt.Repository{ - Name: "repo", - }, - SizeMax: sizeMax, - } - - b, err := NewBuilder(opts) - if err != nil { - t.Fatalf("NewBuilder: %v", err) - } - - for i := 0; i < 4; i++ { - s := fmt.Sprintf("%d", i) - b.AddFile("F"+s, []byte(strings.Repeat("a", sizeMax+1))) - } - - if err := b.Finish(); err != nil { - t.Errorf("Finish: %v", err) - } - - ss, err := shards.NewDirectorySearcher(dir) - if err != nil { - t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) - } - - q, err := query.Parse("aaa") - if err != nil { - t.Fatalf("Parse(aaa): %v", err) - } - - var sOpts zoekt.SearchOptions - ctx := context.Background() - result, err := ss.Search(ctx, q, &sOpts) - if err != nil { - t.Fatalf("Search(%v): %v", q, err) - } - - if len(result.Files) != 2 { - t.Errorf("got %v files, want 2 files.", len(result.Files)) - } - defer ss.Close() -} - -func TestUpdate(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - opts := Options{ - IndexDir: dir, - ShardMax: 1024, - RepositoryDescription: zoekt.Repository{ - Name: "repo", - FileURLTemplate: "url", - }, - Parallelism: 2, - SizeMax: 1 << 20, - } - - if b, err := NewBuilder(opts); err != nil { - t.Fatalf("NewBuilder: %v", err) - } else { - b.AddFile("F", []byte("hoi")) - if err := b.Finish(); err != nil { - t.Errorf("Finish: %v", err) - } - } - ss, err := shards.NewDirectorySearcher(dir) - if err != nil { - t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) - } - - ctx := context.Background() - repos, err := ss.List(ctx, &query.Repo{Pattern: "repo"}) - if 
err != nil { - t.Fatalf("List: %v", err) - } - - if len(repos.Repos) != 1 { - t.Errorf("List(repo): got %v, want 1 repo", repos.Repos) - } - - fs, err := filepath.Glob(filepath.Join(dir, "*")) - if err != nil { - t.Fatalf("glob: %v", err) - } - - opts.RepositoryDescription = zoekt.Repository{ - Name: "repo2", - FileURLTemplate: "url2", - } - - if b, err := NewBuilder(opts); err != nil { - t.Fatalf("NewBuilder: %v", err) - } else { - b.AddFile("F", []byte("hoi")) - if err := b.Finish(); err != nil { - t.Errorf("Finish: %v", err) - } - } - - // This is ugly, and potentially flaky, but there is no - // observable synchronization for the Sharded searcher, so - // this is the best we can do. - time.Sleep(100 * time.Millisecond) - - ctx = context.Background() - if repos, err = ss.List(ctx, &query.Repo{Pattern: "repo"}); err != nil { - t.Fatalf("List: %v", err) - } else if len(repos.Repos) != 2 { - t.Errorf("List(repo): got %v, want 2 repos", repos.Repos) - } - - for _, fn := range fs { - log.Printf("removing %s", fn) - if err := os.Remove(fn); err != nil { - t.Fatalf("Remove(%s): %v", fn, err) - } - } - - time.Sleep(100 * time.Millisecond) - - ctx = context.Background() - if repos, err = ss.List(ctx, &query.Repo{Pattern: "repo"}); err != nil { - t.Fatalf("List: %v", err) - } else if len(repos.Repos) != 1 { - var ss []string - for _, r := range repos.Repos { - ss = append(ss, r.Repository.Name) - } - t.Errorf("List(repo): got %v, want 1 repo", ss) - } -} - -func TestDeleteOldShards(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - opts := Options{ - IndexDir: dir, - ShardMax: 1024, - RepositoryDescription: zoekt.Repository{ - Name: "repo", - FileURLTemplate: "url", - }, - SizeMax: 1 << 20, - } - opts.SetDefaults() - - b, err := NewBuilder(opts) - if err != nil { - t.Fatalf("NewBuilder: %v", err) - } - for i := 0; i < 4; i++ { - s := fmt.Sprintf("%d\n", i) - b.AddFile("F"+s, 
[]byte(strings.Repeat(s, 1024/2))) - } - if err := b.Finish(); err != nil { - t.Errorf("Finish: %v", err) - } - - glob := filepath.Join(dir, "*") - fs, err := filepath.Glob(glob) - if err != nil { - t.Fatalf("Glob(%s): %v", glob, err) - } else if len(fs) != 4 { - t.Fatalf("Glob(%s): got %v, want 4 shards", glob, fs) - } - - if fi, err := os.Lstat(fs[0]); err != nil { - t.Fatalf("Lstat: %v", err) - } else if fi.Mode()&0o666 == 0o600 { - // This fails spuriously if your umask is very restrictive. - t.Errorf("got mode %o, should respect umask.", fi.Mode()) - } - - // Do again, without sharding. - opts.ShardMax = 1 << 20 - b, err = NewBuilder(opts) - if err != nil { - t.Fatalf("NewBuilder: %v", err) - } - for i := 0; i < 4; i++ { - s := fmt.Sprintf("%d\n", i) - b.AddFile("F"+s, []byte(strings.Repeat(s, 1024/2))) - } - if err := b.Finish(); err != nil { - t.Errorf("Finish: %v", err) - } - - fs, err = filepath.Glob(glob) - if err != nil { - t.Fatalf("Glob(%s): %v", glob, err) - } else if len(fs) != 1 { - t.Fatalf("Glob(%s): got %v, want 1 shard", glob, fs) - } - - // Again, but don't index anything; should leave old shards intact. 
- b, err = NewBuilder(opts) - if err != nil { - t.Fatalf("NewBuilder: %v", err) - } - if err := b.Finish(); err != nil { - t.Errorf("Finish: %v", err) - } - - fs, err = filepath.Glob(glob) - if err != nil { - t.Fatalf("Glob(%s): %v", glob, err) - } else if len(fs) != 1 { - t.Fatalf("Glob(%s): got %v, want 1 shard", glob, fs) - } -} - -func TestPartialSuccess(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - opts := Options{ - IndexDir: dir, - ShardMax: 1024, - SizeMax: 1 << 20, - Parallelism: 1, - } - opts.RepositoryDescription.Name = "repo" - opts.SetDefaults() - - b, err := NewBuilder(opts) - if err != nil { - t.Fatalf("NewBuilder: %v", err) - } - - for i := 0; i < 4; i++ { - nm := fmt.Sprintf("F%d", i) - - // no error checking: the 2nd call will fail - b.AddFile(nm, []byte(strings.Repeat("01234567\n", 128))) - if i == 1 { - // force writes to fail. - if err := os.Chmod(dir, 0o555); err != nil { - t.Fatalf("chmod(%s): %s", dir, err) - } - } - } - - if err := os.Chmod(dir, 0o755); err != nil { - t.Fatalf("chmod(%s, writable): %s", dir, err) - } - - // No error checking. - b.Finish() - - // Finish cleans up temporary files. 
- if fs, err := filepath.Glob(dir + "/*"); err != nil { - t.Errorf("glob(%s): %v", dir, err) - } else if len(fs) != 0 { - t.Errorf("got shards %v, want []", fs) - } -} - -type filerankCase struct { - name string - docs []*zoekt.Document - want []int -} - -func testFileRankAspect(t *testing.T, c filerankCase) { - var want []*zoekt.Document - for _, j := range c.want { - want = append(want, c.docs[j]) - } - - got := make([]*zoekt.Document, len(c.docs)) - copy(got, c.docs) - sortDocuments(got) - - print := func(ds []*zoekt.Document) string { - r := "" - for _, d := range ds { - r += fmt.Sprintf("%v, ", d) - } - return r - } - if !reflect.DeepEqual(got, want) { - t.Errorf("got docs [%v], want [%v]", print(got), print(want)) - } -} - -func TestFileRank(t *testing.T) { - for _, c := range []filerankCase{{ - name: "filename", - docs: []*zoekt.Document{ - { - Name: "longlonglong", - Content: []byte("bla"), - }, - { - Name: "short", - Content: []byte("bla"), - }, - }, - want: []int{1, 0}, - }, { - name: "test", - docs: []*zoekt.Document{ - { - Name: "test", - Content: []byte("bla"), - }, - { - Name: "longlonglong", - Content: []byte("bla"), - }, - }, - want: []int{1, 0}, - }, { - name: "content", - docs: []*zoekt.Document{ - { - Content: []byte("bla"), - }, - { - Content: []byte("blablablabla"), - }, - { - Content: []byte("blabla"), - }, - }, - want: []int{0, 2, 1}, - }} { - t.Run(c.name, func(t *testing.T) { - testFileRankAspect(t, c) - }) - } -} - -func TestEmptyContent(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - opts := Options{ - IndexDir: dir, - RepositoryDescription: zoekt.Repository{ - Name: "repo", - }, - } - opts.SetDefaults() - - b, err := NewBuilder(opts) - if err != nil { - t.Fatalf("NewBuilder: %v", err) - } - if err := b.Finish(); err != nil { - t.Errorf("Finish: %v", err) - } - - fs, _ := filepath.Glob(dir + "/*") - if len(fs) != 1 { - t.Fatalf("want a shard, got %v", 
fs) - } - - ss, err := shards.NewDirectorySearcher(dir) - if err != nil { - t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) - } - defer ss.Close() - - ctx := context.Background() - result, err := ss.List(ctx, &query.Const{Value: true}) - if err != nil { - t.Fatalf("List: %v", err) - } - - if len(result.Repos) != 1 || result.Repos[0].Repository.Name != "repo" { - t.Errorf("got %+v, want 1 repo.", result.Repos) - } -}
diff --git a/cmd/flags.go b/cmd/flags.go deleted file mode 100644 index e0b9005..0000000 --- a/cmd/flags.go +++ /dev/null
@@ -1,45 +0,0 @@ -// Copyright 2019 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package cmd - -import ( - "flag" - "fmt" - "os" - "path/filepath" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" -) - -var ( - version = flag.Bool("version", false, "Print version number") - opts = &build.Options{} -) - -func init() { - opts.Flags(flag.CommandLine) -} - -func OptionsFromFlags() *build.Options { - if *version { - name := filepath.Base(os.Args[0]) - fmt.Printf("%s version %q\n", name, zoekt.Version) - os.Exit(0) - } - - opts.SetDefaults() - return opts -}
diff --git a/cmd/zoekt-archive-index/archive.go b/cmd/zoekt-archive-index/archive.go deleted file mode 100644 index c1afe5b..0000000 --- a/cmd/zoekt-archive-index/archive.go +++ /dev/null
@@ -1,188 +0,0 @@ -package main - -import ( - "archive/tar" - "archive/zip" - "bytes" - "compress/gzip" - "fmt" - "io" - "io/ioutil" - "net/http" - "net/url" - "os" - "strings" -) - -type Archive interface { - Next() (*File, error) - Close() error -} - -type File struct { - io.ReadCloser - Name string - Size int64 -} - -type tarArchive struct { - io.Closer - tr *tar.Reader -} - -func (a *tarArchive) Next() (*File, error) { - for { - hdr, err := a.tr.Next() - if err != nil { - return nil, err - } - - // We only care about files - if hdr.Typeflag != tar.TypeReg && hdr.Typeflag != tar.TypeRegA { - continue - } - - return &File{ - ReadCloser: ioutil.NopCloser(a.tr), - Name: hdr.Name, - Size: hdr.Size, - }, nil - } -} - -type zipArchive struct { - io.Closer - files []*zip.File -} - -func (a *zipArchive) Next() (*File, error) { - if len(a.files) == 0 { - return nil, io.EOF - } - - f := a.files[0] - a.files = a.files[1:] - - r, err := f.Open() - if err != nil { - return nil, err - } - - return &File{ - ReadCloser: r, - Name: f.Name, - Size: int64(f.UncompressedSize64), - }, nil -} - -func newZipArchive(r io.Reader, closer io.Closer) (*zipArchive, error) { - f, ok := r.(interface { - io.ReaderAt - Stat() (os.FileInfo, error) - }) - if !ok { - return nil, fmt.Errorf("streaming zip files not supported") - } - - fi, err := f.Stat() - if err != nil { - return nil, err - } - - zr, err := zip.NewReader(f, fi.Size()) - if err != nil { - return nil, err - } - - // Filter out non files - files := zr.File[:0] - for _, f := range zr.File { - if f.Mode().IsRegular() { - files = append(files, f) - } - } - - return &zipArchive{ - Closer: closer, - files: files, - }, nil -} - -func detectContentType(r io.Reader) (string, io.Reader, error) { - var buf [512]byte - n, err := io.ReadFull(r, buf[:]) - if err != nil && err != io.ErrUnexpectedEOF { - return "", nil, err - } - - ct := http.DetectContentType(buf[:n]) - - // If we are a seeker, we can just undo our read - if s, ok := 
r.(io.Seeker); ok { - _, err := s.Seek(int64(-n), io.SeekCurrent) - return ct, r, err - } - - // Otherwise return a new reader which merges in the read bytes - return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil -} - -func openReader(u string) (io.ReadCloser, error) { - if strings.HasPrefix(u, "https://") || strings.HasPrefix(u, "http://") { - resp, err := http.Get(u) - if err != nil { - return nil, err - } - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - b, err := ioutil.ReadAll(io.LimitReader(resp.Body, 1024)) - _ = resp.Body.Close() - if err != nil { - return nil, err - } - return nil, &url.Error{ - Op: "Get", - URL: u, - Err: fmt.Errorf("%s: %s", resp.Status, string(b)), - } - } - return resp.Body, nil - } else if u == "-" { - return ioutil.NopCloser(os.Stdin), nil - } - - return os.Open(u) -} - -// openArchive opens the tar at the URL or filepath u. Also supported is tgz -// files over http. -func openArchive(u string) (ar Archive, err error) { - readCloser, err := openReader(u) - if err != nil { - return nil, err - } - defer func() { - if err != nil { - _ = readCloser.Close() - } - }() - - ct, r, err := detectContentType(readCloser) - if err != nil { - return nil, err - } - switch ct { - case "application/x-gzip": - r, err = gzip.NewReader(r) - if err != nil { - return nil, err - } - - case "application/zip": - return newZipArchive(r, readCloser) - } - - return &tarArchive{ - Closer: readCloser, - tr: tar.NewReader(r), - }, nil -}
diff --git a/cmd/zoekt-archive-index/e2e_test.go b/cmd/zoekt-archive-index/e2e_test.go deleted file mode 100644 index daaf556..0000000 --- a/cmd/zoekt-archive-index/e2e_test.go +++ /dev/null
@@ -1,182 +0,0 @@ -package main - -import ( - "archive/tar" - "archive/zip" - "compress/gzip" - "context" - "errors" - "flag" - "fmt" - "io" - "io/ioutil" - "log" - "os" - "strings" - "testing" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" - "github.com/google/zoekt/query" - "github.com/google/zoekt/shards" -) - -func TestMain(m *testing.M) { - flag.Parse() - if !testing.Verbose() { - log.SetOutput(ioutil.Discard) - } - os.Exit(m.Run()) -} - -func writeArchive(w io.Writer, format string, files map[string]string) (err error) { - if format == "zip" { - zw := zip.NewWriter(w) - for name, body := range files { - f, err := zw.Create(name) - if err != nil { - return err - } - if _, err := f.Write([]byte(body)); err != nil { - return err - } - } - return zw.Close() - } - - if format == "tgz" { - gw := gzip.NewWriter(w) - defer func() { - err2 := gw.Close() - if err == nil { - err = err2 - } - }() - w = gw - format = "tar" - } - - if format != "tar" { - return errors.New("expected tar") - } - - tw := tar.NewWriter(w) - - for name, body := range files { - hdr := &tar.Header{ - Name: name, - Mode: 0o600, - Size: int64(len(body)), - } - if err := tw.WriteHeader(hdr); err != nil { - return err - } - if _, err := tw.Write([]byte(body)); err != nil { - return err - } - } - if err := tw.Close(); err != nil { - return err - } - - return nil -} - -// TestIndexArg tests zoekt-archive-index by creating an archive and then -// indexing and executing searches and checking we get expected results. -// Additionally, we test that the index is properly updated with the -// -incremental=true option changing the options between indexes and ensuring -// the results change as expected. 
-func TestIndexIncrementally(t *testing.T) { - for _, format := range []string{"tar", "tgz", "zip"} { - t.Run(format, func(t *testing.T) { - testIndexIncrementally(t, format) - }) - } -} - -func testIndexIncrementally(t *testing.T, format string) { - indexdir, err := ioutil.TempDir("", "TestIndexArg-index") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(indexdir) - archive, err := ioutil.TempFile("", "TestIndexArg-archive") - if err != nil { - t.Fatalf("TempFile: %v", err) - } - defer os.Remove(archive.Name()) - - fileSize := 1000 - - files := map[string]string{} - for i := 0; i < 4; i++ { - s := fmt.Sprintf("%d", i) - files["F"+s] = strings.Repeat("a", fileSize) - } - - err = writeArchive(archive, format, files) - if err != nil { - t.Fatalf("unable to create archive %v", err) - } - archive.Close() - - // tests contain options used to build an index and the expected number of - // files in the result set based on the options. - tests := []struct { - largeFiles []string - wantNumFiles int - }{ - { - largeFiles: []string{}, - wantNumFiles: 0, - }, - { - largeFiles: []string{"F0", "F2"}, - wantNumFiles: 2, - }, - } - - for _, test := range tests { - largeFiles, wantNumFiles := test.largeFiles, test.wantNumFiles - - bopts := build.Options{ - SizeMax: fileSize - 1, - IndexDir: indexdir, - LargeFiles: largeFiles, - } - opts := Options{ - Incremental: true, - Archive: archive.Name(), - Name: "repo", - Branch: "master", - Commit: "cccccccccccccccccccccccccccccccccccccccc", - Strip: 0, - } - - if err := do(opts, bopts); err != nil { - t.Fatalf("error creating index: %v", err) - } - - ss, err := shards.NewDirectorySearcher(indexdir) - if err != nil { - t.Fatalf("NewDirectorySearcher(%s): %v", indexdir, err) - } - defer ss.Close() - - q, err := query.Parse("aaa") - if err != nil { - t.Fatalf("Parse(aaa): %v", err) - } - - var sOpts zoekt.SearchOptions - result, err := ss.Search(context.Background(), q, &sOpts) - if err != nil { - 
t.Fatalf("Search(%v): %v", q, err) - } - - if len(result.Files) != wantNumFiles { - t.Errorf("got %v, want %d files.", result.Files, wantNumFiles) - } - } -}
diff --git a/cmd/zoekt-archive-index/main.go b/cmd/zoekt-archive-index/main.go deleted file mode 100644 index 8754b4e..0000000 --- a/cmd/zoekt-archive-index/main.go +++ /dev/null
@@ -1,232 +0,0 @@ -// Command zoekt-archive-index indexes an archive. -// -// Example via github.com: -// -// zoekt-archive-index -incremental -commit b57cb1605fd11ba2ecfa7f68992b4b9cc791934d -name github.com/gorilla/mux -strip_components 1 https://codeload.github.com/gorilla/mux/legacy.tar.gz/b57cb1605fd11ba2ecfa7f68992b4b9cc791934d -// -// zoekt-archive-index -branch master https://github.com/gorilla/mux/commit/b57cb1605fd11ba2ecfa7f68992b4b9cc791934d -package main - -import ( - "errors" - "flag" - "fmt" - "io" - "io/ioutil" - "log" - "net/url" - "strings" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" - "github.com/google/zoekt/cmd" - "github.com/google/zoekt/gitindex" - "go.uber.org/automaxprocs/maxprocs" -) - -// stripComponents removes the specified number of leading path -// elements. Pathnames with fewer elements will return the empty string. -func stripComponents(path string, count int) string { - for i := 0; path != "" && i < count; i++ { - i := strings.Index(path, "/") - if i < 0 { - return "" - } - path = path[i+1:] - } - return path -} - -// isGitOID checks if the revision is a git OID SHA string. -// -// Note: This doesn't mean the SHA exists in a repository, nor does it mean it -// isn't a ref. Git allows 40-char hexadecimal strings to be references. -func isGitOID(s string) bool { - if len(s) != 40 { - return false - } - for _, r := range s { - if !(('0' <= r && r <= '9') || - ('a' <= r && r <= 'f') || - ('A' <= r && r <= 'F')) { - return false - } - } - return true -} - -type Options struct { - Incremental bool - - Archive string - Name string - RepoURL string - Branch string - Commit string - Strip int -} - -func (o *Options) SetDefaults() { - // We guess based on the archive URL. 
- u, _ := url.Parse(o.Archive) - if u == nil { - return - } - - setRef := func(ref string) { - if isGitOID(ref) && o.Commit == "" { - o.Commit = ref - } - if !isGitOID(ref) && o.Branch == "" { - o.Branch = ref - } - } - - switch u.Host { - case "github.com", "codeload.github.com": - // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1 - // https://github.com/octokit/octokit.rb/blob/master/README.md - // https://github.com/octokit/octokit.rb/tree/master/lib - // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master - parts := strings.Split(u.Path, "/") - if len(parts) > 2 && o.Name == "" { - o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) - o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) - } - if len(parts) > 4 { - setRef(parts[4]) - if u.Host == "github.com" { - o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4]) - } - } - o.Strip = 1 - case "api.github.com": - // https://api.github.com/repos/octokit/octokit.rb/tarball/master - parts := strings.Split(u.Path, "/") - if len(parts) > 2 && o.Name == "" { - o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) - o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) - } - if len(parts) > 5 { - setRef(parts[5]) - } - o.Strip = 1 - } -} - -func do(opts Options, bopts build.Options) error { - opts.SetDefaults() - - if opts.Name == "" && opts.RepoURL == "" { - return errors.New("-name or -url required") - } - if opts.Branch == "" { - return errors.New("-branch required") - } - - if opts.Name != "" { - bopts.RepositoryDescription.Name = opts.Name - } - if opts.RepoURL != "" { - u, err := url.Parse(opts.RepoURL) - if err != nil { - return err - } - if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil { - return err - } - } - bopts.SetDefaults() - bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, 
Version: opts.Commit}} - brs := []string{opts.Branch} - - if opts.Incremental && bopts.IncrementalSkipIndexing() { - return nil - } - - a, err := openArchive(opts.Archive) - if err != nil { - return err - } - defer a.Close() - - bopts.RepositoryDescription.Source = opts.Archive - builder, err := build.NewBuilder(bopts) - if err != nil { - return err - } - - add := func(f *File) error { - defer f.Close() - - contents, err := ioutil.ReadAll(f) - if err != nil { - return err - } - - name := stripComponents(f.Name, opts.Strip) - if name == "" { - return nil - } - - return builder.Add(zoekt.Document{ - Name: name, - Content: contents, - Branches: brs, - }) - } - - for { - f, err := a.Next() - if err == io.EOF { - break - } - if err != nil { - return err - } - - if err := add(f); err != nil { - return err - } - } - - return builder.Finish() -} - -func main() { - var ( - incremental = flag.Bool("incremental", true, "only index changed repositories") - - name = flag.String("name", "", "The repository name for the archive") - urlRaw = flag.String("url", "", "The repository URL for the archive") - branch = flag.String("branch", "", "The branch name for the archive") - commit = flag.String("commit", "", "The commit sha for the archive. If incremental this will avoid updating shards already at commit") - strip = flag.Int("strip_components", 0, "Remove the specified number of leading path elements. Pathnames with fewer elements will be silently skipped.") - ) - flag.Parse() - - // Tune GOMAXPROCS to match Linux container CPU quota. - maxprocs.Set() - - log.SetFlags(log.LstdFlags | log.Lshortfile) - - if len(flag.Args()) != 1 { - log.Fatal("expected argument for archive location") - } - archive := flag.Args()[0] - bopts := cmd.OptionsFromFlags() - opts := Options{ - Incremental: *incremental, - - Archive: archive, - Name: *name, - RepoURL: *urlRaw, - Branch: *branch, - Commit: *commit, - Strip: *strip, - } - - if err := do(opts, *bopts); err != nil { - log.Fatal(err) - } -}
diff --git a/cmd/zoekt-git-clone/main.go b/cmd/zoekt-git-clone/main.go deleted file mode 100644 index eb85488..0000000 --- a/cmd/zoekt-git-clone/main.go +++ /dev/null
@@ -1,67 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This binary fetches all repos of a user or organization and clones -// them. It is strongly recommended to get a personal API token from -// https://github.com/settings/tokens, save the token in a file, and -// point the --token option to it. -package main - -import ( - "flag" - "fmt" - "log" - "net/url" - "os" - "path/filepath" - "strings" - - "github.com/google/zoekt/gitindex" -) - -func main() { - dest := flag.String("dest", "", "destination directory") - flag.Parse() - - if *dest == "" { - log.Fatal("must set --dest") - } - if len(flag.Args()) == 0 { - log.Fatal("must provide URL") - } - u, err := url.Parse(flag.Arg(0)) - if err != nil { - log.Fatalf("url.Parse: %v", err) - } - - name := filepath.Join(u.Host, u.Path) - name = strings.TrimSuffix(name, ".git") - - destDir := filepath.Dir(filepath.Join(*dest, name)) - if err := os.MkdirAll(destDir, 0o755); err != nil { - log.Fatal(err) - } - - config := map[string]string{ - "zoekt.name": name, - } - - destRepo, err := gitindex.CloneRepo(destDir, filepath.Base(name), u.String(), config) - if err != nil { - log.Fatalf("CloneRepo: %v", err) - } - if destRepo != "" { - fmt.Println(destRepo) - } -}
diff --git a/cmd/zoekt-git-index/main.go b/cmd/zoekt-git-index/main.go deleted file mode 100644 index 50997e5..0000000 --- a/cmd/zoekt-git-index/main.go +++ /dev/null
@@ -1,96 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "flag" - "log" - "os" - "path/filepath" - "strings" - - "github.com/google/zoekt/cmd" - "github.com/google/zoekt/gitindex" - "go.uber.org/automaxprocs/maxprocs" -) - -func main() { - allowMissing := flag.Bool("allow_missing_branches", false, "allow missing branches.") - submodules := flag.Bool("submodules", true, "if set to false, do not recurse into submodules") - branchesStr := flag.String("branches", "HEAD", "git branches to index.") - branchPrefix := flag.String("prefix", "refs/heads/", "prefix for branch names") - - incremental := flag.Bool("incremental", true, "only index changed repositories") - repoCacheDir := flag.String("repo_cache", "", "directory holding bare git repos, named by URL. "+ - "this is used to find repositories for submodules. "+ - "It also affects name if the indexed repository is under this directory.") - flag.Parse() - - // Tune GOMAXPROCS to match Linux container CPU quota. 
- maxprocs.Set() - - if *repoCacheDir != "" { - dir, err := filepath.Abs(*repoCacheDir) - if err != nil { - log.Fatalf("Abs: %v", err) - } - *repoCacheDir = dir - } - opts := cmd.OptionsFromFlags() - - var branches []string - if *branchesStr != "" { - branches = strings.Split(*branchesStr, ",") - } - - gitRepos := map[string]string{} - for _, repoDir := range flag.Args() { - repoDir, err := filepath.Abs(repoDir) - if err != nil { - log.Fatal(err) - } - repoDir = filepath.Clean(repoDir) - - name := strings.TrimSuffix(repoDir, "/.git") - if *repoCacheDir != "" && strings.HasPrefix(name, *repoCacheDir) { - name = strings.TrimPrefix(name, *repoCacheDir+"/") - name = strings.TrimSuffix(name, ".git") - } else { - name = strings.TrimSuffix(filepath.Base(name), ".git") - } - gitRepos[repoDir] = name - } - - exitStatus := 0 - for dir, name := range gitRepos { - opts.RepositoryDescription.Name = name - gitOpts := gitindex.Options{ - BranchPrefix: *branchPrefix, - Incremental: *incremental, - Submodules: *submodules, - RepoCacheDir: *repoCacheDir, - AllowMissingBranch: *allowMissing, - BuildOptions: *opts, - Branches: branches, - RepoDir: dir, - } - - if err := gitindex.IndexGitRepo(gitOpts); err != nil { - log.Printf("indexGitRepo(%s): %v", dir, err) - exitStatus = 1 - } - } - os.Exit(exitStatus) -}
diff --git a/cmd/zoekt-hg-index/main.go b/cmd/zoekt-hg-index/main.go deleted file mode 100644 index dd782a6..0000000 --- a/cmd/zoekt-hg-index/main.go +++ /dev/null
@@ -1,89 +0,0 @@ -// Copyright 2020 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// zoekt-hg-index provides bare-bones Mercurial indexing -package main - -import ( - "flag" - "fmt" - "log" - "path/filepath" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" - "github.com/google/zoekt/cmd" - - "go.uber.org/automaxprocs/maxprocs" - "humungus.tedunangst.com/r/gerc" -) - -func main() { - revisionStr := flag.String("revision", "", "hg revision to index") - flag.Parse() - maxprocs.Set() - opts := cmd.OptionsFromFlags() - - if len(flag.Args()) < 1 { - log.Fatal("hg repo directory argument missing") - } - dir, err := filepath.Abs(flag.Arg(0)) - if err != nil { - log.Fatal(err) - } - opts.RepositoryDescription.Name = dir - - if err := indexHg(dir, *revisionStr, opts); err != nil { - log.Fatal(err) - } -} - -func indexHg(dir, rev string, opts *build.Options) error { - r, err := gerc.Open(dir) - if err != nil { - log.Fatal(err) - } - defer r.Close() - - builder, err := build.NewBuilder(*opts) - if err != nil { - return err - } - defer builder.Finish() - - mfs, err := r.GetFiles(gerc.FilesArgs{ - Revision: rev, - }) - if err != nil { - return fmt.Errorf("GetFiles %v", err) - } - - for _, mf := range mfs { - fd := gerc.FileDataArgs{ - Filename: mf.Name, - Revision: rev, - } - content, err := r.GetFileData(fd) - if err != nil { - return fmt.Errorf("GetFileData %v", err) - } - if err := builder.Add(zoekt.Document{ - 
Name: mf.Name, - Content: content, - }); err != nil { - return err - } - } - return builder.Finish() -}
diff --git a/cmd/zoekt-index/main.go b/cmd/zoekt-index/main.go deleted file mode 100644 index a103f21..0000000 --- a/cmd/zoekt-index/main.go +++ /dev/null
@@ -1,143 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "flag" - "fmt" - "io/ioutil" - "log" - "os" - "path/filepath" - "runtime/pprof" - "strings" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" - "github.com/google/zoekt/cmd" - "go.uber.org/automaxprocs/maxprocs" -) - -type fileInfo struct { - name string - size int64 -} - -type fileAggregator struct { - ignoreDirs map[string]struct{} - sizeMax int64 - sink chan fileInfo -} - -func (a *fileAggregator) add(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - - if info.IsDir() { - base := filepath.Base(path) - if _, ok := a.ignoreDirs[base]; ok { - return filepath.SkipDir - } - } - - if info.Mode().IsRegular() { - a.sink <- fileInfo{path, info.Size()} - } - return nil -} - -func main() { - cpuProfile := flag.String("cpu_profile", "", "write cpu profile to file") - ignoreDirs := flag.String("ignore_dirs", ".git,.hg,.svn", "comma separated list of directories to ignore.") - flag.Parse() - - // Tune GOMAXPROCS to match Linux container CPU quota. 
- maxprocs.Set() - - opts := cmd.OptionsFromFlags() - if *cpuProfile != "" { - f, err := os.Create(*cpuProfile) - if err != nil { - log.Fatal(err) - } - pprof.StartCPUProfile(f) - defer pprof.StopCPUProfile() - } - - ignoreDirMap := map[string]struct{}{} - if *ignoreDirs != "" { - dirs := strings.Split(*ignoreDirs, ",") - for _, d := range dirs { - d = strings.TrimSpace(d) - if d != "" { - ignoreDirMap[d] = struct{}{} - } - } - } - for _, arg := range flag.Args() { - opts.RepositoryDescription.Source = arg - if err := indexArg(arg, *opts, ignoreDirMap); err != nil { - log.Fatal(err) - } - } -} - -func indexArg(arg string, opts build.Options, ignore map[string]struct{}) error { - dir, err := filepath.Abs(filepath.Clean(arg)) - if err != nil { - return err - } - - opts.RepositoryDescription.Name = filepath.Base(dir) - builder, err := build.NewBuilder(opts) - if err != nil { - return err - } - defer builder.Finish() - - comm := make(chan fileInfo, 100) - agg := fileAggregator{ - ignoreDirs: ignore, - sink: comm, - sizeMax: int64(opts.SizeMax), - } - - go func() { - if err := filepath.Walk(dir, agg.add); err != nil { - log.Fatal(err) - } - close(comm) - }() - - for f := range comm { - displayName := strings.TrimPrefix(f.name, dir+"/") - if f.size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(displayName) { - builder.Add(zoekt.Document{ - Name: displayName, - SkipReason: fmt.Sprintf("document size %d larger than limit %d", f.size, opts.SizeMax), - }) - continue - } - content, err := ioutil.ReadFile(f.name) - if err != nil { - return err - } - - builder.AddFile(displayName, content) - } - - return builder.Finish() -}
diff --git a/cmd/zoekt-indexserver/config.go b/cmd/zoekt-indexserver/config.go deleted file mode 100644 index d213774..0000000 --- a/cmd/zoekt-indexserver/config.go +++ /dev/null
@@ -1,268 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "bytes" - "encoding/json" - "io/ioutil" - "log" - "math/rand" - "net/http" - "net/url" - "os" - "os/exec" - "path/filepath" - "time" - - "github.com/fsnotify/fsnotify" -) - -type ConfigEntry struct { - GithubUser string - GithubOrg string - BitBucketServerProject string - GitHubURL string - GitilesURL string - CGitURL string - BitBucketServerURL string - DisableTLS bool - CredentialPath string - ProjectType string - Name string - Exclude string - GitLabURL string - OnlyPublic bool - GerritApiURL string - Topics []string - ExcludeTopics []string -} - -func randomize(entries []ConfigEntry) []ConfigEntry { - perm := rand.Perm(len(entries)) - - var shuffled []ConfigEntry - for _, i := range perm { - shuffled = append(shuffled, entries[i]) - } - - return shuffled -} - -func isHTTP(u string) bool { - asURL, err := url.Parse(u) - return err == nil && (asURL.Scheme == "http" || asURL.Scheme == "https") -} - -func readConfigURL(u string) ([]ConfigEntry, error) { - var body []byte - var readErr error - - if isHTTP(u) { - rep, err := http.Get(u) - if err != nil { - return nil, err - } - defer rep.Body.Close() - - body, readErr = ioutil.ReadAll(rep.Body) - } else { - body, readErr = ioutil.ReadFile(u) - } - - if readErr != nil { - return nil, readErr - } - - var result []ConfigEntry - if err := json.Unmarshal(body, 
&result); err != nil { - return nil, err - } - return result, nil -} - -func watchFile(path string) (<-chan struct{}, error) { - watcher, err := fsnotify.NewWatcher() - if err != nil { - return nil, err - } - - if err := watcher.Add(filepath.Dir(path)); err != nil { - return nil, err - } - - out := make(chan struct{}, 1) - go func() { - var last time.Time - for { - select { - case <-watcher.Events: - fi, err := os.Stat(path) - if err == nil && fi.ModTime() != last { - out <- struct{}{} - last = fi.ModTime() - } - case err := <-watcher.Errors: - if err != nil { - log.Printf("watcher error: %v", err) - } - } - } - }() - return out, nil -} - -func periodicMirrorFile(repoDir string, opts *Options, pendingRepos chan<- string) { - ticker := time.NewTicker(opts.mirrorInterval) - - var watcher <-chan struct{} - if !isHTTP(opts.mirrorConfigFile) { - var err error - watcher, err = watchFile(opts.mirrorConfigFile) - if err != nil { - log.Printf("watchFile(%q): %v", opts.mirrorConfigFile, err) - } - } - - var lastCfg []ConfigEntry - for { - cfg, err := readConfigURL(opts.mirrorConfigFile) - if err != nil { - log.Printf("readConfig(%s): %v", opts.mirrorConfigFile, err) - } else { - lastCfg = cfg - } - - executeMirror(lastCfg, repoDir, pendingRepos) - - select { - case <-watcher: - log.Printf("mirror config %s changed", opts.mirrorConfigFile) - case <-ticker.C: - } - } -} - -func executeMirror(cfg []ConfigEntry, repoDir string, pendingRepos chan<- string) { - // Randomize the ordering in which we query - // things. This is to ensure that quota limits don't - // always hit the last one in the list. 
- cfg = randomize(cfg) - for _, c := range cfg { - var cmd *exec.Cmd - if c.GitHubURL != "" || c.GithubUser != "" || c.GithubOrg != "" { - cmd = exec.Command("zoekt-mirror-github", - "-dest", repoDir, "-delete") - if c.GitHubURL != "" { - cmd.Args = append(cmd.Args, "-url", c.GitHubURL) - } - if c.GithubUser != "" { - cmd.Args = append(cmd.Args, "-user", c.GithubUser) - } else if c.GithubOrg != "" { - cmd.Args = append(cmd.Args, "-org", c.GithubOrg) - } - if c.Name != "" { - cmd.Args = append(cmd.Args, "-name", c.Name) - } - if c.Exclude != "" { - cmd.Args = append(cmd.Args, "-exclude", c.Exclude) - } - if c.CredentialPath != "" { - cmd.Args = append(cmd.Args, "-token", c.CredentialPath) - } - for _, topic := range c.Topics { - cmd.Args = append(cmd.Args, "-topic", topic) - } - for _, topic := range c.ExcludeTopics { - cmd.Args = append(cmd.Args, "-exclude_topic", topic) - } - } else if c.GitilesURL != "" { - cmd = exec.Command("zoekt-mirror-gitiles", - "-dest", repoDir, "-name", c.Name) - if c.Exclude != "" { - cmd.Args = append(cmd.Args, "-exclude", c.Exclude) - } - cmd.Args = append(cmd.Args, c.GitilesURL) - } else if c.CGitURL != "" { - cmd = exec.Command("zoekt-mirror-gitiles", - "-type", "cgit", - "-dest", repoDir, "-name", c.Name) - if c.Exclude != "" { - cmd.Args = append(cmd.Args, "-exclude", c.Exclude) - } - cmd.Args = append(cmd.Args, c.CGitURL) - } else if c.BitBucketServerURL != "" { - cmd = exec.Command("zoekt-mirror-bitbucket-server", - "-dest", repoDir, "-url", c.BitBucketServerURL, "-delete") - if c.BitBucketServerProject != "" { - cmd.Args = append(cmd.Args, "-project", c.BitBucketServerProject) - } - if c.DisableTLS { - cmd.Args = append(cmd.Args, "-disable-tls") - } - if c.ProjectType != "" { - cmd.Args = append(cmd.Args, "-type", c.ProjectType) - } - if c.Name != "" { - cmd.Args = append(cmd.Args, "-name", c.Name) - } - if c.Exclude != "" { - cmd.Args = append(cmd.Args, "-exclude", c.Exclude) - } - if c.CredentialPath != "" { - cmd.Args = 
append(cmd.Args, "-credentials", c.CredentialPath) - } - } else if c.GitLabURL != "" { - cmd = exec.Command("zoekt-mirror-gitlab", - "-dest", repoDir, "-url", c.GitLabURL) - if c.Name != "" { - cmd.Args = append(cmd.Args, "-name", c.Name) - } - if c.Exclude != "" { - cmd.Args = append(cmd.Args, "-exclude", c.Exclude) - } - if c.OnlyPublic { - cmd.Args = append(cmd.Args, "-public") - } - if c.CredentialPath != "" { - cmd.Args = append(cmd.Args, "-token", c.CredentialPath) - } - } else if c.GerritApiURL != "" { - cmd = exec.Command("zoekt-mirror-gerrit", - "-dest", repoDir) - if c.CredentialPath != "" { - cmd.Args = append(cmd.Args, "-http-credentials", c.CredentialPath) - } - if c.Name != "" { - cmd.Args = append(cmd.Args, "-name", c.Name) - } - if c.Exclude != "" { - cmd.Args = append(cmd.Args, "-exclude", c.Exclude) - } - cmd.Args = append(cmd.Args, c.GerritApiURL) - } - - stdout, _ := loggedRun(cmd) - - for _, fn := range bytes.Split(stdout, []byte{'\n'}) { - if len(fn) == 0 { - continue - } - - pendingRepos <- string(fn) - } - - } -}
diff --git a/cmd/zoekt-indexserver/main.go b/cmd/zoekt-indexserver/main.go deleted file mode 100644 index 8392bb8..0000000 --- a/cmd/zoekt-indexserver/main.go +++ /dev/null
@@ -1,298 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This program manages a zoekt indexing deployment: -// * recycling logs -// * periodically fetching new data. -// * periodically reindexing all git repos. - -package main - -import ( - "bytes" - "context" - "flag" - "fmt" - "log" - "math" - "os" - "os/exec" - "path/filepath" - "runtime" - "strings" - "time" - - "github.com/google/zoekt" - "github.com/google/zoekt/gitindex" -) - -const day = time.Hour * 24 - -func loggedRun(cmd *exec.Cmd) (out, err []byte) { - outBuf := &bytes.Buffer{} - errBuf := &bytes.Buffer{} - cmd.Stdout = outBuf - cmd.Stderr = errBuf - - log.Printf("run %v", cmd.Args) - if err := cmd.Run(); err != nil { - log.Printf("command %s failed: %v\nOUT: %s\nERR: %s", - cmd.Args, err, outBuf.String(), errBuf.String()) - } - - return outBuf.Bytes(), errBuf.Bytes() -} - -type Options struct { - cpuFraction float64 - cpuCount int - fetchInterval time.Duration - mirrorInterval time.Duration - indexFlagsStr string - indexFlags []string - mirrorConfigFile string - maxLogAge time.Duration - indexTimeout time.Duration -} - -func (o *Options) validate() { - if o.cpuFraction <= 0.0 || o.cpuFraction > 1.0 { - log.Fatal("cpu_fraction must be between 0.0 and 1.0") - } - - o.cpuCount = int(math.Trunc(float64(runtime.GOMAXPROCS(0)) * o.cpuFraction)) - if o.cpuCount < 1 { - o.cpuCount = 1 - } - if o.indexFlagsStr != "" { - o.indexFlags = 
strings.Split(o.indexFlagsStr, " ") - } -} - -func (o *Options) defineFlags() { - flag.DurationVar(&o.indexTimeout, "index_timeout", time.Hour, "kill index job after this much time") - flag.DurationVar(&o.maxLogAge, "max_log_age", 3*day, "recycle index logs after this much time") - flag.DurationVar(&o.fetchInterval, "fetch_interval", time.Hour, "run fetches this often") - flag.StringVar(&o.mirrorConfigFile, "mirror_config", - "", "JSON file holding mirror configuration.") - - flag.DurationVar(&o.mirrorInterval, "mirror_duration", 24*time.Hour, "find and clone new repos at this frequency.") - flag.Float64Var(&o.cpuFraction, "cpu_fraction", 0.25, - "use this fraction of the cores for indexing.") - flag.StringVar(&o.indexFlagsStr, "git_index_flags", "", "space separated list of flags passed through to zoekt-git-index (e.g. -git_index_flags='-symbols=false -submodules=false'") -} - -// periodicFetch runs git-fetch every once in a while. Results are -// posted on pendingRepos. -func periodicFetch(repoDir, indexDir string, opts *Options, pendingRepos chan<- string) { - t := time.NewTicker(opts.fetchInterval) - for { - repos, err := gitindex.FindGitRepos(repoDir) - if err != nil { - log.Println(err) - continue - } - if len(repos) == 0 { - log.Printf("no repos found under %s", repoDir) - } - - // TODO: Randomize to make sure quota throttling hits everyone. - - later := map[string]struct{}{} - for _, dir := range repos { - if ok := fetchGitRepo(dir); !ok { - later[dir] = struct{}{} - } else { - pendingRepos <- dir - } - } - - for r := range later { - pendingRepos <- r - } - - <-t.C - } -} - -// fetchGitRepo runs git-fetch, and returns true if there was an -// update. 
-func fetchGitRepo(dir string) bool { - cmd := exec.Command("git", "--git-dir", dir, "fetch", "origin") - outBuf := &bytes.Buffer{} - errBuf := &bytes.Buffer{} - - // Prevent prompting - cmd.Stdin = &bytes.Buffer{} - cmd.Stderr = errBuf - cmd.Stdout = outBuf - if err := cmd.Run(); err != nil { - log.Printf("command %s failed: %v\nOUT: %s\nERR: %s", - cmd.Args, err, outBuf.String(), errBuf.String()) - } else { - return len(outBuf.Bytes()) != 0 - } - return false -} - -// indexPendingRepos consumes the directories on the repos channel and -// indexes them, sequentially. -func indexPendingRepos(indexDir, repoDir string, opts *Options, repos <-chan string) { - for dir := range repos { - indexPendingRepo(dir, indexDir, repoDir, opts) - - // Failures (eg. timeout) will leave temp files - // around. We have to clean them, or they will fill up the indexing volume. - if failures, err := filepath.Glob(filepath.Join(indexDir, "*.tmp")); err != nil { - log.Printf("Glob: %v", err) - } else { - for _, f := range failures { - os.Remove(f) - } - } - } -} - -func indexPendingRepo(dir, indexDir, repoDir string, opts *Options) { - ctx, cancel := context.WithTimeout(context.Background(), opts.indexTimeout) - defer cancel() - args := []string{ - "-require_ctags", - fmt.Sprintf("-parallelism=%d", opts.cpuCount), - "-repo_cache", repoDir, - "-index", indexDir, - "-incremental", - } - args = append(args, opts.indexFlags...) - args = append(args, dir) - cmd := exec.CommandContext(ctx, "zoekt-git-index", args...) - loggedRun(cmd) -} - -// deleteLogs deletes old logs. 
-func deleteLogs(logDir string, maxAge time.Duration) { - fs, err := filepath.Glob(filepath.Join(logDir, "*")) - if err != nil { - log.Fatalf("filepath.Glob(%s): %v", logDir, err) - } - - threshold := time.Now().Add(-maxAge) - for _, fn := range fs { - if fi, err := os.Lstat(fn); err == nil && fi.ModTime().Before(threshold) { - os.Remove(fn) - } - } -} - -func deleteLogsLoop(logDir string, maxAge time.Duration) { - tick := time.NewTicker(maxAge / 100) - for { - deleteLogs(logDir, maxAge) - <-tick.C - } -} - -// Delete the shard if its corresponding git repo can't be found. -func deleteIfOrphan(repoDir string, fn string) error { - f, err := os.Open(fn) - if err != nil { - return nil - } - defer f.Close() - - ifile, err := zoekt.NewIndexFile(f) - if err != nil { - return nil - } - defer ifile.Close() - - repo, _, err := zoekt.ReadMetadata(ifile) - if err != nil { - return nil - } - - _, err = os.Stat(repo.Source) - if os.IsNotExist(err) { - log.Printf("deleting orphan shard %s; source %q not found", fn, repo.Source) - return os.Remove(fn) - } - - return err -} - -func deleteOrphanIndexes(indexDir, repoDir string, watchInterval time.Duration) { - t := time.NewTicker(watchInterval) - - expr := indexDir + "/*" - for { - fs, err := filepath.Glob(expr) - if err != nil { - log.Printf("Glob(%q): %v", expr, err) - } - - for _, f := range fs { - if err := deleteIfOrphan(repoDir, f); err != nil { - log.Printf("deleteIfOrphan(%q): %v", f, err) - } - } - <-t.C - } -} - -func main() { - var opts Options - opts.defineFlags() - dataDir := flag.String("data_dir", - filepath.Join(os.Getenv("HOME"), "zoekt-serving"), "directory holding all data.") - indexDir := flag.String("index_dir", "", "directory holding index shards. Defaults to $data_dir/index/") - flag.Parse() - opts.validate() - - if *dataDir == "" { - log.Fatal("must set --data_dir") - } - - // Automatically prepend our own path at the front, to minimize - // required configuration. 
- if l, err := os.Readlink("/proc/self/exe"); err == nil { - os.Setenv("PATH", filepath.Dir(l)+":"+os.Getenv("PATH")) - } - - logDir := filepath.Join(*dataDir, "logs") - if *indexDir == "" { - *indexDir = filepath.Join(*dataDir, "index") - } - repoDir := filepath.Join(*dataDir, "repos") - for _, s := range []string{logDir, *indexDir, repoDir} { - if _, err := os.Stat(s); err == nil { - continue - } - - if err := os.MkdirAll(s, 0o755); err != nil { - log.Fatalf("MkdirAll %s: %v", s, err) - } - } - - _, err := readConfigURL(opts.mirrorConfigFile) - if err != nil { - log.Fatalf("readConfigURL(%s): %v", opts.mirrorConfigFile, err) - } - - pendingRepos := make(chan string, 10) - go periodicMirrorFile(repoDir, &opts, pendingRepos) - go deleteLogsLoop(logDir, opts.maxLogAge) - go deleteOrphanIndexes(*indexDir, repoDir, opts.fetchInterval) - go indexPendingRepos(*indexDir, repoDir, &opts, pendingRepos) - periodicFetch(repoDir, *indexDir, &opts, pendingRepos) -}
diff --git a/cmd/zoekt-mirror-bitbucket-server/main.go b/cmd/zoekt-mirror-bitbucket-server/main.go deleted file mode 100644 index 9cfb204..0000000 --- a/cmd/zoekt-mirror-bitbucket-server/main.go +++ /dev/null
@@ -1,270 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This binary fetches all repos of a project, and of a specific type, in case -// these are specified, and clones them. By default it fetches and clones all -// existing repos. -package main - -import ( - "context" - "crypto/tls" - "flag" - "fmt" - "io/ioutil" - "log" - "net/http" - "net/url" - "os" - "path/filepath" - "strings" - "time" - - "github.com/gfleury/go-bitbucket-v1" - - "github.com/google/zoekt/gitindex" -) - -func main() { - dest := flag.String("dest", "", "destination directory") - serverUrl := flag.String("url", "", "BitBucket Server url") - disableTLS := flag.Bool("disable-tls", false, "disables TLS verification") - credentialsFile := flag.String("credentials", ".bitbucket-credentials", "file holding BitBucket Server credentials") - project := flag.String("project", "", "project to mirror") - deleteRepos := flag.Bool("delete", false, "delete missing repos") - namePattern := flag.String("name", "", "only clone repos whose name matches the given regexp.") - excludePattern := flag.String("exclude", "", "don't mirror repos whose names match this regexp.") - projectType := flag.String("type", "", "only clone repos whose type matches the given string. "+ - "Type can be either NORMAl or PERSONAL. 
Clones projects of both types if not set.") - flag.Parse() - - if *serverUrl == "" { - log.Fatal("must set --url") - } - - rootURL, err := url.Parse(*serverUrl) - if err != nil { - log.Fatalf("url.Parse(): %v", err) - } - - if *dest == "" { - log.Fatal("must set --dest") - } - - if *projectType != "" && !IsValidProjectType(*projectType) { - log.Fatal("type should be either NORMAL or PERSONAL") - } - - destDir := filepath.Join(*dest, rootURL.Host) - if err := os.MkdirAll(destDir, 0o755); err != nil { - log.Fatal(err) - } - - username := "" - password := "" - if *credentialsFile == "" { - log.Fatal("must set --credentials") - } else { - content, err := ioutil.ReadFile(*credentialsFile) - if err != nil { - log.Fatal(err) - } - credentials := strings.Fields(string(content)) - username, password = credentials[0], credentials[1] - } - - basicAuth := bitbucketv1.BasicAuth{UserName: username, Password: password} - ctx, cancel := context.WithTimeout(context.Background(), 120000*time.Millisecond) - ctx = context.WithValue(ctx, bitbucketv1.ContextBasicAuth, basicAuth) - defer cancel() - - apiPath, err := url.Parse("/rest") - if err != nil { - log.Fatal(err) - } - - apiBaseURL := rootURL.ResolveReference(apiPath).String() - - var config *bitbucketv1.Configuration - if *disableTLS { - tr := &http.Transport{ - TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, - } - httpClient := &http.Client{ - Transport: tr, - } - httpClientConfig := func(configs *bitbucketv1.Configuration) { - configs.HTTPClient = httpClient - } - config = bitbucketv1.NewConfiguration(apiBaseURL, httpClientConfig) - } else { - config = bitbucketv1.NewConfiguration(apiBaseURL) - } - client := bitbucketv1.NewAPIClient(ctx, config) - - var repos []bitbucketv1.Repository - - if *project != "" { - repos, err = getProjectRepos(*client, *project) - } else { - repos, err = getAllRepos(*client) - } - - if err != nil { - log.Fatal(err) - } - - filter, err := gitindex.NewFilter(*namePattern, *excludePattern) - if 
err != nil { - log.Fatal(err) - } - - trimmed := repos[:0] - for _, r := range repos { - if filter.Include(r.Slug) && (*projectType == "" || r.Project.Type == *projectType) { - trimmed = append(trimmed, r) - } - } - repos = trimmed - - if err := cloneRepos(destDir, rootURL.Host, repos, password); err != nil { - log.Fatalf("cloneRepos: %v", err) - } - - if *deleteRepos { - if err := deleteStaleRepos(*dest, filter, repos); err != nil { - log.Fatalf("deleteStaleRepos: %v", err) - } - } -} - -func deleteStaleRepos(destDir string, filter *gitindex.Filter, repos []bitbucketv1.Repository) error { - var baseURL string - if len(repos) > 0 { - baseURL = repos[0].Links.Self[0].Href - } else { - return nil - } - u, err := url.Parse(baseURL) - if err != nil { - return err - } - u.Path = "" - - names := map[string]struct{}{} - for _, r := range repos { - names[filepath.Join(u.Host, r.Project.Key, r.Slug+".git")] = struct{}{} - } - - if err := gitindex.DeleteRepos(destDir, u, names, filter); err != nil { - log.Fatalf("deleteRepos: %v", err) - } - return nil -} - -func IsValidProjectType(projectType string) bool { - switch projectType { - case "NORMAL", "PERSONAL": - return true - } - return false -} - -func getAllRepos(client bitbucketv1.APIClient) ([]bitbucketv1.Repository, error) { - var allRepos []bitbucketv1.Repository - opts := map[string]interface{}{ - "limit": 1000, - "start": 0, - } - - for { - resp, err := client.DefaultApi.GetRepositories_19(opts) - if err != nil { - return nil, err - } - - repos, err := bitbucketv1.GetRepositoriesResponse(resp) - if err != nil { - return nil, err - } - - if len(repos) == 0 { - break - } - - opts["start"] = opts["start"].(int) + opts["limit"].(int) - - allRepos = append(allRepos, repos...) 
- } - return allRepos, nil -} - -func getProjectRepos(client bitbucketv1.APIClient, projectName string) ([]bitbucketv1.Repository, error) { - var allRepos []bitbucketv1.Repository - opts := map[string]interface{}{ - "limit": 1000, - "start": 0, - } - - for { - resp, err := client.DefaultApi.GetRepositoriesWithOptions(projectName, opts) - if err != nil { - return nil, err - } - - repos, err := bitbucketv1.GetRepositoriesResponse(resp) - if err != nil { - return nil, err - } - - if len(repos) == 0 { - break - } - - opts["start"] = opts["start"].(int) + opts["limit"].(int) - - allRepos = append(allRepos, repos...) - } - return allRepos, nil -} - -func cloneRepos(destDir string, host string, repos []bitbucketv1.Repository, password string) error { - for _, r := range repos { - fullName := filepath.Join(r.Project.Key, r.Slug) - config := map[string]string{ - "zoekt.web-url-type": "bitbucket-server", - "zoekt.web-url": r.Links.Self[0].Href, - "zoekt.name": filepath.Join(host, fullName), - } - - httpsCloneUrl := "" - for _, cloneUrl := range r.Links.Clone { - // In fact, this is an https url, i.e. there's no separate Name for https. - if cloneUrl.Name == "http" { - s := strings.Split(cloneUrl.Href, "@") - httpsCloneUrl = s[0] + ":" + password + "@" + s[1] - } - } - - if httpsCloneUrl != "" { - dest, err := gitindex.CloneRepo(destDir, fullName, httpsCloneUrl, config) - if err != nil { - return err - } - if dest != "" { - fmt.Println(dest) - } - } - } - - return nil -}
diff --git a/cmd/zoekt-mirror-gerrit/main.go b/cmd/zoekt-mirror-gerrit/main.go deleted file mode 100644 index b186de7..0000000 --- a/cmd/zoekt-mirror-gerrit/main.go +++ /dev/null
@@ -1,182 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This binary fetches all repos of a Gerrit host. - -package main - -import ( - "bytes" - "flag" - "fmt" - "io/ioutil" - "log" - "net/http" - "net/url" - "path/filepath" - "strconv" - "strings" - - gerrit "github.com/andygrunwald/go-gerrit" - "github.com/google/zoekt/gitindex" -) - -type loggingRT struct { - http.RoundTripper -} - -type closeBuffer struct { - *bytes.Buffer -} - -func (b *closeBuffer) Close() error { return nil } - -const debug = false - -func (rt *loggingRT) RoundTrip(req *http.Request) (rep *http.Response, err error) { - if debug { - log.Println("Req: ", req) - } - rep, err = rt.RoundTripper.RoundTrip(req) - if debug { - log.Println("Rep: ", rep, err) - } - if err == nil { - body, _ := ioutil.ReadAll(rep.Body) - - rep.Body.Close() - if debug { - log.Println("body: ", string(body)) - } - rep.Body = &closeBuffer{bytes.NewBuffer(body)} - } - return rep, err -} - -func newLoggingClient() *http.Client { - return &http.Client{ - Transport: &loggingRT{ - RoundTripper: http.DefaultTransport, - }, - } -} - -func main() { - dest := flag.String("dest", "", "destination directory") - namePattern := flag.String("name", "", "only clone repos whose name matches the regexp.") - excludePattern := flag.String("exclude", "", "don't mirror repos whose names match this regexp.") - httpCrendentialsPath := flag.String("http-credentials", "", 
"path to a file containing http credentials stored like 'user:password'.") - flag.Parse() - - if len(flag.Args()) < 1 { - log.Fatal("must provide URL argument.") - } - - rootURL, err := url.Parse(flag.Arg(0)) - if err != nil { - log.Fatalf("url.Parse(): %v", err) - } - - if *httpCrendentialsPath != "" { - creds, err := ioutil.ReadFile(*httpCrendentialsPath) - if err != nil { - log.Print("Cannot read gerrit http credentials, going Anonymous") - } else { - splitCreds := strings.Split(strings.TrimSpace(string(creds)), ":") - rootURL.User = url.UserPassword(splitCreds[0], splitCreds[1]) - } - } - - if *dest == "" { - log.Fatal("must set --dest") - } - - filter, err := gitindex.NewFilter(*namePattern, *excludePattern) - if err != nil { - log.Fatal(err) - } - - client, err := gerrit.NewClient(rootURL.String(), newLoggingClient()) - if err != nil { - log.Fatalf("NewClient(%s): %v", rootURL, err) - } - - info, _, err := client.Config.GetServerInfo() - if err != nil { - log.Fatalf("GetServerInfo: %v", err) - } - - var projectURL string - for _, s := range []string{"http", "anonymous http"} { - projectURL = info.Download.Schemes[s].URL - } - if projectURL == "" { - log.Fatalf("project URL is empty, got Schemes %#v", info.Download.Schemes) - } - - projects := make(map[string]gerrit.ProjectInfo) - skip := "0" - for { - page, _, err := client.Projects.ListProjects(&gerrit.ProjectOptions{Skip: skip}) - if err != nil { - log.Fatalf("ListProjects: %v", err) - } - - if len(*page) == 0 { - break - } - for k, v := range *page { - projects[k] = v - } - skip = strconv.Itoa(len(projects)) - } - - for k, v := range projects { - if !filter.Include(k) { - continue - } - - cloneURL, err := url.Parse(strings.Replace(projectURL, "${project}", k, -1)) - if err != nil { - log.Fatalf("url.Parse: %v", err) - } - - name := filepath.Join(cloneURL.Host, cloneURL.Path) - config := map[string]string{ - "zoekt.name": name, - "zoekt.gerrit-project": k, - "zoekt.gerrit-host": rootURL.String(), - } - - 
for _, wl := range v.WebLinks { - // default gerrit gitiles config is named browse, and does not include - // root domain name in it. Cheating. - switch wl.Name { - case "browse": - config["zoekt.web-url"] = fmt.Sprintf("%s://%s%s", rootURL.Scheme, - rootURL.Host, wl.URL) - config["zoekt.web-url-type"] = "gitiles" - default: - config["zoekt.web-url"] = wl.URL - config["zoekt.web-url-type"] = wl.Name - } - } - - if dest, err := gitindex.CloneRepo(*dest, name, cloneURL.String(), config); err != nil { - log.Fatalf("CloneRepo: %v", err) - } else { - fmt.Println(dest) - } - } -}
diff --git a/cmd/zoekt-mirror-github/main.go b/cmd/zoekt-mirror-github/main.go deleted file mode 100644 index 1b0d8f8..0000000 --- a/cmd/zoekt-mirror-github/main.go +++ /dev/null
@@ -1,314 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This binary fetches all repos of a user or organization and clones -// them. It is strongly recommended to get a personal API token from -// https://github.com/settings/tokens, save the token in a file, and -// point the --token option to it. -package main - -import ( - "context" - "flag" - "fmt" - "io/ioutil" - "log" - "net/url" - "os" - "path/filepath" - "strconv" - "strings" - - "github.com/google/go-github/v27/github" - "golang.org/x/oauth2" - - "github.com/google/zoekt/gitindex" -) - -type topicsFlag []string - -func (f *topicsFlag) String() string { - return strings.Join(*f, ",") -} - -func (f *topicsFlag) Set(value string) error { - *f = append(*f, value) - return nil -} - -type reposFilters struct { - topics []string - excludeTopics []string -} - -func main() { - dest := flag.String("dest", "", "destination directory") - githubURL := flag.String("url", "", "GitHub Enterprise url. 
If not set github.com will be used as the host.") - org := flag.String("org", "", "organization to mirror") - user := flag.String("user", "", "user to mirror") - token := flag.String("token", - filepath.Join(os.Getenv("HOME"), ".github-token"), - "file holding API token.") - forks := flag.Bool("forks", false, "also mirror forks.") - deleteRepos := flag.Bool("delete", false, "delete missing repos") - namePattern := flag.String("name", "", "only clone repos whose name matches the given regexp.") - excludePattern := flag.String("exclude", "", "don't mirror repos whose names match this regexp.") - topics := topicsFlag{} - flag.Var(&topics, "topic", "only clone repos whose have one of given topics. You can add multiple topics by setting this more than once.") - excludeTopics := topicsFlag{} - flag.Var(&excludeTopics, "exclude_topic", "don't clone repos whose have one of given topics. You can add multiple topics by setting this more than once.") - - flag.Parse() - - if *dest == "" { - log.Fatal("must set --dest") - } - if *githubURL == "" && *org == "" && *user == "" { - log.Fatal("must set either --org or --user when github.com is used as host") - } - - var host string - var apiBaseURL string - var client *github.Client - if *githubURL != "" { - rootURL, err := url.Parse(*githubURL) - if err != nil { - log.Fatal(err) - } - host = rootURL.Host - apiPath, err := url.Parse("/api/v3/") - if err != nil { - log.Fatal(err) - } - apiBaseURL = rootURL.ResolveReference(apiPath).String() - client, err = github.NewEnterpriseClient(apiBaseURL, apiBaseURL, nil) - if err != nil { - log.Fatal(err) - } - } else { - host = "github.com" - apiBaseURL = "https://github.com/" - client = github.NewClient(nil) - } - destDir := filepath.Join(*dest, host) - if err := os.MkdirAll(destDir, 0o755); err != nil { - log.Fatal(err) - } - - if *token != "" { - content, err := ioutil.ReadFile(*token) - if err != nil { - log.Fatal(err) - } - - ts := oauth2.StaticTokenSource( - &oauth2.Token{ - 
AccessToken: strings.TrimSpace(string(content)), - }) - tc := oauth2.NewClient(context.Background(), ts) - if *githubURL != "" { - client, err = github.NewEnterpriseClient(apiBaseURL, apiBaseURL, tc) - if err != nil { - log.Fatal(err) - } - } else { - client = github.NewClient(tc) - } - } - - reposFilters := reposFilters{ - topics: topics, - excludeTopics: excludeTopics, - } - var repos []*github.Repository - var err error - if *org != "" { - repos, err = getOrgRepos(client, *org, reposFilters) - } else if *user != "" { - repos, err = getUserRepos(client, *user, reposFilters) - } else { - log.Printf("no user or org specified, cloning all repos.") - repos, err = getUserRepos(client, "", reposFilters) - } - - if err != nil { - log.Fatal(err) - } - - if !*forks { - trimmed := repos[:0] - for _, r := range repos { - if r.Fork == nil || !*r.Fork { - trimmed = append(trimmed, r) - } - } - repos = trimmed - } - - filter, err := gitindex.NewFilter(*namePattern, *excludePattern) - if err != nil { - log.Fatal(err) - } - - { - trimmed := repos[:0] - for _, r := range repos { - if filter.Include(*r.Name) { - trimmed = append(trimmed, r) - } - } - repos = trimmed - } - - if err := cloneRepos(destDir, repos); err != nil { - log.Fatalf("cloneRepos: %v", err) - } - - if *deleteRepos { - if err := deleteStaleRepos(*dest, filter, repos, *org+*user); err != nil { - log.Fatalf("deleteStaleRepos: %v", err) - } - } -} - -func deleteStaleRepos(destDir string, filter *gitindex.Filter, repos []*github.Repository, user string) error { - var baseURL string - if len(repos) > 0 { - baseURL = *repos[0].HTMLURL - } else { - return nil - } - u, err := url.Parse(baseURL) - if err != nil { - return err - } - u.Path = user - - names := map[string]struct{}{} - for _, r := range repos { - u, err := url.Parse(*r.HTMLURL) - if err != nil { - return err - } - - names[filepath.Join(u.Host, u.Path+".git")] = struct{}{} - } - if err := gitindex.DeleteRepos(destDir, u, names, filter); err != nil { - 
log.Fatalf("deleteRepos: %v", err) - } - return nil -} - -func hasIntersection(s1, s2 []string) bool { - hash := make(map[string]bool) - for _, e := range s1 { - hash[e] = true - } - for _, e := range s2 { - if hash[e] { - return true - } - } - return false -} - -func filterByTopic(repos []*github.Repository, include []string, exclude []string) (filteredRepos []*github.Repository) { - for _, repo := range repos { - if (len(include) == 0 || hasIntersection(include, repo.Topics)) && - !hasIntersection(exclude, repo.Topics) { - filteredRepos = append(filteredRepos, repo) - } - } - return -} - -func getOrgRepos(client *github.Client, org string, reposFilters reposFilters) ([]*github.Repository, error) { - var allRepos []*github.Repository - opt := &github.RepositoryListByOrgOptions{} - for { - repos, resp, err := client.Repositories.ListByOrg(context.Background(), org, opt) - if err != nil { - return nil, err - } - if len(repos) == 0 { - break - } - - opt.Page = resp.NextPage - repos = filterByTopic(repos, reposFilters.topics, reposFilters.excludeTopics) - allRepos = append(allRepos, repos...) - if resp.NextPage == 0 { - break - } - } - return allRepos, nil -} - -func getUserRepos(client *github.Client, user string, reposFilters reposFilters) ([]*github.Repository, error) { - var allRepos []*github.Repository - opt := &github.RepositoryListOptions{} - for { - repos, resp, err := client.Repositories.List(context.Background(), user, opt) - if err != nil { - return nil, err - } - if len(repos) == 0 { - break - } - - opt.Page = resp.NextPage - repos = filterByTopic(repos, reposFilters.topics, reposFilters.excludeTopics) - allRepos = append(allRepos, repos...) 
- if resp.NextPage == 0 { - break - } - } - return allRepos, nil -} - -func itoa(p *int) string { - if p != nil { - return strconv.Itoa(*p) - } - return "" -} - -func cloneRepos(destDir string, repos []*github.Repository) error { - for _, r := range repos { - host, err := url.Parse(*r.HTMLURL) - if err != nil { - return err - } - config := map[string]string{ - "zoekt.web-url-type": "github", - "zoekt.web-url": *r.HTMLURL, - "zoekt.name": filepath.Join(host.Hostname(), *r.FullName), - - "zoekt.github-stars": itoa(r.StargazersCount), - "zoekt.github-watchers": itoa(r.WatchersCount), - "zoekt.github-subscribers": itoa(r.SubscribersCount), - "zoekt.github-forks": itoa(r.ForksCount), - } - dest, err := gitindex.CloneRepo(destDir, *r.FullName, *r.CloneURL, config) - if err != nil { - return err - } - if dest != "" { - fmt.Println(dest) - } - - } - - return nil -}
diff --git a/cmd/zoekt-mirror-gitiles/cgit.go b/cmd/zoekt-mirror-gitiles/cgit.go deleted file mode 100644 index 7517cf8..0000000 --- a/cmd/zoekt-mirror-gitiles/cgit.go +++ /dev/null
@@ -1,118 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "bytes" - "fmt" - "io/ioutil" - "log" - "net/http" - "net/url" - "regexp" - "strings" -) - -// I will go to programmer hell for trying to parse HTML with -// regexps. Why doesn't CGit have a JSON interface? -var cgitRepoEntryRE = regexp.MustCompile( - `class='sublevel-repo'><a title='([^'"]*)' href='([^']*)'>`) - -func normalizedGet(u *url.URL) ([]byte, error) { - rep, err := http.Get(u.String()) - if err != nil { - return nil, err - } - defer rep.Body.Close() - if rep.StatusCode != 200 { - return nil, fmt.Errorf("status %s", rep.Status) - } - - c, err := ioutil.ReadAll(rep.Body) - if err != nil { - return nil, err - } - - c = bytes.Replace(c, []byte{'\n'}, []byte{' '}, -1) - return c, nil -} - -// getCGitRepos finds repo names from the CGit index page hosted at -// URL `u`. 
-func getCGitRepos(u *url.URL, filter func(string) bool) (map[string]*crawlTarget, error) { - c, err := normalizedGet(u) - if err != nil { - return nil, err - } - - pages := map[string]*crawlTarget{} - for _, m := range cgitRepoEntryRE.FindAllSubmatch(c, -1) { - nm := strings.TrimSuffix(string(m[1]), ".git") - - if !filter(nm) { - continue - } - - relUrl := string(m[2]) - - u, err := u.Parse(relUrl) - if err != nil { - log.Printf("ignoring u.Parse(%q): %v", relUrl, err) - continue - } - pages[nm] = &crawlTarget{ - webURL: u.String(), - webURLType: "cgit", - } - } - - // TODO - parallel? - for _, target := range pages { - u, _ := url.Parse(target.webURL) - c, err := cgitCloneURL(u) - if err != nil { - log.Printf("ignoring cgitCloneURL(%s): %v", u, c) - continue - } - - target.cloneURL = c.String() - } - return pages, nil -} - -// We'll take the first URL we get. This may put the git:// URL (which -// is insecure) at the top, but individual machines (such as -// git.savannah.gnu) probably would rather receive git:// traffic -// which is more efficient. - -// TODO - do something like `Clone.*<a.*href=` to get the first -// URL. Older versions don't say vcs-git. -var cloneURLRe = regexp.MustCompile( - `rel=["']vcs-git["'] *href=["']([^"']*)["']`) - -func cgitCloneURL(u *url.URL) (*url.URL, error) { - c, err := normalizedGet(u) - if err != nil { - return nil, err - } - - m := cloneURLRe.FindSubmatch(c) - cl, err := url.Parse(string(m[1])) - if err != nil { - return nil, err - } - - return cl, nil -}
diff --git a/cmd/zoekt-mirror-gitiles/gitiles.go b/cmd/zoekt-mirror-gitiles/gitiles.go deleted file mode 100644 index 9510d10..0000000 --- a/cmd/zoekt-mirror-gitiles/gitiles.go +++ /dev/null
@@ -1,70 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "bytes" - "encoding/json" - "io/ioutil" - "net/http" - "net/url" - "path" -) - -type Project struct { - Name string - CloneURL string `json:"clone_url"` -} - -func getGitilesRepos(root *url.URL, filter func(string) bool) (map[string]*crawlTarget, error) { - jsRoot := *root - jsRoot.RawQuery = "format=JSON" - resp, err := http.Get(jsRoot.String()) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - content, err := ioutil.ReadAll(resp.Body) - if err != nil { - return nil, err - } - - const xssTag = ")]}'\n" - content = bytes.TrimPrefix(content, []byte(xssTag)) - - m := map[string]*Project{} - if err := json.Unmarshal(content, &m); err != nil { - return nil, err - } - - result := map[string]*crawlTarget{} - for k, v := range m { - if k == "All-Users" || k == "All-Projects" { - continue - } - if !filter(k) { - continue - } - web := *root - web.Path = path.Join(web.Path, v.Name) - result[k] = &crawlTarget{ - cloneURL: v.CloneURL, - webURL: web.String(), - webURLType: "gitiles", - } - } - return result, nil -}
diff --git a/cmd/zoekt-mirror-gitiles/main.go b/cmd/zoekt-mirror-gitiles/main.go deleted file mode 100644 index c4210a9..0000000 --- a/cmd/zoekt-mirror-gitiles/main.go +++ /dev/null
@@ -1,102 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This binary fetches all repos of a Gitiles host. It does double -// duty for other "simple" web hosts -package main - -import ( - "flag" - "fmt" - "log" - "net/url" - "os" - "path/filepath" - - "github.com/google/zoekt/gitindex" -) - -type crawlTarget struct { - cloneURL string - webURL string - webURLType string -} - -type hostCrawler func(*url.URL, func(string) bool) (map[string]*crawlTarget, error) - -func main() { - dest := flag.String("dest", "", "destination directory") - namePattern := flag.String("name", "", "only clone repos whose name matches the regexp.") - excludePattern := flag.String("exclude", "", "don't mirror repos whose names match this regexp.") - hostType := flag.String("type", "gitiles", "which webserver to crawl. 
Choices: gitiles, cgit") - flag.Parse() - - if len(flag.Args()) < 1 { - log.Fatal("must provide URL argument.") - } - - var crawler hostCrawler - switch *hostType { - case "gitiles": - crawler = getGitilesRepos - case "cgit": - crawler = getCGitRepos - default: - log.Fatalf("unknown host type %q", *hostType) - } - - rootURL, err := url.Parse(flag.Arg(0)) - if err != nil { - log.Fatalf("url.Parse(): %v", err) - } - - if *dest == "" { - log.Fatal("must set --dest") - } - - if err := os.MkdirAll(filepath.Join(*dest, rootURL.Host, rootURL.Path), 0o755); err != nil { - log.Fatal(err) - } - - filter, err := gitindex.NewFilter(*namePattern, *excludePattern) - if err != nil { - log.Fatal(err) - } - - repos, err := crawler(rootURL, filter.Include) - if err != nil { - log.Fatal(err) - } - - for nm, target := range repos { - // For git.savannah.gnu.org, this puts an ugly "CGit" - // path component into the name. However, it's - // possible that there are multiple, different CGit pages - // on the host, so we have to keep it. - fullName := filepath.Join(rootURL.Host, rootURL.Path, nm) - config := map[string]string{ - "zoekt.web-url": target.webURL, - "zoekt.web-url-type": target.webURLType, - "zoekt.name": fullName, - } - - dest, err := gitindex.CloneRepo(*dest, fullName, target.cloneURL, config) - if err != nil { - log.Fatal(err) - } - if dest != "" { - fmt.Println(dest) - } - } -}
diff --git a/cmd/zoekt-mirror-gitlab/main.go b/cmd/zoekt-mirror-gitlab/main.go deleted file mode 100644 index e119e87..0000000 --- a/cmd/zoekt-mirror-gitlab/main.go +++ /dev/null
@@ -1,189 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This binary fetches all repos for a user from gitlab. -// -// It is recommended to use a gitlab personal access token: -// https://docs.gitlab.com/ce/user/profile/personal_access_tokens.html. This -// token should be stored in a file and the --token option should be used. -// In addition, the token should be present in the ~/.netrc of the user running -// the mirror command. For example, the ~/.netrc may look like: -// -// machine gitlab.com -// login oauth -// password <personal access token> -// -package main - -import ( - "flag" - "fmt" - "io/ioutil" - "log" - "net/url" - "os" - "path/filepath" - "strconv" - "strings" - - "github.com/google/zoekt/gitindex" - gitlab "github.com/xanzy/go-gitlab" -) - -func main() { - dest := flag.String("dest", "", "destination directory") - gitlabURL := flag.String("url", "https://gitlab.com/api/v4/", "Gitlab URL. 
If not set https://gitlab.com/api/v4/ will be used") - token := flag.String("token", - filepath.Join(os.Getenv("HOME"), ".gitlab-token"), - "file holding API token.") - isMember := flag.Bool("membership", false, "only mirror repos this user is a member of ") - isPublic := flag.Bool("public", false, "only mirror public repos") - deleteRepos := flag.Bool("delete", false, "delete missing repos") - namePattern := flag.String("name", "", "only clone repos whose name matches the given regexp.") - excludePattern := flag.String("exclude", "", "don't mirror repos whose names match this regexp.") - flag.Parse() - - if *dest == "" { - log.Fatal("must set --dest") - } - - var host string - rootURL, err := url.Parse(*gitlabURL) - if err != nil { - log.Fatal(err) - } - host = rootURL.Host - - destDir := filepath.Join(*dest, host) - if err := os.MkdirAll(destDir, 0o755); err != nil { - log.Fatal(err) - } - - content, err := ioutil.ReadFile(*token) - if err != nil { - log.Fatal(err) - } - apiToken := strings.TrimSpace(string(content)) - - client := gitlab.NewClient(nil, apiToken) - client.SetBaseURL(*gitlabURL) - - opt := &gitlab.ListProjectsOptions{ - ListOptions: gitlab.ListOptions{ - PerPage: 10, - Page: 1, - }, - Membership: isMember, - } - if *isPublic { - opt.Visibility = gitlab.Visibility(gitlab.PublicVisibility) - } - - var gitlabProjects []*gitlab.Project - for { - projects, resp, err := client.Projects.ListProjects(opt) - if err != nil { - log.Fatal(err) - } - - for _, project := range projects { - - // Skip projects without a default branch - these should be projects - // where the repository isn't enabled - if project.DefaultBranch == "" { - continue - } - - gitlabProjects = append(gitlabProjects, project) - } - - if resp.CurrentPage >= resp.TotalPages { - break - } - - opt.Page = resp.NextPage - } - - filter, err := gitindex.NewFilter(*namePattern, *excludePattern) - if err != nil { - log.Fatal(err) - } - - { - trimmed := gitlabProjects[:0] - for _, p := range 
gitlabProjects { - if filter.Include(p.NameWithNamespace) { - trimmed = append(trimmed, p) - } - } - gitlabProjects = trimmed - } - - fetchProjects(destDir, apiToken, gitlabProjects) - - if *deleteRepos { - if err := deleteStaleProjects(*dest, filter, gitlabProjects); err != nil { - log.Fatalf("deleteStaleProjects: %v", err) - } - } -} - -func deleteStaleProjects(destDir string, filter *gitindex.Filter, projects []*gitlab.Project) error { - u, err := url.Parse(projects[0].HTTPURLToRepo) - u.Path = "" - if err != nil { - return err - } - - names := map[string]struct{}{} - for _, p := range projects { - u, err := url.Parse(p.HTTPURLToRepo) - if err != nil { - return err - } - - names[filepath.Join(u.Host, u.Path)] = struct{}{} - } - - if err := gitindex.DeleteRepos(destDir, u, names, filter); err != nil { - log.Fatalf("deleteRepos: %v", err) - } - return nil -} - -func fetchProjects(destDir, token string, projects []*gitlab.Project) { - for _, p := range projects { - u, err := url.Parse(p.HTTPURLToRepo) - if err != nil { - log.Printf("Unable to parse project URL: %v", err) - continue - } - config := map[string]string{ - "zoekt.web-url-type": "gitlab", - "zoekt.web-url": p.WebURL, - "zoekt.name": filepath.Join(u.Hostname(), p.PathWithNamespace), - - "zoekt.gitlab-stars": strconv.Itoa(p.StarCount), - "zoekt.gitlab-forks": strconv.Itoa(p.ForksCount), - } - - cloneURL := p.HTTPURLToRepo - dest, err := gitindex.CloneRepo(destDir, p.PathWithNamespace, cloneURL, config) - if err != nil { - log.Printf("cloneRepos: %v", err) - continue - } - if dest != "" { - fmt.Println(dest) - } - } -}
diff --git a/cmd/zoekt-repo-index/main.go b/cmd/zoekt-repo-index/main.go deleted file mode 100644 index 06e8e20..0000000 --- a/cmd/zoekt-repo-index/main.go +++ /dev/null
@@ -1,381 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* zoekt-repo-index indexes a repo-based repository. The constituent -git repositories should already have been downloaded to the ---repo_cache directory, eg. - - go install github.com/google/zoekt/cmd/zoekt-repo-index && - - zoekt-repo-index -base_url https://gfiber.googlesource.com/ \ - -manifest_repo_url https://gfiber.googlesource.com/manifests \ - -manifest_rev_prefix=refs/heads/ \ - -rev_prefix="refs/remotes/" \ - -repo_cache ~/zoekt-serving/repos/ \ - -shard_limit 50000000 \ - master:default_unrestricted.xml -*/ -package main - -import ( - "crypto/sha1" - "flag" - "fmt" - "io/ioutil" - "log" - "net/url" - "path" - "path/filepath" - "sort" - "strings" - - "github.com/google/slothfs/manifest" - "github.com/google/zoekt" - "github.com/google/zoekt/build" - "github.com/google/zoekt/gitindex" - "go.uber.org/automaxprocs/maxprocs" - - git "github.com/go-git/go-git/v5" - "github.com/go-git/go-git/v5/plumbing" -) - -var _ = log.Println - -type fileKey struct { - SubRepoPath string - Path string - ID plumbing.Hash -} - -func (k *fileKey) FullPath() string { - return filepath.Join(k.SubRepoPath, k.Path) -} - -type branchFile struct { - branch, file string - mf *manifest.Manifest - manifestPath string -} - -func parseBranches(manifestRepoURL, revPrefix string, cache *gitindex.RepoCache, args []string) ([]branchFile, error) { - var branches 
[]branchFile - if manifestRepoURL != "" { - u, err := url.Parse(manifestRepoURL) - if err != nil { - return nil, err - } - - repo, err := cache.Open(u) - if err != nil { - return nil, err - } - - for _, f := range args { - fs := strings.SplitN(f, ":", 2) - if len(fs) != 2 { - return nil, fmt.Errorf("cannot parse %q as BRANCH:FILE", f) - } - mf, err := getManifest(repo, revPrefix+fs[0], fs[1]) - if err != nil { - return nil, fmt.Errorf("manifest %s:%s: %v", fs[0], fs[1], err) - } - - branches = append(branches, branchFile{ - branch: fs[0], - file: fs[1], - mf: mf, - manifestPath: cache.Path(u), - }) - } - } else { - if len(args) == 0 { - return nil, fmt.Errorf("must give XML file argument") - } - for _, f := range args { - mf, err := manifest.ParseFile(f) - if err != nil { - return nil, err - } - - branches = append(branches, branchFile{ - branch: "HEAD", - file: filepath.Base(f), - mf: mf, - manifestPath: f, - }) - } - } - return branches, nil -} - -func main() { - sizeMax := flag.Int("file_limit", 128<<10, "maximum file size") - shardLimit := flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard") - parallelism := flag.Int("parallelism", 1, "maximum number of parallel indexing processes") - - revPrefix := flag.String("rev_prefix", "refs/remotes/origin/", "prefix for references") - baseURLStr := flag.String("base_url", "", "base url to interpret repository names") - repoCacheDir := flag.String("repo_cache", "", "root for repository cache") - indexDir := flag.String("index", build.DefaultDir, "index directory for *.zoekt files") - manifestRepoURL := flag.String("manifest_repo_url", "", "set a URL for a git repository holding manifest XML file. 
Provide the BRANCH:XML-FILE as further command-line arguments") - manifestRevPrefix := flag.String("manifest_rev_prefix", "refs/remotes/origin/", "prefixes for branches in manifest repository") - repoName := flag.String("name", "", "set repository name") - repoURL := flag.String("url", "", "set repository URL") - maxSubProjects := flag.Int("max_sub_projects", 0, "trim number of projects in manifest, for debugging.") - incremental := flag.Bool("incremental", true, "only index if the repository has changed.") - flag.Parse() - - // Tune GOMAXPROCS to match Linux container CPU quota. - maxprocs.Set() - - if *repoCacheDir == "" { - log.Fatal("must set --repo_cache") - } - repoCache := gitindex.NewRepoCache(*repoCacheDir) - - if u, err := url.Parse(*baseURLStr); err != nil { - log.Fatalf("Parse(%q): %v", u, err) - } else if *repoName == "" { - *repoName = filepath.Join(u.Host, u.Path) - } - - opts := build.Options{ - Parallelism: *parallelism, - SizeMax: *sizeMax, - ShardMax: *shardLimit, - IndexDir: *indexDir, - RepositoryDescription: zoekt.Repository{ - Name: *repoName, - URL: *repoURL, - }, - } - opts.SetDefaults() - baseURL, err := url.Parse(*baseURLStr) - if err != nil { - log.Fatalf("Parse baseURL %q: %v", *baseURLStr, err) - } - - branches, err := parseBranches(*manifestRepoURL, *manifestRevPrefix, repoCache, flag.Args()) - if err != nil { - log.Fatalf("parseBranches(%s, %s): %v", *manifestRepoURL, *manifestRevPrefix, err) - } - if len(branches) == 0 { - log.Fatal("must specify at least one branch") - } - if *maxSubProjects > 0 { - for _, b := range branches { - if *maxSubProjects < len(b.mf.Project) { - b.mf.Project = b.mf.Project[:*maxSubProjects] - } - } - } - - perBranch := map[string]map[fileKey]gitindex.BlobLocation{} - opts.SubRepositories = map[string]*zoekt.Repository{} - - // branch => repo => version - versionMap := map[string]map[string]plumbing.Hash{} - for _, br := range branches { - br.mf.Filter() - files, versions, err := iterateManifest(br.mf, 
*baseURL, *revPrefix, repoCache) - if err != nil { - log.Fatalf("iterateManifest: %v", err) - } - - perBranch[br.branch] = files - for key, loc := range files { - _, ok := opts.SubRepositories[key.SubRepoPath] - if ok { - // This can be incorrect: if the layout of manifests - // changes across branches, then the same file could - // be in different subRepos. We'll pretend this is not - // a problem. - continue - } - - desc := &zoekt.Repository{} - if err := gitindex.SetTemplatesFromOrigin(desc, loc.URL); err != nil { - log.Fatalf("SetTemplatesFromOrigin(%s): %v", loc.URL, err) - } - - opts.SubRepositories[key.SubRepoPath] = desc - } - versionMap[br.branch] = versions - } - - for _, br := range branches { - var paths []string - for p := range opts.SubRepositories { - paths = append(paths, p) - } - sort.Strings(paths) - - // Compute a version of the aggregate. This version - // has nothing to do with git, but will let us do - // incrementality correctly. - hasher := sha1.New() - for _, p := range paths { - repo := opts.SubRepositories[p] - id := versionMap[br.branch][p] - - // it is possible that 'id' is zero, if this - // branch of the manifest doesn't have this - // particular subrepository. 
- hasher.Write([]byte(p)) - hasher.Write([]byte(id.String())) - repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ - Name: br.branch, - Version: id.String(), - }) - } - - opts.RepositoryDescription.Branches = append(opts.RepositoryDescription.Branches, zoekt.RepositoryBranch{ - Name: br.branch, - Version: fmt.Sprintf("%x", hasher.Sum(nil)), - }) - } - - // key => branch - all := map[fileKey][]string{} - for br, files := range perBranch { - for k := range files { - all[k] = append(all[k], br) - } - } - - if *incremental && opts.IncrementalSkipIndexing() { - return - } - - builder, err := build.NewBuilder(opts) - if err != nil { - log.Fatal(err) - } - for k, branches := range all { - loc := perBranch[branches[0]][k] - data, err := loc.Blob(&k.ID) - if err != nil { - log.Fatal(err) - } - - doc := zoekt.Document{ - Name: k.FullPath(), - Content: data, - SubRepositoryPath: k.SubRepoPath, - } - - doc.Branches = append(doc.Branches, branches...) - if err := builder.Add(doc); err != nil { - log.Printf("Add(%s): %v", doc.Name, err) - break - } - } - if err := builder.Finish(); err != nil { - log.Fatalf("Finish: %v", err) - } -} - -// getManifest parses the manifest XML at the given branch/path inside a Git repository. 
-func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest, error) { - ref, err := repo.Reference(plumbing.ReferenceName("refs/heads/"+branch), true) - if err != nil { - return nil, err - } - - commit, err := repo.CommitObject(ref.Hash()) - if err != nil { - return nil, err - } - - tree, err := repo.TreeObject(commit.TreeHash) - if err != nil { - return nil, err - } - - entry, err := tree.FindEntry(path) - if err != nil { - return nil, err - } - - blob, err := repo.BlobObject(entry.Hash) - if err != nil { - return nil, err - } - r, err := blob.Reader() - if err != nil { - return nil, err - } - defer r.Close() - - content, _ := ioutil.ReadAll(r) - return manifest.Parse(content) -} - -// iterateManifest constructs a complete tree from the given Manifest. -func iterateManifest(mf *manifest.Manifest, - baseURL url.URL, revPrefix string, - cache *gitindex.RepoCache) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) { - allFiles := map[fileKey]gitindex.BlobLocation{} - allVersions := map[string]plumbing.Hash{} - for _, p := range mf.Project { - rev := mf.ProjectRevision(&p) - - projURL := baseURL - projURL.Path = path.Join(projURL.Path, p.Name) - - topRepo, err := cache.Open(&projURL) - if err != nil { - return nil, nil, err - } - - ref, err := topRepo.Reference(plumbing.ReferenceName(revPrefix+rev), true) - if err != nil { - return nil, nil, err - } - - commit, err := topRepo.CommitObject(ref.Hash()) - if err != nil { - return nil, nil, err - } - if err != nil { - return nil, nil, err - } - - allVersions[p.GetPath()] = commit.Hash - - tree, err := commit.Tree() - if err != nil { - return nil, nil, err - } - - files, versions, err := gitindex.TreeToFiles(topRepo, tree, projURL.String(), cache) - if err != nil { - return nil, nil, err - } - - for key, repo := range files { - allFiles[fileKey{ - SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath), - Path: key.Path, - ID: key.ID, - }] = repo - } - - for path, version := range 
versions { - allVersions[filepath.Join(p.GetPath(), path)] = version - } - } - - return allFiles, allVersions, nil -}
diff --git a/cmd/zoekt-test/main.go b/cmd/zoekt-test/main.go deleted file mode 100644 index 2d131d1..0000000 --- a/cmd/zoekt-test/main.go +++ /dev/null
@@ -1,194 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// zoekt-test compares the search engine results with raw substring search -package main - -import ( - "bufio" - "bytes" - "context" - "flag" - "fmt" - "io/ioutil" - "log" - "os" - "path/filepath" - "reflect" - "sort" - "strings" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" - "github.com/google/zoekt/query" - "github.com/google/zoekt/shards" -) - -func readTree(dir string) (map[string][]byte, error) { - var fns []string - - add := func(path string, info os.FileInfo, err error) error { - if !info.Mode().IsRegular() { - return nil - } - - fns = append(fns, path) - return nil - } - if err := filepath.Walk(dir, add); err != nil { - return nil, err - } - - res := map[string][]byte{} - for _, n := range fns { - c, err := ioutil.ReadFile(n) - if err != nil { - return nil, err - } - - strip := strings.TrimPrefix(n, dir+"/") - res[strip] = c - } - return res, nil -} - -func compare(dir, patfile string, caseSensitive bool) error { - indexDir, err := ioutil.TempDir("", "") - if err != nil { - return err - } - defer os.RemoveAll(indexDir) - - var opts build.Options - opts.SetDefaults() - opts.IndexDir = indexDir - - fileContents, err := readTree(dir) - if err != nil { - return err - } - if len(fileContents) == 0 { - return fmt.Errorf("no contents") - } - - builder, err := build.NewBuilder(opts) - if err != nil { - return err - } - 
for k, v := range fileContents { - builder.AddFile(k, v) - } - if err := builder.Finish(); err != nil { - return err - } - - if !caseSensitive { - for k, v := range fileContents { - fileContents[k] = toLower(v) - } - } - - f, err := os.Open(patfile) - if err != nil { - return err - } - searcher, err := shards.NewDirectorySearcher(indexDir) - if err != nil { - return err - } - - scanner := bufio.NewScanner(f) - for scanner.Scan() { - t := scanner.Text() - if len(t) < 3 { - continue - } - q := &query.Substring{ - Pattern: t, - CaseSensitive: caseSensitive, - } - - zFiles := map[string]struct{}{} - rFiles := map[string]struct{}{} - - // search engine results - var opts zoekt.SearchOptions - res, err := searcher.Search(context.Background(), q, &opts) - if err != nil { - return err - } - - for _, f := range res.Files { - zFiles[f.FileName] = struct{}{} - } - - // raw search - needle := []byte(t) - if !caseSensitive { - needle = toLower(needle) - } - - for k, v := range fileContents { - if bytes.Contains(v, needle) { - rFiles[k] = struct{}{} - } - } - - if !reflect.DeepEqual(zFiles, rFiles) { - var add, del []string - for k := range zFiles { - if _, ok := rFiles[k]; !ok { - del = append(del, k) - } - } - for k := range rFiles { - if _, ok := zFiles[k]; !ok { - add = append(add, k) - } - } - sort.Strings(add) - sort.Strings(del) - log.Printf("pattern %q, add %v, del %v", t, add, del) - } - } - return nil -} - -func main() { - repo := flag.String("repo", "", "repository to search") - caseSensitive := flag.Bool("case", false, "case sensitive") - flag.Parse() - - if len(flag.Args()) == 0 { - fmt.Fprintf(os.Stderr, "pattern file is missing.\n") - flag.Usage() - os.Exit(2) - } - input := flag.Arg(0) - - if err := compare(*repo, input, *caseSensitive); err != nil { - log.Fatal(err) - } -} - -func toLower(in []byte) []byte { - out := make([]byte, len(in)) - for i, c := range in { - if c >= 'A' && c <= 'Z' { - c = c - 'A' + 'a' - } - out[i] = c - } - return out -}
diff --git a/cmd/zoekt-webserver/main.go b/cmd/zoekt-webserver/main.go deleted file mode 100644 index 0cb4771..0000000 --- a/cmd/zoekt-webserver/main.go +++ /dev/null
@@ -1,264 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "context" - "crypto/tls" - "flag" - "fmt" - "html/template" - "io/ioutil" - "log" - "net/http" - "net/http/pprof" - "os" - "path/filepath" - "strings" - "time" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" - "github.com/google/zoekt/shards" - "github.com/google/zoekt/web" - "github.com/prometheus/client_golang/prometheus/promhttp" - "go.uber.org/automaxprocs/maxprocs" - "golang.org/x/net/trace" -) - -const logFormat = "2006-01-02T15-04-05.999999999Z07" - -func divertLogs(dir string, interval time.Duration) { - t := time.NewTicker(interval) - var last *os.File - for { - nm := filepath.Join(dir, fmt.Sprintf("zoekt-webserver.%s.%d.log", time.Now().Format(logFormat), os.Getpid())) - fmt.Fprintf(os.Stderr, "writing logs to %s\n", nm) - - f, err := os.Create(nm) - if err != nil { - // There is not much we can do now. 
- fmt.Fprintf(os.Stderr, "can't create output file %s: %v\n", nm, err) - os.Exit(2) - } - - log.SetOutput(f) - last.Close() - - last = f - - <-t.C - } -} - -const templateExtension = ".html.tpl" - -func loadTemplates(tpl *template.Template, dir string) error { - fs, err := filepath.Glob(dir + "/*" + templateExtension) - if err != nil { - log.Fatalf("Glob: %v", err) - } - - log.Printf("loading templates: %v", fs) - for _, fn := range fs { - content, err := ioutil.ReadFile(fn) - if err != nil { - return err - } - - base := filepath.Base(fn) - base = strings.TrimSuffix(base, templateExtension) - if _, err := tpl.New(base).Parse(string(content)); err != nil { - return fmt.Errorf("template.Parse(%s): %v", fn, err) - } - } - return nil -} - -func writeTemplates(dir string) error { - if dir == "" { - return fmt.Errorf("must set --template_dir") - } - - for k, v := range web.TemplateText { - nm := filepath.Join(dir, k+templateExtension) - if err := ioutil.WriteFile(nm, []byte(v), 0o644); err != nil { - return err - } - } - return nil -} - -func main() { - logDir := flag.String("log_dir", "", "log to this directory rather than stderr.") - logRefresh := flag.Duration("log_refresh", 24*time.Hour, "if using --log_dir, start writing a new file this often.") - - listen := flag.String("listen", ":6070", "listen on this address.") - index := flag.String("index", build.DefaultDir, "set index directory to use") - html := flag.Bool("html", true, "enable HTML interface") - print := flag.Bool("print", false, "enable local result URLs") - enablePprof := flag.Bool("pprof", false, "set to enable remote profiling.") - sslCert := flag.String("ssl_cert", "", "set path to SSL .pem holding certificate.") - sslKey := flag.String("ssl_key", "", "set path to SSL .pem holding key.") - hostCustomization := flag.String( - "host_customization", "", - "specify host customization, as HOST1=QUERY,HOST2=QUERY") - - templateDir := flag.String("template_dir", "", "set directory from which to load custom 
.html.tpl template files") - dumpTemplates := flag.Bool("dump_templates", false, "dump templates into --template_dir and exit.") - version := flag.Bool("version", false, "Print version number") - flag.Parse() - - if *version { - fmt.Printf("zoekt-webserver version %q\n", zoekt.Version) - os.Exit(0) - } - - if *dumpTemplates { - if err := writeTemplates(*templateDir); err != nil { - log.Fatal(err) - } - os.Exit(0) - } - - if *logDir != "" { - if fi, err := os.Lstat(*logDir); err != nil || !fi.IsDir() { - log.Fatalf("%s is not a directory", *logDir) - } - // We could do fdup acrobatics to also redirect - // stderr, but it is simpler and more portable for the - // caller to divert stderr output if necessary. - go divertLogs(*logDir, *logRefresh) - } - - // Tune GOMAXPROCS to match Linux container CPU quota. - maxprocs.Set() - - if err := os.MkdirAll(*index, 0o755); err != nil { - log.Fatal(err) - } - - searcher, err := shards.NewDirectorySearcher(*index) - if err != nil { - log.Fatal(err) - } - - s := &web.Server{ - Searcher: searcher, - Top: web.Top, - Version: zoekt.Version, - } - - if *templateDir != "" { - if err := loadTemplates(s.Top, *templateDir); err != nil { - log.Fatalf("loadTemplates: %v", err) - } - } - - s.Print = *print - s.HTML = *html - - if *hostCustomization != "" { - s.HostCustomQueries = map[string]string{} - for _, h := range strings.SplitN(*hostCustomization, ",", -1) { - if len(h) == 0 { - continue - } - fields := strings.SplitN(h, "=", 2) - if len(fields) < 2 { - log.Fatalf("invalid host_customization %q", h) - } - - s.HostCustomQueries[fields[0]] = fields[1] - } - } - - handler, err := web.NewMux(s) - if err != nil { - log.Fatal(err) - } - - handler.Handle("/metrics", promhttp.Handler()) - - if *enablePprof { - handler.HandleFunc("/debug/pprof/", pprof.Index) - handler.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) - handler.HandleFunc("/debug/pprof/profile", pprof.Profile) - handler.HandleFunc("/debug/pprof/symbol", pprof.Symbol) - 
handler.HandleFunc("/debug/pprof/trace", pprof.Trace) - handler.HandleFunc("/debug/requests/", trace.Traces) - handler.HandleFunc("/debug/events/", trace.Events) - } - - watchdogAddr := "http://" + *listen - if *sslCert != "" || *sslKey != "" { - watchdogAddr = "https://" + *listen - } - go watchdog(30*time.Second, watchdogAddr) - - if *sslCert != "" || *sslKey != "" { - log.Printf("serving HTTPS on %s", *listen) - err = http.ListenAndServeTLS(*listen, *sslCert, *sslKey, handler) - } else { - log.Printf("serving HTTP on %s", *listen) - err = http.ListenAndServe(*listen, handler) - } - log.Printf("ListenAndServe: %v", err) -} - -func watchdogOnce(ctx context.Context, client *http.Client, addr string) error { - ctx, cancel := context.WithDeadline(ctx, time.Now().Add(5*time.Second)) - defer cancel() - - req, err := http.NewRequest("GET", addr, nil) - if err != nil { - return err - } - - req = req.WithContext(ctx) - - resp, err := client.Do(req) - if err != nil { - return err - } - - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("watchdog: status %v", resp.StatusCode) - } - return nil -} - -func watchdog(dt time.Duration, addr string) { - tr := &http.Transport{ - TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, - } - client := &http.Client{ - Transport: tr, - } - tick := time.NewTicker(dt) - - errCount := 0 - for range tick.C { - err := watchdogOnce(context.Background(), client, addr) - if err != nil { - errCount++ - } else { - errCount = 0 - } - if errCount == 3 { - log.Panicf("watchdog: %v", err) - } - } -}
diff --git a/cmd/zoekt/main.go b/cmd/zoekt/main.go deleted file mode 100644 index 035e8fa..0000000 --- a/cmd/zoekt/main.go +++ /dev/null
@@ -1,158 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "context" - "flag" - "fmt" - "log" - "os" - "path/filepath" - "runtime/pprof" - "time" - - "github.com/google/zoekt" - "github.com/google/zoekt/query" - "github.com/google/zoekt/shards" -) - -func displayMatches(files []zoekt.FileMatch, pat string, withRepo bool, list bool) { - for _, f := range files { - r := "" - if withRepo { - r = f.Repository + "/" - } - if list { - fmt.Printf("%s%s\n", r, f.FileName) - continue - } - - for _, m := range f.LineMatches { - fmt.Printf("%s%s:%d:%s\n", r, f.FileName, m.LineNumber, m.Line) - } - } -} - -func loadShard(fn string, verbose bool) (zoekt.Searcher, error) { - f, err := os.Open(fn) - if err != nil { - return nil, err - } - - iFile, err := zoekt.NewIndexFile(f) - if err != nil { - return nil, err - } - - s, err := zoekt.NewSearcher(iFile) - if err != nil { - iFile.Close() - return nil, fmt.Errorf("NewSearcher(%s): %v", fn, err) - } - - if verbose { - repo, index, err := zoekt.ReadMetadata(iFile) - if err != nil { - iFile.Close() - return nil, fmt.Errorf("ReadMetadata(%s): %v", fn, err) - } - log.Printf("repo metadata: %#v", repo) - log.Printf("index metadata: %#v", index) - } - - return s, nil -} - -func main() { - shard := flag.String("shard", "", "search in a specific shard") - index := flag.String("index_dir", - filepath.Join(os.Getenv("HOME"), ".zoekt"), "search for 
index files in `directory`") - cpuProfile := flag.String("cpu_profile", "", "write cpu profile to `file`") - profileTime := flag.Duration("profile_time", time.Second, "run this long to gather stats.") - verbose := flag.Bool("v", false, "print some background data") - withRepo := flag.Bool("r", false, "print the repo before the file name") - list := flag.Bool("l", false, "print matching filenames only") - - flag.Usage = func() { - name := os.Args[0] - fmt.Fprintf(os.Stderr, "Usage:\n\n %s [option] QUERY\n"+ - "for example\n\n %s 'byte file:java -file:test'\n\n", name, name) - flag.PrintDefaults() - fmt.Fprintf(os.Stderr, "\n") - } - flag.Parse() - - if len(flag.Args()) == 0 { - fmt.Fprintf(os.Stderr, "Pattern is missing.\n") - flag.Usage() - os.Exit(2) - } - pat := flag.Arg(0) - - var searcher zoekt.Searcher - var err error - if *shard != "" { - searcher, err = loadShard(*shard, *verbose) - } else { - searcher, err = shards.NewDirectorySearcher(*index) - } - - if err != nil { - log.Fatal(err) - } - - query, err := query.Parse(pat) - if err != nil { - log.Fatal(err) - } - if *verbose { - log.Println("query:", query) - } - - var sOpts zoekt.SearchOptions - sres, err := searcher.Search(context.Background(), query, &sOpts) - if *cpuProfile != "" { - // If profiling, do it another time so we measure with - // warm caches. - f, err := os.Create(*cpuProfile) - if err != nil { - log.Fatal(err) - } - defer f.Close() - if *verbose { - log.Println("Displaying matches...") - } - - t := time.Now() - pprof.StartCPUProfile(f) - for { - sres, _ = searcher.Search(context.Background(), query, &sOpts) - if time.Since(t) > *profileTime { - break - } - } - pprof.StopCPUProfile() - } - - if err != nil { - log.Fatal(err) - } - - displayMatches(sres.Files, pat, *withRepo, *list) - if *verbose { - log.Printf("stats: %#v", sres.Stats) - } -}
diff --git a/contentprovider.go b/contentprovider.go deleted file mode 100644 index 62fd149..0000000 --- a/contentprovider.go +++ /dev/null
@@ -1,310 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "bytes" - "log" - "sort" - "unicode/utf8" -) - -var _ = log.Println - -// contentProvider is an abstraction to treat matches for names and -// content with the same code. -type contentProvider struct { - id *indexData - stats *Stats - - // mutable - err error - idx uint32 - _data []byte - _nl []uint32 - _nlBuf []uint32 - _sects []DocumentSection - _sectBuf []DocumentSection - fileSize uint32 -} - -// setDocument skips to the given document. 
-func (p *contentProvider) setDocument(docID uint32) { - fileStart := p.id.boundaries[docID] - - p.idx = docID - p.fileSize = p.id.boundaries[docID+1] - fileStart - - p._nl = nil - p._sects = nil - p._data = nil -} - -func (p *contentProvider) docSections() []DocumentSection { - if p._sects == nil { - var sz uint32 - p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) - p.stats.ContentBytesLoaded += int64(sz) - p._sectBuf = p._sects - } - return p._sects -} - -func (p *contentProvider) newlines() []uint32 { - if p._nl == nil { - var sz uint32 - p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) - p._nlBuf = p._nl - p.stats.ContentBytesLoaded += int64(sz) - } - return p._nl -} - -func (p *contentProvider) data(fileName bool) []byte { - if fileName { - return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] - } - - if p._data == nil { - p._data, p.err = p.id.readContents(p.idx) - p.stats.FilesLoaded++ - p.stats.ContentBytesLoaded += int64(len(p._data)) - } - return p._data -} - -// Find offset in bytes (relative to corpus start) for an offset in -// runes (relative to document start). If filename is set, the corpus -// is the set of filenames, with the document being the name itself. 
-func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { - if p.id.metaData.PlainASCII { - return r - } - - sample := p.id.runeOffsets - runeEnds := p.id.fileEndRunes - fileStartByte := p.id.boundaries[p.idx] - if filename { - sample = p.id.fileNameRuneOffsets - runeEnds = p.id.fileNameEndRunes - fileStartByte = p.id.fileNameIndex[p.idx] - } - - absR := r - if p.idx > 0 { - absR += runeEnds[p.idx-1] - } - - byteOff := sample[absR/runeOffsetFrequency] - left := absR % runeOffsetFrequency - - var data []byte - - if filename { - data = p.id.fileNameContent[byteOff:] - } else { - data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) - if p.err != nil { - return 0 - } - } - for left > 0 { - _, sz := utf8.DecodeRune(data) - byteOff += uint32(sz) - data = data[sz:] - left-- - } - - byteOff -= fileStartByte - return byteOff -} - -func (p *contentProvider) fillMatches(ms []*candidateMatch) []LineMatch { - var result []LineMatch - if ms[0].fileName { - // There is only "line" in a filename. 
- res := LineMatch{ - Line: p.id.fileName(p.idx), - FileName: true, - } - - for _, m := range ms { - res.LineFragments = append(res.LineFragments, LineFragmentMatch{ - LineOffset: int(m.byteOffset), - MatchLength: int(m.byteMatchSz), - Offset: m.byteOffset, - }) - - result = []LineMatch{res} - } - } else { - ms = breakMatchesOnNewlines(ms, p.data(false)) - result = p.fillContentMatches(ms) - } - - sects := p.docSections() - for i, m := range result { - result[i].Score = matchScore(sects, &m) - } - - return result -} - -func (p *contentProvider) fillContentMatches(ms []*candidateMatch) []LineMatch { - var result []LineMatch - for len(ms) > 0 { - m := ms[0] - num, lineStart, lineEnd := m.line(p.newlines(), p.fileSize) - - var lineCands []*candidateMatch - - endMatch := m.byteOffset + m.byteMatchSz - - for len(ms) > 0 { - m := ms[0] - if int(m.byteOffset) <= lineEnd { - endMatch = m.byteOffset + m.byteMatchSz - lineCands = append(lineCands, m) - ms = ms[1:] - } else { - break - } - } - - if len(lineCands) == 0 { - log.Panicf( - "%s %v infinite loop: num %d start,end %d,%d, offset %d", - p.id.fileName(p.idx), p.id.metaData, - num, lineStart, lineEnd, - m.byteOffset) - } - - data := p.data(false) - - // Due to merging matches, we may have a match that - // crosses a line boundary. Prevent confusion by - // taking lines until we pass the last match - for lineEnd < len(data) && endMatch > uint32(lineEnd) { - next := bytes.IndexByte(data[lineEnd+1:], '\n') - if next == -1 { - lineEnd = len(data) - } else { - // TODO(hanwen): test that checks "+1" part here. 
- lineEnd += next + 1 - } - } - - finalMatch := LineMatch{ - LineStart: lineStart, - LineEnd: lineEnd, - LineNumber: num, - } - finalMatch.Line = data[lineStart:lineEnd] - - for _, m := range lineCands { - fragment := LineFragmentMatch{ - Offset: m.byteOffset, - LineOffset: int(m.byteOffset) - lineStart, - MatchLength: int(m.byteMatchSz), - } - finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) - } - result = append(result, finalMatch) - } - return result -} - -const ( - // TODO - how to scale this relative to rank? - scorePartialWordMatch = 50.0 - scoreWordMatch = 500.0 - scoreImportantThreshold = 2000.0 - scorePartialSymbol = 4000.0 - scoreSymbol = 7000.0 - scoreFactorAtomMatch = 400.0 - scoreShardRankFactor = 20.0 - scoreFileOrderFactor = 10.0 - scoreLineOrderFactor = 1.0 -) - -func findSection(secs []DocumentSection, off, sz uint32) *DocumentSection { - j := sort.Search(len(secs), func(i int) bool { - return secs[i].End >= off+sz - }) - - if j == len(secs) { - return nil - } - - if secs[j].Start <= off && off+sz <= secs[j].End { - return &secs[j] - } - return nil -} - -func matchScore(secs []DocumentSection, m *LineMatch) float64 { - var maxScore float64 - for _, f := range m.LineFragments { - startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) - - end := int(f.LineOffset) + f.MatchLength - endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end])) - - score := 0.0 - if startBoundary && endBoundary { - score = scoreWordMatch - } else if startBoundary || endBoundary { - score = scorePartialWordMatch - } - - sec := findSection(secs, f.Offset, uint32(f.MatchLength)) - if sec != nil { - startMatch := sec.Start == f.Offset - endMatch := sec.End == f.Offset+uint32(f.MatchLength) - if startMatch && endMatch { - score += scoreSymbol - } else if startMatch || endMatch { - score += (scoreSymbol + scorePartialSymbol) / 2 - } 
else { - score += scorePartialSymbol - } - } - if score > maxScore { - maxScore = score - } - } - return maxScore -} - -type matchScoreSlice []LineMatch - -func (m matchScoreSlice) Len() int { return len(m) } -func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } -func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } - -type fileMatchSlice []FileMatch - -func (m fileMatchSlice) Len() int { return len(m) } -func (m fileMatchSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } -func (m fileMatchSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } - -func sortMatchesByScore(ms []LineMatch) { - sort.Sort(matchScoreSlice(ms)) -} - -// Sort a slice of results. -func SortFilesByScore(ms []FileMatch) { - sort.Sort(fileMatchSlice(ms)) -}
diff --git a/ctags/json.go b/ctags/json.go deleted file mode 100644 index 980b54b..0000000 --- a/ctags/json.go +++ /dev/null
@@ -1,265 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ctags - -import ( - "bufio" - "bytes" - "encoding/json" - "fmt" - "io" - "log" - "os" - "os/exec" - "runtime" - "strings" - "sync" -) - -const debug = false - -type ctagsProcess struct { - cmd *exec.Cmd - in io.WriteCloser - out *scanner - outPipe io.ReadCloser -} - -func newProcess(bin string) (*ctagsProcess, error) { - opt := "default" - if runtime.GOOS == "linux" { - opt = "sandbox" - } - - cmd := exec.Command(bin, "--_interactive="+opt, "--fields=*") - in, err := cmd.StdinPipe() - if err != nil { - return nil, err - } - - out, err := cmd.StdoutPipe() - if err != nil { - in.Close() - return nil, err - } - cmd.Stderr = os.Stderr - proc := ctagsProcess{ - cmd: cmd, - in: in, - out: &scanner{r: bufio.NewReaderSize(out, 4096)}, - outPipe: out, - } - - if err := cmd.Start(); err != nil { - return nil, err - } - - var init reply - if err := proc.read(&init); err != nil { - return nil, err - } - - return &proc, nil -} - -func (p *ctagsProcess) Close() { - p.cmd.Process.Kill() - p.outPipe.Close() - p.in.Close() -} - -func (p *ctagsProcess) read(rep *reply) error { - if !p.out.Scan() { - // Some errors do not kill the parser. We would deadlock if we waited - // for the process to exit. 
- err := p.out.Err() - p.Close() - return err - } - if debug { - log.Printf("read %q", p.out.Bytes()) - } - - // See https://github.com/universal-ctags/ctags/issues/1493 - if bytes.Equal([]byte("(null)"), p.out.Bytes()) { - return nil - } - - err := json.Unmarshal(p.out.Bytes(), rep) - if err != nil { - return fmt.Errorf("unmarshal(%q): %v", p.out.Bytes(), err) - } - return nil -} - -func (p *ctagsProcess) post(req *request, content []byte) error { - body, err := json.Marshal(req) - if err != nil { - return err - } - body = append(body, '\n') - if debug { - log.Printf("post %q", body) - } - - if _, err = p.in.Write(body); err != nil { - return err - } - _, err = p.in.Write(content) - if debug { - log.Println(string(content)) - } - return err -} - -type request struct { - Command string `json:"command"` - Filename string `json:"filename"` - Size int `json:"size"` -} - -type reply struct { - // Init - Typ string `json:"_type"` - Name string `json:"name"` - Version string `json:"version"` - - // completed - Command string `json:"command"` - - // Ignore pattern: we don't use it and universal-ctags - // sometimes generates 'false' as value. 
- Path string `json:"path"` - Language string `json:"language"` - Line int `json:"line"` - Kind string `json:"kind"` - End int `json:"end"` - Scope string `json:"scope"` - ScopeKind string `json:"scopeKind"` - Access string `json:"access"` - Signature string `json:"signature"` -} - -func (p *ctagsProcess) Parse(name string, content []byte) ([]*Entry, error) { - req := request{ - Command: "generate-tags", - Size: len(content), - Filename: name, - } - - if err := p.post(&req, content); err != nil { - return nil, err - } - - var es []*Entry - for { - var rep reply - if err := p.read(&rep); err != nil { - return nil, err - } - if rep.Typ == "completed" { - break - } - - e := Entry{ - Sym: rep.Name, - Path: rep.Path, - Line: rep.Line, - Kind: rep.Kind, - Language: rep.Language, - } - - es = append(es, &e) - } - - return es, nil -} - -// scanner is like bufio.Scanner but skips long lines instead of returning -// bufio.ErrTooLong. -// -// Additionally it will skip empty lines. -type scanner struct { - r *bufio.Reader - line []byte - err error -} - -func (s *scanner) Scan() bool { - if s.err != nil { - return false - } - - var ( - err error - line []byte - ) - - for err == nil && len(line) == 0 { - line, err = s.r.ReadSlice('\n') - for err == bufio.ErrBufferFull { - // make line empty so we ignore it - line = nil - _, err = s.r.ReadSlice('\n') - } - line = bytes.TrimSuffix(line, []byte{'\n'}) - line = bytes.TrimSuffix(line, []byte{'\r'}) - } - - s.line, s.err = line, err - return len(line) > 0 -} - -func (s *scanner) Bytes() []byte { - return s.line -} - -func (s *scanner) Err() error { - if s.err == io.EOF { - return nil - } - return s.err -} - -type Parser interface { - Parse(name string, content []byte) ([]*Entry, error) -} - -type lockedParser struct { - p Parser - l sync.Mutex -} - -func (lp *lockedParser) Parse(name string, content []byte) ([]*Entry, error) { - lp.l.Lock() - defer lp.l.Unlock() - return lp.p.Parse(name, content) -} - -// NewParser creates a parser 
that is implemented by the given -// universal-ctags binary. The parser is safe for concurrent use. -func NewParser(bin string) (Parser, error) { - if strings.Contains(bin, "universal-ctags") { - // todo: restart, parallelization. - proc, err := newProcess(bin) - if err != nil { - return nil, err - } - return &lockedParser{p: proc}, nil - } - - log.Fatal("not implemented") - return nil, nil -}
diff --git a/ctags/json_test.go b/ctags/json_test.go deleted file mode 100644 index c6f1d3c..0000000 --- a/ctags/json_test.go +++ /dev/null
@@ -1,143 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ctags - -import ( - "bufio" - "os/exec" - "reflect" - "strings" - "testing" - - "github.com/google/go-cmp/cmp" -) - -func TestJSON(t *testing.T) { - if _, err := exec.LookPath("universal-ctags"); err != nil { - t.Skip(err) - } - - p, err := newProcess("universal-ctags") - if err != nil { - t.Fatal("newProcess", err) - } - - defer p.Close() - - java := ` -package io.zoekt; -import java.util.concurrent.Future; -class Back implements Future extends Frob { - public static int BLA = 1; - public int member; - public Back() { - member = 2; - } - public int method() { - member++; - } -} -` - name := "io/zoekt/Back.java" - got, err := p.Parse(name, []byte(java)) - if err != nil { - t.Errorf("Process: %v", err) - } - - want := []*Entry{ - { - Sym: "io.zoekt", - Kind: "package", - Language: "Java", - Path: "io/zoekt/Back.java", - Line: 2, - }, - { - Sym: "Back", - Path: "io/zoekt/Back.java", - Line: 4, - Language: "Java", - Kind: "class", - }, - - { - Sym: "BLA", - Path: "io/zoekt/Back.java", - Line: 5, - Kind: "field", - Language: "Java", - }, - { - Sym: "member", - Path: "io/zoekt/Back.java", - Line: 6, - Language: "Java", - Kind: "field", - }, - { - Sym: "Back", - Path: "io/zoekt/Back.java", - Language: "Java", - Line: 7, - Kind: "method", - }, - { - Sym: "method", - Language: "Java", - Path: "io/zoekt/Back.java", - Line: 10, - Kind: 
"method", - }, - } - - for i := range want { - if !reflect.DeepEqual(got[i], want[i]) { - t.Fatalf("got %#v, want %#v", got[i], want[i]) - } - } -} - -func TestScanner(t *testing.T) { - size := 20 - - input := strings.Join([]string{ - "aaaaaaaaa", - strings.Repeat("B", 3*size+3), - strings.Repeat("C", size) + strings.Repeat("D", size+1), - "", - strings.Repeat("e", size-1), - "f\r", - "gg", - }, "\n") - want := []string{ - "aaaaaaaaa", - strings.Repeat("e", size-1), - "f", - "gg", - } - - var got []string - r := &scanner{r: bufio.NewReaderSize(strings.NewReader(input), size)} - for r.Scan() { - got = append(got, string(r.Bytes())) - } - if err := r.Err(); err != nil { - t.Fatal(err) - } - - if !cmp.Equal(got, want) { - t.Errorf("mismatch (-want +got):\n%s", cmp.Diff(want, got)) - } -}
diff --git a/ctags/parse.go b/ctags/parse.go deleted file mode 100644 index d199428..0000000 --- a/ctags/parse.go +++ /dev/null
// Entry describes one symbol definition reported by ctags.
type Entry struct {
	Sym      string // symbol name
	Path     string // file the symbol was found in
	Line     int    // line number of the definition
	Kind     string // ctags kind field, copied verbatim
	Language string // not set by Parse
	// Parent and ParentType come from a "class:" or "enum:" extension field.
	Parent     string
	ParentType string

	// FileLimited is set when the line carries a "file:" extension field.
	FileLimited bool
}

// Parse parses a single line of exuberant "ctags -n" output.
//
// The line must hold at least four tab-separated fields: symbol, path, line
// number and kind. The last two characters of the line-number field are
// dropped before it is parsed as a decimal integer. An error is returned for
// lines with too few fields or an unparsable line number.
func Parse(in string) (*Entry, error) {
	fields := strings.Split(in, "\t")

	// fields[3] (the kind) is read unconditionally below, so four fields are
	// the minimum; checking for only three would panic on a short line.
	if len(fields) < 4 {
		return nil, fmt.Errorf("too few fields: %q", in)
	}

	lstr := fields[2]
	if len(lstr) < 2 {
		return nil, fmt.Errorf("got %q for linenum field", lstr)
	}

	l, err := strconv.ParseInt(lstr[:len(lstr)-2], 10, 64)
	if err != nil {
		return nil, err
	}

	e := Entry{
		Sym:  fields[0],
		Path: fields[1],
		Line: int(l),
		Kind: fields[3],
	}

field:
	for _, f := range fields[3:] {
		if f == "file:" {
			e.FileLimited = true
		}
		for _, p := range []string{"class", "enum"} {
			if strings.HasPrefix(f, p+":") {
				e.Parent = strings.TrimPrefix(f, p+":")
				e.ParentType = p
				continue field
			}
		}
	}
	return &e, nil
}
diff --git a/ctags/parse_test.go b/ctags/parse_test.go deleted file mode 100644 index 30f41a1..0000000 --- a/ctags/parse_test.go +++ /dev/null
@@ -1,61 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ctags - -import ( - "reflect" - "testing" -) - -func TestParse(t *testing.T) { - type testcase struct { - in string - out *Entry - } - cases := []testcase{ - { - `ABBREV_SHA ./gitiles-servlet/src/main/java/com/google/gitiles/CommitData.java 59;" e enum:CommitData.Field file:`, - &Entry{ - Sym: "ABBREV_SHA", - Path: "./gitiles-servlet/src/main/java/com/google/gitiles/CommitData.java", - Line: 59, - Kind: "e", - Parent: "CommitData.Field", - ParentType: "enum", - FileLimited: true, - }, - }, - { - `ACCESS_ATTRIBUTE ./gitiles-servlet/src/main/java/com/google/gitiles/CommitData.java 55;" f class:BaseServlet file:`, - &Entry{ - Sym: "ACCESS_ATTRIBUTE", - Path: "./gitiles-servlet/src/main/java/com/google/gitiles/CommitData.java", - Line: 55, - Kind: "f", - Parent: "BaseServlet", - ParentType: "class", - FileLimited: true, - }, - }, - } - for _, c := range cases { - e, err := Parse(c.in) - if err != nil && c.out != nil { - t.Errorf("Parse(%s): %v", c.in, err) - } else if !reflect.DeepEqual(c.out, e) { - t.Errorf("Parse(%s): got %#v, want %#v", c.in, e, c.out) - } - } -}
diff --git a/doc/ctags.md b/doc/ctags.md deleted file mode 100644 index 36b4014..0000000 --- a/doc/ctags.md +++ /dev/null
@@ -1,41 +0,0 @@ - -CTAGS -===== - -Ctags generates indices of symbol definitions in source files. It -started its life as part of BSD Unix, but there are several more -modern flavors. Zoekt supports both [exuberant -ctags](http://ctags.sourceforge.net/) and -[universal-ctags](https://github.com/universal-ctags). - -It is strongly recommended to use Universal Ctags, [version -`db3d9a6`](https://github.com/universal-ctags/ctags/commit/4ff09da9b0a36a9e75c92f4be05d476b35b672cd) -or newer, running on the Linux platform. - -From this version on, universal ctags will be called using seccomp, -which guarantees that security problems in ctags cannot escalate to -access to the indexing machine. - -Ubuntu, Debian and Arch provide universal ctags with seccomp support -compiled in. Zoekt expects the `universal-ctags` binary to be on -`$PATH`. Note: only Ubuntu names the binary `universal-ctags`, while -most distributions name it `ctags`. - -Use the following invocation to compile and install universal-ctags: - -``` -sudo apt-get install \ - pkg-config autoconf \ - libseccomp-dev libseccomp \ - libjansson-dev libjansson - -./autogen.sh -LDFLAGS=-static ./configure --enable-json --enable-seccomp -make -j4 - -# create tarball -NAME=ctags-$(date --iso-8601=minutes | tr -d ':' | sed 's|\+.*$||')-$(git show --pretty=format:%h -q) -mkdir ${NAME} -cp ctags ${NAME}/universal-ctags -tar zcf ${NAME}.tar.gz ${NAME}/ -```
diff --git a/doc/design.md b/doc/design.md deleted file mode 100644 index df3bfea..0000000 --- a/doc/design.md +++ /dev/null
@@ -1,328 +0,0 @@ - - -OBJECTIVE -========= - -Provide full text code search for git based corpuses. - -Goals: - -* sub-50ms results on large codebases, such as Android (~2G text) or - Chrome - -* works well on a single standard Linux machine, with stable storage on SSD - -* search multiple repositories and multiple branches. - -* provide rich query language, with boolean operators - -* integrate with Gerrit/Gitiles code review/browsing system - - -SEARCHING AND INDEXING -====================== - - -Positional trigrams -------------------- - -We build an index of ngrams (n=3), where we store the offset of each -ngram's occurrence within a file. For example, if the corpus is "banana" -then we generate the index - - "ban": 0 - "ana": 1,3 - "nan": 2 - -If we are searching for a string (eg. "The quick brown fox"), then we -look for two trigrams (eg. "The" and "fox"), and check that they are -found at the right distance apart. - -Regular expressions are handled by extracting normal strings from the regular -expressions. For example, to search for - - (Path|PathFragment).*=.*/usr/local - -we look for - - (AND (OR substr:"Path" substr:"PathFragment") substr:"/usr/local") - -and any documents thus found would be searched for the regular -expression. - -Compared to indexing 3-grams on a per-file basis, as described -[here](https://swtch.com/~rsc/regexp/regexp4.html), there are some advantages: - -* for each substring, we only have to intersect just a couple of posting-lists: - one for the beginning, and one for the end. - -* Since we touch few posting lists per query, they can be stored on - slower media, such as SSD. - -* we can select any pair of trigrams from the pattern for which the - number of matches is minimal. For example, we could search for "qui" - rather than "the". - -There are some downsides compared to trigrams: - -* The index is large. Empirically, it is about 3x the corpus size, composed of - 2x (offsets), and 1x (original content). 
However, since we have to look at - just a limited number of ngrams, we don't have to keep the index in memory. - -Compared to [suffix -arrays](https://blog.nelhage.com/2015/02/regular-expression-search-with-suffix-arrays/), -there are the following advantages: - -* The index construction is straightforward, and can easily be made - incremental. - -* Since the posting lists for a trigram can be stored on SSD, - searching with positional trigrams only requires 1.2x corpus size of - RAM. - -* All the matches are returned in document order. This makes it - straightforward to process compound boolean queries with AND and OR. - -Downsides compared to suffix arrays: - -* there is no way to transform regular expressions into index ranges into - the suffix array. - - -Case sensitivity ----------------- - -Code usually is searched without regard for case. In this case, when -we are looking for "abc", we look for occurrences of all the different -case variants, ie. {"abc", "Abc", "aBc", "ABc", ... }, and then -compare the candidate matches without regard for case. - - -UTF-8 ------ - -UTF-8 is the de facto encoding for Unicode. Zoekt assumes that files -are UTF-8 encoded. Characters have differing widths in UTF-8, so we -use rune offsets in the trigram index, and convert those back to bytes -with a lookup table: every 100 runes, we store the rune-index to -byte-index mapping. For corpuses that are completely ASCII (fairly -normal for source code), we short-circuit this lookup.
- - -Index format ------------- - -The index is organized in shards, where each shard is a file, laid out -such that it can be mmap'd efficiently. - -Each shard contains data for one code repository. The basic data in an -index shard are the following - - * file contents - * filenames - * the content posting lists (varint encoded) - * the filename posting lists (varint encoded) - * branch masks - * metadata (repository name, index format version, etc.) - -In practice, the shard size is about 3x the corpus (size). - -The format uses uint32 for all offsets, so the total size of a shard -should be below 4G. Given the size of the posting data, this caps -content size per shard at 1G. - -Currently, within a shard, a single goroutine searches all documents, -so the shard size determines the amount of parallelism, and large -repositories should be split across multiple shards to achieve good -performance. - -The metadata section contains a version number (which by convention is -also part of the file name of the shard). This provides a smooth -upgrade path across format versions: generate shards in the new -format, kill old search service, start new search service, delete old -shards. - - -Ranking -------- - -In the absence of advanced signals (e.g. pagerank on symbol references), -ranking options are limited: the following signals could be used for -ranking - - * number of atoms matched - * closeness to matches for other atoms - * quality of match: does match boundary coincide with a word boundary? - * file latest update time - * filename length - * tokenizer ranking: does a match fall in a comment or string literal? - * symbol ranking: is the match a symbol definition? - -For the latter, it is necessary to find symbol definitions and other -sections within files on indexing. Several (imperfect) programs to do -this already exist, eg. `ctags`.
- - -Query language --------------- - -Queries are stored as expression trees, using the following data -structure: - - Query: - Atom - | AND QueryList - | OR QueryList - | NOT Query - ; - - Atom: - ConstQuery - | SubStringQuery - | RegexpQuery - | RepoQuery - | BranchQuery - ; - -Both SubStringQuery and RegexpQuery can apply to either file names or -contents, and can optionally be case-insensitive. - -ConstQuery (match everything, or match nothing) is a useful construct -for partial evaluation of a query: for each index shard through which -we search, we partially evaluate the query, eg. when the query is - - and[substr:"needle" repo:"zoekt"] - -then we can rewrite the query to FALSE if we are looking at a shard -for repository "bazel", skipping the entire shard. - -Each query must have at least one positive atom. Negations can only -serve to prune results generated by positive atoms. - - -Query parsing -------------- - -Strings in the input language are considered regular expressions -but literal regular expressions are simplified to Substring queries, - - a.*b => regexp:"a.*b" - a\.b => substring:"a.b" - -leading modifiers select different types of atoms, eg. - - file:java => Substring_file:"java" - branch:master => Branch:"master" - -parentheses inside a string (possibly with escaped spaces) are -interpreted as regular expressions, otherwise they are used for grouping - - (abc def) => and[substring:"abc" substring:"def"] - (abc\ def) => regexp:"(abc def)" - -there is an implicit "AND" between elements of a parenthesized list. -There is an "OR" operator, which has lower priority than the implicit -"AND": - - ppp qqq or rrr sss => or[and[substring:"ppp" substring:"qqq"] and[substring:"rrr" substring:"sss"]] - - -GERRIT/GITILES INTEGRATION -========================== - -Gerrit is a popular system for code review on open source -projects. Its sister project Gitiles provides a browsing experience.
- -Any code search integration with Gerrit should be made available in -Gitiles. Gerrit/Gitiles has a complex ACL system, so a codesearch -solution for Gerrit/Gitiles should respect these ACLs. - -Since Gitiles knows the identity of the logged-in user, it can -construct search queries that respect ACLs, and even filter results -afterwards if necessary. In such a setup, only Gitiles is allowed to -talk to the search service, so it should be protected from general -access, e.g. by requiring authentication. - -A codesearch implementation for Gitiles would change Gitiles to show a -search box on pages relating to a repository. When searching, Gitiles -would also render the search results. The process is as follows: - - * On receiving a query, Gitiles finds the list of branches visible to the user - * Gitiles sends the raw query, along with branches and repository to the search service - * The search service parses the query, and embeds it as follows - - (AND original-query repo:REPO (OR "branch:visible-1" "branch:visible-2" .. )) - - * The search service returns the search results, leaving it to - gitiles to render them. Gitiles can apply any further filtering - as necessary. - - -SERVICE MANAGEMENT -================== - -The above details how indexing and searching works. A fully fledged -system also crawls repositories and (re)indexes them. Since the system -is designed to run on a single machine, we provide a service -management tool, with the following responsibilities: - - * Poll git hosting sites (eg. github.com, googlesource.com), to fetch new updates - * Reindex any changed repositories - * Run the webserver; and restart if it goes down for any reason - * Delete old webserver logs - - -Security --------- - -This section assumes that 'zoekt' is used as a public facing -webserver, indexing publicly available data, serving on HTTPS without -authentication. - -Since the UI is unauthenticated, there are no authentication secrets to steal. 
- -Since the code is public, there is no sensitive code to steal. - -This leaves us with the following sensitive data: - - * Credentials for accessing git repositories (eg. github access token) - * TLS server certificates - * Query logs - -The system handles the following untrusted data: - - * code in git repositories - * search queries - -Since 'zoekt' itself is written in Go, it does not have memory -security problems: at worst, a bug in the query parser would lead to a -crash. - -The code to index is handled by `ctags` for symbol detection. The -security risk this poses is mitigated by using seccomp-based -sandboxing. - - -Privacy -------- - -Webserver logs can contain privacy sensitive data (such as IP -addresses and search queries). For this reason, the service management -tool deletes them after a configurable period of time.
diff --git a/doc/faq.md b/doc/faq.md deleted file mode 100644 index c8d81f0..0000000 --- a/doc/faq.md +++ /dev/null
@@ -1,161 +0,0 @@ -# Frequently asked questions - -## Why codesearch? - -Software engineering is more about reading than writing code, and part -of this process is finding the code that you should read. If you are -working on a large project, then finding source code through -navigation quickly becomes inefficient. - -Search engines let you find interesting code much faster than browsing -code, in much the same way that search engines speed up finding things -on the internet. - -## Can you give an example? - -I had to implement SSH hashed hostkey checking on a whim recently, and -here is how I quickly zoomed into the relevant code using -[our public zoekt instance](http://cs.bazel.build): - -* [hash host ssh](http://cs.bazel.build/search?q=hash+host+ssh&num=50): more than 20k results in 750 files, in 3 seconds - -* [hash host r:openssh](http://cs.bazel.build/search?q=hash+host+r%3Aopenssh&num=50): 6k results in 114 files, in 20ms - -* [hash host r:openssh known_host](http://cs.bazel.build/search?q=hash+host+r%3Aopenssh+known_host&num=50): 4k results in 42 files, in 13ms - -The last query still yielded a substantial number of results, but the -function `hash_host` that I was looking for was the 3rd result from -the first file. - -## What features make a code search engine great? - -Often, you don't know exactly what you are looking for, until you -have found it. Code search is effective because you can formulate an -approximate query, and then refine it based on results you got. For -this to work, you need the following features: - -* Coverage: the code that interests you should be available for searching - -* Speed: search should return useful results quickly (sub-second), so - you can iterate on queries - -* Approximate queries: matching should be done case insensitively, on - arbitrary substrings, so we don't have to know what we are looking - for in advance.
- -* Filtering: we can winnow down results by composing more specific queries - -* Ranking: interesting results (eg. function definitions, whole word - matches) should be at the top. - -## How does `zoekt` provide for these? - -* Coverage: `zoekt` comes with tools to mirror parts of common Git - hosting sites. `cs.bazel.build` uses this to index most of the - Google authored open source software on github.com and - googlesource.com. - -* Speed: `zoekt` uses an index based on positional trigrams. For rare - strings, eg. `nienhuys`, this typically yields results in ~10ms if - the operating system caches are warm. - -* Approximate queries: `zoekt` supports substring patterns and regular - expressions, and can do case-insensitive matching on UTF-8 text. - -* Filtering: you can filter a query by adding extra atoms (eg. `f:\.go$` - limits to Go source code), and filter out terms with `-`, so - `\blinus\b -torvalds` finds the Linuses other than Linus Torvalds. - -* Ranking: zoekt uses - [ctags](https://github.com/universal-ctags/ctags) to find - declarations, and these are boosted in the search ranking. - - -## How does this compare to `grep -r`? - -Grep lets you find arbitrary substrings, but it doesn't scale to large -corpuses, and lacks filtering and ranking. - -## What about my IDE? - -If your project fits into your IDE, then that is great. -Unfortunately, loading projects into IDEs is slow, cumbersome, and not -supported by all projects. - -## What about the search on `github.com`? - -Github's search has great coverage, but unfortunately, its search -functionality doesn't support arbitrary substrings. For example, a -query [for part of my -surname](https://github.com/search?utf8=%E2%9C%93&q=nienhuy&type=Code) -does not turn up anything (except this document), while -[my complete -name](https://github.com/search?utf8=%E2%9C%93&q=nienhuys&type=Code) -does. - -## What about Etsy/Hound?
- -[Etsy/hound](https://github.com/etsy/hound) is a code search engine -which supports regular expressions over large corpuses, but it is about -10x slower than zoekt. Also, there is only rudimentary support for -filtering, and there is no symbol ranking. - -## What about livegrep? - -[livegrep](https://livegrep.com) is a code search engine which -supports regular expressions over large corpuses. However, due to its -indexing technique, it requires a lot of RAM and CPU. There is only -rudimentary support for filtering, and there is no symbol ranking. - -## How many resources does `zoekt` require? - -The search server should have local SSD to store the index file (which -is 3.5x the corpus size), and have at least 20% more RAM than the -corpus size. - -## Can I index multiple branches? - -Yes. You can index 64 branches (see also -https://github.com/google/zoekt/issues/32). Files that are identical -across branches take up space just once in the index. - -## How fast is the search? - -Rare strings are extremely fast to retrieve, for example `r:torvalds -crazy` (search "crazy" in the Linux kernel) typically takes [about -7-10ms on -cs.bazel.build](http://cs.bazel.build/search?q=r%3Atorvalds+crazy&num=70). - -The speed for common strings is dominated by how many results you want -to see. For example [r:torvalds license] can give some results -quickly, but producing [all 86k -results](http://cs.bazel.build/search?q=r%3Atorvalds+license&num=50000) -takes between 100ms and 1 second. Then, streaming the results to your -browser, and rendering the HTML takes several seconds. - -## How fast is the indexer? - -The Linux kernel (55K files, 545M data) takes about 160s to index on -my x250 laptop using a single thread. The process can be parallelized -for speedup. - -## What does [cs.bazel.build](https://cs.bazel.build/) run on? - -Currently, it runs on a single Google Cloud VM with 16 vCPUs, 60G RAM and an -attached physical SSD. - -## How does `zoekt` work?
- -In short, it splits up the file in trigrams (groups of 3 unicode -characters), and stores the offset of each occurrence. Substrings are -found by searching different trigrams from the query at the correct -distance apart. - -## I want to know more - -Some further background documentation - - * [Designdoc](design.md) for technical details - * [Godoc](https://godoc.org/github.com/google/zoekt) - * Gerrit 2016 user summit: [slides](https://storage.googleapis.com/gerrit-talks/summit/2016/zoekt.pdf) - * Gerrit 2017 user summit: [transcript](https://gitenterprise.me/2017/11/01/gerrit-user-summit-zoekt-code-search-engine/), [slides](https://storage.googleapis.com/gerrit-talks/summit/2017/Zoekt%20-%20improved%20codesearch.pdf), [video](https://www.youtube.com/watch?v=_-KTAvgJYdI)
diff --git a/doc/indexing.md b/doc/indexing.md deleted file mode 100644 index c6563c1..0000000 --- a/doc/indexing.md +++ /dev/null
@@ -1,15 +0,0 @@ - -# Configuration parameters - -Parameters are in the `zoekt` section of the git-config. - -* `name`: name of the repository, typically HOST/PATH, eg. `github.com/hanwen/usb`. - -* `web-url`: base URL for linking to files, commits, and the repository, eg. -`https://github.com/hanwen/usb` - -* `web-url-type`: type of URL, eg. github. Supported are github, - gitiles, gitweb and cgit. - -* `github-stars`, `github-forks`, `github-watchers`, - `github-subscribers`: counters for github interactions
diff --git a/eval.go b/eval.go deleted file mode 100644 index de32847..0000000 --- a/eval.go +++ /dev/null
@@ -1,504 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "context" - "fmt" - "log" - "regexp/syntax" - "sort" - "strings" - - "golang.org/x/net/trace" - - "github.com/google/zoekt/query" -) - -const maxUInt16 = 0xffff - -// DebugScore controls whether we collect data on match scores are -// constructed. Intended for use in tests. -var DebugScore = false - -func (m *FileMatch) addScore(what string, s float64) { - if DebugScore { - m.Debug += fmt.Sprintf("%s:%f, ", what, s) - } - m.Score += s -} - -func (d *indexData) simplify(in query.Q) query.Q { - eval := query.Map(in, func(q query.Q) query.Q { - if r, ok := q.(*query.Repo); ok { - return &query.Const{Value: strings.Contains(d.repoMetaData.Name, r.Pattern)} - } - if l, ok := q.(*query.Language); ok { - _, has := d.metaData.LanguageMap[l.Language] - if !has { - return &query.Const{Value: false} - } - } - return q - }) - return query.Simplify(eval) -} - -func (o *SearchOptions) SetDefaults() { - if o.ShardMaxMatchCount == 0 { - // We cap the total number of matches, so overly broad - // searches don't crash the machine. 
- o.ShardMaxMatchCount = 100000 - } - if o.TotalMaxMatchCount == 0 { - o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount - } - if o.ShardMaxImportantMatch == 0 { - o.ShardMaxImportantMatch = 10 - } - if o.TotalMaxImportantMatch == 0 { - o.TotalMaxImportantMatch = 10 * o.ShardMaxImportantMatch - } -} - -func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) { - copyOpts := *opts - opts = ©Opts - opts.SetDefaults() - importantMatchCount := 0 - - var res SearchResult - if len(d.fileNameIndex) == 0 { - return &res, nil - } - - select { - case <-ctx.Done(): - res.Stats.ShardsSkipped++ - return &res, nil - default: - } - - tr := trace.New("indexData.Search", d.file.Name()) - tr.LazyPrintf("opts: %+v", opts) - defer func() { - if sr != nil { - tr.LazyPrintf("num files: %d", len(sr.Files)) - tr.LazyPrintf("stats: %+v", sr.Stats) - } - if err != nil { - tr.LazyPrintf("error: %v", err) - tr.SetError() - } - tr.Finish() - }() - - q = d.simplify(q) - tr.LazyLog(q, true) - if c, ok := q.(*query.Const); ok && !c.Value { - return &res, nil - } - - if opts.EstimateDocCount { - res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) - return &res, nil - } - - q = query.Map(q, query.ExpandFileContent) - - mt, err := d.newMatchTree(q) - if err != nil { - return nil, err - } - - totalAtomCount := 0 - visitMatchTree(mt, func(t matchTree) { - totalAtomCount++ - }) - - cp := &contentProvider{ - id: d, - stats: &res.Stats, - } - - docCount := uint32(len(d.fileBranchMasks)) - lastDoc := int(-1) - -nextFileMatch: - for { - canceled := false - select { - case <-ctx.Done(): - canceled = true - default: - } - - nextDoc := mt.nextDoc() - if int(nextDoc) <= lastDoc { - nextDoc = uint32(lastDoc + 1) - } - if nextDoc >= docCount { - break - } - lastDoc = int(nextDoc) - - if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) || - (opts.ShardMaxImportantMatch > 0 && importantMatchCount >= 
opts.ShardMaxImportantMatch) { - res.Stats.FilesSkipped += d.repoListEntry.Stats.Documents - lastDoc - break - } - - res.Stats.FilesConsidered++ - mt.prepare(nextDoc) - - cp.setDocument(nextDoc) - - known := make(map[matchTree]bool) - for cost := costMin; cost <= costMax; cost++ { - v, ok := mt.matches(cp, cost, known) - if ok && !v { - continue nextFileMatch - } - - if cost == costMax && !ok { - log.Panicf("did not decide. Repo %s, doc %d, known %v", - d.repoMetaData.Name, nextDoc, known) - } - } - - fileMatch := FileMatch{ - Repository: d.repoMetaData.Name, - FileName: string(d.fileName(nextDoc)), - Checksum: d.getChecksum(nextDoc), - Language: d.languageMap[d.languages[nextDoc]], - } - - if s := d.subRepos[nextDoc]; s > 0 { - if s >= uint32(len(d.subRepoPaths)) { - log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) - } - path := d.subRepoPaths[s] - fileMatch.SubRepositoryPath = path - sr := d.repoMetaData.SubRepoMap[path] - fileMatch.SubRepositoryName = sr.Name - if idx := d.branchIndex(nextDoc); idx >= 0 { - fileMatch.Version = sr.Branches[idx].Version - } - } else { - idx := d.branchIndex(nextDoc) - if idx >= 0 { - fileMatch.Version = d.repoMetaData.Branches[idx].Version - } - } - - atomMatchCount := 0 - visitMatches(mt, known, func(mt matchTree) { - atomMatchCount++ - }) - finalCands := gatherMatches(mt, known) - - if len(finalCands) == 0 { - nm := d.fileName(nextDoc) - finalCands = append(finalCands, - &candidateMatch{ - caseSensitive: false, - fileName: true, - substrBytes: nm, - substrLowered: nm, - file: nextDoc, - runeOffset: 0, - byteOffset: 0, - byteMatchSz: uint32(len(nm)), - }) - } - fileMatch.LineMatches = cp.fillMatches(finalCands) - - maxFileScore := 0.0 - for i := range fileMatch.LineMatches { - if maxFileScore < fileMatch.LineMatches[i].Score { - maxFileScore = fileMatch.LineMatches[i].Score - } - - // Order by ordering in file. 
- fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) - } - - // Maintain ordering of input files. This - // strictly dominates the in-file ordering of - // the matches. - fileMatch.addScore("fragment", maxFileScore) - fileMatch.addScore("atom", float64(atomMatchCount)/float64(totalAtomCount)*scoreFactorAtomMatch) - - // Prefer earlier docs. - fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(nextDoc)/float64(len(d.boundaries)))) - fileMatch.addScore("shard-order", scoreShardRankFactor*float64(d.repoMetaData.Rank)/maxUInt16) - - if fileMatch.Score > scoreImportantThreshold { - importantMatchCount++ - } - fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) - sortMatchesByScore(fileMatch.LineMatches) - if opts.Whole { - fileMatch.Content = cp.data(false) - } - - res.Files = append(res.Files, fileMatch) - res.Stats.MatchCount += len(fileMatch.LineMatches) - res.Stats.FileCount++ - } - SortFilesByScore(res.Files) - - addRepo(&res, &d.repoMetaData) - for _, v := range d.repoMetaData.SubRepoMap { - addRepo(&res, v) - } - - visitMatchTree(mt, func(mt matchTree) { - if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { - atom.updateStats(&res.Stats) - } - }) - return &res, nil -} - -func addRepo(res *SearchResult, repo *Repository) { - if res.RepoURLs == nil { - res.RepoURLs = map[string]string{} - } - res.RepoURLs[repo.Name] = repo.FileURLTemplate - - if res.LineFragments == nil { - res.LineFragments = map[string]string{} - } - res.LineFragments[repo.Name] = repo.LineFragmentTemplate -} - -type sortByOffsetSlice []*candidateMatch - -func (m sortByOffsetSlice) Len() int { return len(m) } -func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } -func (m sortByOffsetSlice) Less(i, j int) bool { - return m[i].byteOffset < m[j].byteOffset -} - -// Gather matches from this document. 
This never returns a mixture of -// filename/content matches: if there are content matches, all -// filename matches are trimmed from the result. The matches are -// returned in document order and are non-overlapping. -func gatherMatches(mt matchTree, known map[matchTree]bool) []*candidateMatch { - var cands []*candidateMatch - visitMatches(mt, known, func(mt matchTree) { - if smt, ok := mt.(*substrMatchTree); ok { - cands = append(cands, smt.current...) - } - if rmt, ok := mt.(*regexpMatchTree); ok { - cands = append(cands, rmt.found...) - } - }) - - foundContentMatch := false - for _, c := range cands { - if !c.fileName { - foundContentMatch = true - break - } - } - - res := cands[:0] - for _, c := range cands { - if !foundContentMatch || !c.fileName { - res = append(res, c) - } - } - cands = res - - // Merge adjacent candidates. This guarantees that the matches - // are non-overlapping. - sort.Sort((sortByOffsetSlice)(cands)) - res = cands[:0] - for i, c := range cands { - if i == 0 { - res = append(res, c) - continue - } - last := res[len(res)-1] - lastEnd := last.byteOffset + last.byteMatchSz - end := c.byteOffset + c.byteMatchSz - if lastEnd >= c.byteOffset { - if end > lastEnd { - last.byteMatchSz = end - last.byteOffset - } - continue - } - - res = append(res, c) - } - - return res -} - -func (d *indexData) branchIndex(docID uint32) int { - mask := d.fileBranchMasks[docID] - idx := 0 - for mask != 0 { - if mask&0x1 != 0 { - return idx - } - idx++ - mask >>= 1 - } - return -1 -} - -// gatherBranches returns a list of branch names. 
-func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { - foundBranchQuery := false - var branches []string - - visitMatches(mt, known, func(mt matchTree) { - bq, ok := mt.(*branchQueryMatchTree) - if ok { - foundBranchQuery = true - branches = append(branches, - d.branchNames[uint(bq.mask)]) - } - }) - - if !foundBranchQuery { - mask := d.fileBranchMasks[docID] - id := uint32(1) - for mask != 0 { - if mask&0x1 != 0 { - branches = append(branches, d.branchNames[uint(id)]) - } - id <<= 1 - mask >>= 1 - } - } - return branches -} - -func (d *indexData) List(ctx context.Context, q query.Q) (rl *RepoList, err error) { - tr := trace.New("indexData.List", d.file.Name()) - defer func() { - if rl != nil { - tr.LazyPrintf("repos size: %d", len(rl.Repos)) - tr.LazyPrintf("crashes: %d", rl.Crashes) - } - if err != nil { - tr.LazyPrintf("error: %v", err) - tr.SetError() - } - tr.Finish() - }() - - q = d.simplify(q) - tr.LazyLog(q, true) - c, ok := q.(*query.Const) - - if !ok { - return nil, fmt.Errorf("List should receive Repo-only query") - } - - l := &RepoList{} - if c.Value { - l.Repos = append(l.Repos, &d.repoListEntry) - } - return l, nil -} - -// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If -// mt is equivalent to the input r, isEqual = true and the matchTree can be used -// in place of the regex r. If singleLine = true, then the matchTree and all -// its children only match terms on the same line. singleLine is used during -// recursion to decide whether to return an andLineMatchTree (singleLine = true) -// or a andMatchTree (singleLine = false). -func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { - // TODO - we could perhaps transform Begin/EndText in '\n'? 
- // TODO - we could perhaps transform CharClass in (OrQuery ) - // if there are just a few runes, and part of a OpConcat? - switch r.Op { - case syntax.OpLiteral: - s := string(r.Rune) - if len(s) >= minTextSize { - mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive}) - return mt, true, !strings.Contains(s, "\n"), err - } - case syntax.OpCapture: - return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) - - case syntax.OpPlus: - return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) - - case syntax.OpRepeat: - if r.Min == 1 { - return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) - } else if r.Min > 1 { - // (x){2,} can't be expressed precisely by the matchTree - mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) - return mt, false, singleLine, err - } - case syntax.OpConcat, syntax.OpAlternate: - var qs []matchTree - isEq := true - singleLine = true - for _, sr := range r.Sub { - if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { - if err != nil { - return nil, false, false, err - } - isEq = isEq && subIsEq - singleLine = singleLine && subSingleLine - qs = append(qs, sq) - } - } - if r.Op == syntax.OpConcat { - if len(qs) > 1 { - isEq = false - } - newQs := make([]matchTree, 0, len(qs)) - for _, q := range qs { - if _, ok := q.(*bruteForceMatchTree); ok { - continue - } - newQs = append(newQs, q) - } - if len(newQs) == 1 { - return newQs[0], isEq, singleLine, nil - } - if len(newQs) == 0 { - return &bruteForceMatchTree{}, isEq, singleLine, nil - } - if singleLine { - return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil - } - return &andMatchTree{newQs}, isEq, singleLine, nil - } - for _, q := range qs { - if _, ok := q.(*bruteForceMatchTree); ok { - return q, isEq, 
false, nil - } - } - if len(qs) == 0 { - return &noMatchTree{"const"}, isEq, false, nil - } - return &orMatchTree{qs}, isEq, false, nil - case syntax.OpStar: - if r.Sub[0].Op == syntax.OpAnyCharNotNL { - return &bruteForceMatchTree{}, false, true, nil - } - } - return &bruteForceMatchTree{}, false, false, nil -}
diff --git a/eval_test.go b/eval_test.go deleted file mode 100644 index ce16a60..0000000 --- a/eval_test.go +++ /dev/null
@@ -1,128 +0,0 @@ -// Copyright 2020 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "reflect" - "regexp/syntax" - "strings" - "testing" - - "github.com/google/zoekt/query" -) - -var opnames = map[syntax.Op]string{ - syntax.OpNoMatch: "OpNoMatch", - syntax.OpEmptyMatch: "OpEmptyMatch", - syntax.OpLiteral: "OpLiteral", - syntax.OpCharClass: "OpCharClass", - syntax.OpAnyCharNotNL: "OpAnyCharNotNL", - syntax.OpAnyChar: "OpAnyChar", - syntax.OpBeginLine: "OpBeginLine", - syntax.OpEndLine: "OpEndLine", - syntax.OpBeginText: "OpBeginText", - syntax.OpEndText: "OpEndText", - syntax.OpWordBoundary: "OpWordBoundary", - syntax.OpNoWordBoundary: "OpNoWordBoundary", - syntax.OpCapture: "OpCapture", - syntax.OpStar: "OpStar", - syntax.OpPlus: "OpPlus", - syntax.OpQuest: "OpQuest", - syntax.OpRepeat: "OpRepeat", - syntax.OpConcat: "OpConcat", - syntax.OpAlternate: "OpAlternate", -} - -func printRegexp(t *testing.T, r *syntax.Regexp, lvl int) { - t.Logf("%s%s ch: %d", strings.Repeat(" ", lvl), opnames[r.Op], len(r.Sub)) - for _, s := range r.Sub { - printRegexp(t, s, lvl+1) - } -} - -func substrMT(pattern string) matchTree { - d := &indexData{} - mt, _ := d.newSubstringMatchTree(&query.Substring{ - Pattern: pattern, - }) - return mt -} - -func TestRegexpParse(t *testing.T) { - type testcase struct { - in string - query matchTree - isEquivalent bool - } - - cases := []testcase{ - {"(foo|)bar", substrMT("bar"), 
false}, - {"(foo|)", &bruteForceMatchTree{}, false}, - {"(foo|bar)baz.*bla", &andMatchTree{[]matchTree{ - &orMatchTree{[]matchTree{ - substrMT("foo"), - substrMT("bar"), - }}, - substrMT("baz"), - substrMT("bla"), - }}, false}, - { - "^[a-z](People)+barrabas$", - &andMatchTree{[]matchTree{ - substrMT("People"), - substrMT("barrabas"), - }}, false, - }, - {"foo", substrMT("foo"), true}, - {"^foo", substrMT("foo"), false}, - {"(foo) (bar)", &andMatchTree{[]matchTree{substrMT("foo"), substrMT("bar")}}, false}, - {"(thread|needle|haystack)", &orMatchTree{[]matchTree{ - substrMT("thread"), - substrMT("needle"), - substrMT("haystack"), - }}, true}, - {"(foo)(?-s:.)*?(bar)", &andLineMatchTree{andMatchTree{[]matchTree{ - substrMT("foo"), - substrMT("bar"), - }}}, false}, - {"(foo)(?-s:.)*?[[:space:]](?-s:.)*?(bar)", &andMatchTree{[]matchTree{ - substrMT("foo"), - substrMT("bar"), - }}, false}, - {"(foo){2,}", substrMT("foo"), false}, - {"(...)(...)", &bruteForceMatchTree{}, false}, - } - - for _, c := range cases { - r, err := syntax.Parse(c.in, syntax.Perl) - if err != nil { - t.Errorf("Parse(%q): %v", c.in, err) - continue - } - d := indexData{} - q := query.Regexp{ - Regexp: r, - } - gotQuery, isEq, _, _ := d.regexpToMatchTreeRecursive(q.Regexp, 3, q.FileName, q.CaseSensitive) - if !reflect.DeepEqual(c.query, gotQuery) { - printRegexp(t, r, 0) - t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, gotQuery, c.query) - } - if isEq != c.isEquivalent { - printRegexp(t, r, 0) - t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, isEq, c.isEquivalent) - } - } -}
diff --git a/gitindex/clone.go b/gitindex/clone.go deleted file mode 100644 index 11b2c87..0000000 --- a/gitindex/clone.go +++ /dev/null
@@ -1,96 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "bytes" - "log" - "os" - "os/exec" - "path/filepath" - "sort" - - git "github.com/go-git/go-git/v5" - "github.com/go-git/go-git/v5/config" -) - -// CloneRepo clones one repository, adding the given config -// settings. It returns the bare repo directory. The `name` argument -// determines where the repo is stored relative to `destDir`. Returns -// the directory of the repository. -func CloneRepo(destDir, name, cloneURL string, settings map[string]string) (string, error) { - parent := filepath.Join(destDir, filepath.Dir(name)) - if err := os.MkdirAll(parent, 0o755); err != nil { - return "", err - } - - repoDest := filepath.Join(parent, filepath.Base(name)+".git") - if _, err := os.Lstat(repoDest); err == nil { - return "", nil - } - - var keys []string - for k := range settings { - keys = append(keys, k) - } - sort.Strings(keys) - - var config []string - for _, k := range keys { - if settings[k] != "" { - config = append(config, "--config", k+"="+settings[k]) - } - } - - cmd := exec.Command( - "git", "clone", "--bare", "--verbose", "--progress", - ) - cmd.Args = append(cmd.Args, config...) 
- cmd.Args = append(cmd.Args, cloneURL, repoDest) - - // Prevent prompting - cmd.Stdin = &bytes.Buffer{} - log.Println("running:", cmd.Args) - if err := cmd.Run(); err != nil { - return "", err - } - - if err := setFetch(repoDest, "origin", "+refs/heads/*:refs/heads/*"); err != nil { - log.Printf("addFetch: %v", err) - } - return repoDest, nil -} - -func setFetch(repoDir, remote, refspec string) error { - repo, err := git.PlainOpen(repoDir) - if err != nil { - return err - } - - cfg, err := repo.Config() - if err != nil { - return err - } - - rm := cfg.Remotes[remote] - if rm != nil { - rm.Fetch = []config.RefSpec{config.RefSpec(refspec)} - } - if err := repo.Storer.SetConfig(cfg); err != nil { - return err - } - - return nil -}
diff --git a/gitindex/clone_test.go b/gitindex/clone_test.go deleted file mode 100644 index 8445416..0000000 --- a/gitindex/clone_test.go +++ /dev/null
@@ -1,63 +0,0 @@ -// Copyright 2019 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "io/ioutil" - "os" - "os/exec" - "testing" - - git "github.com/go-git/go-git/v5" -) - -func TestSetRemote(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(dir) - script := `mkdir orig -cd orig -git init -cd .. -git clone orig/.git clone.git -` - - cmd := exec.Command("/bin/sh", "-euxc", script) - cmd.Dir = dir - - if out, err := cmd.CombinedOutput(); err != nil { - t.Fatalf("execution error: %v, output %s", err, out) - } - - r := dir + "/clone.git" - if err := setFetch(r, "origin", "+refs/heads/*:refs/heads/*"); err != nil { - t.Fatalf("addFetch: %v", err) - } - - repo, err := git.PlainOpen(r) - if err != nil { - t.Fatal("PlainOpen", err) - } - - rm, err := repo.Remote("origin") - if err != nil { - t.Fatal("Remote", err) - } - if got, want := rm.Config().Fetch[0].String(), "+refs/heads/*:refs/heads/*"; got != want { - t.Fatalf("got %q want %q", got, want) - } -}
diff --git a/gitindex/delete.go b/gitindex/delete.go deleted file mode 100644 index f53e1af..0000000 --- a/gitindex/delete.go +++ /dev/null
@@ -1,41 +0,0 @@ -package gitindex - -import ( - "fmt" - "log" - "net/url" - "os" - "path/filepath" -) - -// DeleteRepos deletes stale repos under a specific path in disk. The `names` -// argument stores names of repos retrieved from the git hosting site -// and is used along with the `filter` argument to decide on repo deletion. -func DeleteRepos(baseDir string, urlPrefix *url.URL, names map[string]struct{}, filter *Filter) error { - paths, err := ListRepos(baseDir, urlPrefix) - if err != nil { - return err - } - var toDelete []string - for _, p := range paths { - _, exists := names[p] - if filter.Include(filepath.Base(p)) && !exists { - toDelete = append(toDelete, p) - } - } - - if len(toDelete) > 0 { - log.Printf("deleting repos %v", toDelete) - } - - var errs []string - for _, d := range toDelete { - if err := os.RemoveAll(filepath.Join(baseDir, d)); err != nil { - errs = append(errs, err.Error()) - } - } - if len(errs) > 0 { - return fmt.Errorf("errors: %v", errs) - } - return nil -}
diff --git a/gitindex/delete_test.go b/gitindex/delete_test.go deleted file mode 100644 index 381e6e0..0000000 --- a/gitindex/delete_test.go +++ /dev/null
@@ -1,86 +0,0 @@ -package gitindex - -import ( - "io/ioutil" - "net/url" - "os" - "path/filepath" - "reflect" - "testing" -) - -func TestDeleteRepos(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - if err := createSubmoduleRepo(dir); err != nil { - t.Error("createSubmoduleRepo", err) - } - - reposBefore, err := FindGitRepos(dir) - if err != nil { - t.Error("FindGitRepos", err) - } - - gotBefore := map[string]struct{}{} - for _, r := range reposBefore { - p, err := filepath.Rel(dir, r) - if err != nil { - t.Fatalf("Relative: %v", err) - } - - gotBefore[p] = struct{}{} - } - - wantBefore := map[string]struct{}{ - "gerrit.googlesource.com/bdir.git": {}, - "gerrit.googlesource.com/sub/bdir.git": {}, - "adir/.git": {}, - "bdir/.git": {}, - "gerrit.googlesource.com/adir.git": {}, - } - - if !reflect.DeepEqual(gotBefore, wantBefore) { - t.Fatalf("got %v want %v", gotBefore, wantBefore) - } - - aURL, _ := url.Parse("http://gerrit.googlesource.com") - aURL.Path = "sub" - names := map[string]struct{}{ - "bdir/.git": {}, - "gerrit.googlesource.com/adir.git": {}, - } - filter, _ := NewFilter("", "") - - err = DeleteRepos(dir, aURL, names, filter) - if err != nil { - t.Fatalf("DeleteRepos: %T", err) - } - reposAfter, err := FindGitRepos(dir) - if err != nil { - t.Error("FindGitRepos", err) - } - - gotAfter := map[string]struct{}{} - for _, r := range reposAfter { - p, err := filepath.Rel(dir, r) - if err != nil { - t.Fatalf("Relative: %v", err) - } - - gotAfter[p] = struct{}{} - } - wantAfter := map[string]struct{}{ - "gerrit.googlesource.com/bdir.git": {}, - "adir/.git": {}, - "bdir/.git": {}, - "gerrit.googlesource.com/adir.git": {}, - } - - if !reflect.DeepEqual(gotAfter, wantAfter) { - t.Errorf("got %v want %v", gotAfter, wantAfter) - } -}
diff --git a/gitindex/filter.go b/gitindex/filter.go deleted file mode 100644 index eba82f9..0000000 --- a/gitindex/filter.go +++ /dev/null
@@ -1,58 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import "regexp" - -// Filter is a include/exclude filter to be used for repo names. -type Filter struct { - inc, exc *regexp.Regexp -} - -// Include returns true if the name passes the filter. -func (f *Filter) Include(name string) bool { - if f.inc != nil { - if !f.inc.MatchString(name) { - return false - } - } - if f.exc != nil { - if f.exc.MatchString(name) { - return false - } - } - return true -} - -// NewFilter creates a new filter. -func NewFilter(includeRegex, excludeRegex string) (*Filter, error) { - f := &Filter{} - var err error - if includeRegex != "" { - f.inc, err = regexp.Compile(includeRegex) - - if err != nil { - return nil, err - } - } - if excludeRegex != "" { - f.exc, err = regexp.Compile(excludeRegex) - if err != nil { - return nil, err - } - } - - return f, nil -}
diff --git a/gitindex/index.go b/gitindex/index.go deleted file mode 100644 index 5715406..0000000 --- a/gitindex/index.go +++ /dev/null
@@ -1,521 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package gitindex provides functions for indexing Git repositories. -package gitindex - -import ( - "bytes" - "fmt" - "io" - "log" - "math" - "net/url" - "os" - "path/filepath" - "sort" - "strconv" - "strings" - "time" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" - - "github.com/go-git/go-git/v5/config" - "github.com/go-git/go-git/v5/plumbing" - "github.com/go-git/go-git/v5/plumbing/object" - - git "github.com/go-git/go-git/v5" -) - -// RepoModTime returns the time of last fetch of a git repository. -func RepoModTime(dir string) (time.Time, error) { - var last time.Time - refDir := filepath.Join(dir, "refs") - if _, err := os.Lstat(refDir); err == nil { - if err := filepath.Walk(refDir, - func(_ string, fi os.FileInfo, _ error) error { - if !fi.IsDir() && last.Before(fi.ModTime()) { - last = fi.ModTime() - } - return nil - }); err != nil { - return last, err - } - } - - // git gc compresses refs into the following file: - for _, fn := range []string{"info/refs", "packed-refs"} { - if fi, err := os.Lstat(filepath.Join(dir, fn)); err == nil && !fi.IsDir() && last.Before(fi.ModTime()) { - last = fi.ModTime() - } - } - - return last, nil -} - -// FindGitRepos finds directories holding git repositories below the -// given directory. It will find both bare and the ".git" dirs in -// non-bare repositories. 
It returns the full path including the dir -// passed in. -func FindGitRepos(dir string) ([]string, error) { - arg, err := filepath.Abs(dir) - if err != nil { - return nil, err - } - var dirs []string - if err := filepath.Walk(arg, func(name string, fi os.FileInfo, err error) error { - // Best-effort, ignore filepath.Walk failing - if err != nil { - return nil - } - - if fi, err := os.Lstat(filepath.Join(name, ".git")); err == nil && fi.IsDir() { - dirs = append(dirs, filepath.Join(name, ".git")) - return filepath.SkipDir - } - - if !strings.HasSuffix(name, ".git") || !fi.IsDir() { - return nil - } - - fi, err = os.Lstat(filepath.Join(name, "objects")) - if err != nil || !fi.IsDir() { - return nil - } - - dirs = append(dirs, name) - return filepath.SkipDir - }); err != nil { - return nil, err - } - - return dirs, nil -} - -// setTemplates fills in URL templates for known git hosting -// sites. -func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { - repo.URL = u.String() - switch typ { - case "gitiles": - /// eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 - repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" - repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}" - repo.LineFragmentTemplate = "#{{.LineNumber}}" - case "github": - // eg. 
https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 - repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" - repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}" - repo.LineFragmentTemplate = "#L{{.LineNumber}}" - case "cgit": - // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 - repo.CommitURLTemplate = u.String() + "/commit/?id={{.Version}}" - repo.FileURLTemplate = u.String() + "/tree/{{.Path}}/?id={{.Version}}" - repo.LineFragmentTemplate = "#n{{.LineNumber}}" - case "gitweb": - // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 - repo.FileURLTemplate = u.String() + ";a=blob;f={{.Path}};hb={{.Version}}" - repo.CommitURLTemplate = u.String() + ";a=commit;h={{.Version}}" - repo.LineFragmentTemplate = "#l{{.LineNumber}}" - case "source.bazel.build": - // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 - // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 - repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" - repo.FileURLTemplate = u.String() + "/+/{{.Version}}:{{.Path}}" - repo.LineFragmentTemplate = ";l={{.LineNumber}}" - case "bitbucket-server": - // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc - // https://<bitbucketserver-host>/projects/<project>/repos/<repo>/browse/<file>?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc - repo.CommitURLTemplate = u.String() + "/commits/{{.Version}}" - repo.FileURLTemplate = u.String() + "/{{.Path}}?at={{.Version}}" - repo.LineFragmentTemplate = "#{{.LineNumber}}" - case "gitlab": - repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" - repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}" - repo.LineFragmentTemplate = 
"#L{{.LineNumber}}" - default: - return fmt.Errorf("URL scheme type %q unknown", typ) - } - return nil -} - -// getCommit returns a tree object for the given reference. -func getCommit(repo *git.Repository, prefix, ref string) (*object.Commit, error) { - sha1, err := repo.ResolveRevision(plumbing.Revision(ref)) - // ref might be a branch name (e.g. "master") add branch prefix and try again. - if err != nil { - sha1, err = repo.ResolveRevision(plumbing.Revision(filepath.Join(prefix, ref))) - } - if err != nil { - return nil, err - } - - commitObj, err := repo.CommitObject(*sha1) - if err != nil { - return nil, err - } - return commitObj, nil -} - -func configLookupRemoteURL(cfg *config.Config, key string) string { - rc := cfg.Remotes[key] - if rc == nil || len(rc.URLs) == 0 { - return "" - } - return rc.URLs[0] -} - -func setTemplatesFromConfig(desc *zoekt.Repository, repoDir string) error { - repo, err := git.PlainOpen(repoDir) - if err != nil { - return err - } - - cfg, err := repo.Config() - if err != nil { - return err - } - - sec := cfg.Raw.Section("zoekt") - - webURLStr := sec.Options.Get("web-url") - webURLType := sec.Options.Get("web-url-type") - - if webURLType != "" && webURLStr != "" { - webURL, err := url.Parse(webURLStr) - if err != nil { - return err - } - if err := setTemplates(desc, webURL, webURLType); err != nil { - return err - } - } - - name := sec.Options.Get("name") - if name != "" { - desc.Name = name - } else { - remoteURL := configLookupRemoteURL(cfg, "origin") - if remoteURL == "" { - return nil - } - u, err := url.Parse(remoteURL) - if err != nil { - return err - } - if err := SetTemplatesFromOrigin(desc, u); err != nil { - return err - } - } - - if desc.RawConfig == nil { - desc.RawConfig = map[string]string{} - } - for _, o := range sec.Options { - desc.RawConfig[o.Key] = o.Value - } - - // Ranking info. 
- - // Github: - traction := 0 - for _, s := range []string{"github-stars", "github-forks", "github-watchers", "github-subscribers"} { - f, err := strconv.Atoi(sec.Options.Get(s)) - if err == nil { - traction += f - } - } - - if strings.Contains(desc.Name, "googlesource.com/") && traction == 0 { - // Pretend everything on googlesource.com has 1000 - // github stars. - traction = 1000 - } - - if traction > 0 { - l := math.Log(float64(traction)) - desc.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000) - } - - return nil -} - -// SetTemplatesFromOrigin fills in templates based on the origin URL. -func SetTemplatesFromOrigin(desc *zoekt.Repository, u *url.URL) error { - desc.Name = filepath.Join(u.Host, strings.TrimSuffix(u.Path, ".git")) - - if strings.HasSuffix(u.Host, ".googlesource.com") { - return setTemplates(desc, u, "gitiles") - } else if u.Host == "github.com" { - u.Path = strings.TrimSuffix(u.Path, ".git") - return setTemplates(desc, u, "github") - } else { - return fmt.Errorf("unknown git hosting site %q", u) - } -} - -// The Options structs controls details of the indexing process. -type Options struct { - // The repository to be indexed. - RepoDir string - - // If set, follow submodule links. This requires RepoCacheDir to be set. - Submodules bool - - // If set, skip indexing if the existing index shard is newer - // than the refs in the repository. - Incremental bool - - // Don't error out if some branch is missing - AllowMissingBranch bool - - // Specifies the root of a Repository cache. Needed for submodule indexing. - RepoCacheDir string - - // Indexing options. - BuildOptions build.Options - - // Prefix of the branch to index, e.g. `remotes/origin`. - BranchPrefix string - - // List of branch names to index, e.g. 
[]string{"HEAD", "stable"} - Branches []string -} - -func expandBranches(repo *git.Repository, bs []string, prefix string) ([]string, error) { - var result []string - for _, b := range bs { - if b == "HEAD" { - ref, err := repo.Head() - if err != nil { - return nil, err - } - - result = append(result, strings.TrimPrefix(ref.Name().String(), prefix)) - continue - } - - if strings.Contains(b, "*") { - iter, err := repo.Branches() - if err != nil { - return nil, err - } - - defer iter.Close() - for { - ref, err := iter.Next() - if err == io.EOF { - break - } - if err != nil { - return nil, err - } - - name := ref.Name().Short() - if matched, err := filepath.Match(b, name); err != nil { - return nil, err - } else if !matched { - continue - } - - result = append(result, strings.TrimPrefix(name, prefix)) - } - continue - } - - result = append(result, b) - } - - return result, nil -} - -// IndexGitRepo indexes the git repository as specified by the options. -func IndexGitRepo(opts Options) error { - // Set max thresholds, since we use them in this function. - opts.BuildOptions.SetDefaults() - if opts.RepoDir == "" { - return fmt.Errorf("gitindex: must set RepoDir") - } - - opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir - repo, err := git.PlainOpen(opts.RepoDir) - if err != nil { - return err - } - - if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { - log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) - } - - repoCache := NewRepoCache(opts.RepoCacheDir) - - // branch => (path, sha1) => repo. 
- repos := map[fileKey]BlobLocation{} - - // fileKey => branches - branchMap := map[fileKey][]string{} - - // Branch => Repo => SHA1 - branchVersions := map[string]map[string]plumbing.Hash{} - - branches, err := expandBranches(repo, opts.Branches, opts.BranchPrefix) - if err != nil { - return err - } - for _, b := range branches { - commit, err := getCommit(repo, opts.BranchPrefix, b) - if err != nil { - if opts.AllowMissingBranch && err.Error() == "reference not found" { - continue - } - - return err - } - - opts.BuildOptions.RepositoryDescription.Branches = append(opts.BuildOptions.RepositoryDescription.Branches, zoekt.RepositoryBranch{ - Name: b, - Version: commit.Hash.String(), - }) - - tree, err := commit.Tree() - if err != nil { - return err - } - - files, subVersions, err := TreeToFiles(repo, tree, opts.BuildOptions.RepositoryDescription.URL, repoCache) - if err != nil { - return err - } - for k, v := range files { - repos[k] = v - branchMap[k] = append(branchMap[k], b) - } - - branchVersions[b] = subVersions - } - - if opts.Incremental && opts.BuildOptions.IncrementalSkipIndexing() { - return nil - } - - reposByPath := map[string]BlobLocation{} - for key, location := range repos { - reposByPath[key.SubRepoPath] = location - } - - opts.BuildOptions.SubRepositories = map[string]*zoekt.Repository{} - for path, location := range reposByPath { - tpl := opts.BuildOptions.RepositoryDescription - if path != "" { - tpl = zoekt.Repository{URL: location.URL.String()} - if err := SetTemplatesFromOrigin(&tpl, location.URL); err != nil { - log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, location.URL, err) - } - } - opts.BuildOptions.SubRepositories[path] = &tpl - } - for _, br := range opts.BuildOptions.RepositoryDescription.Branches { - for path, repo := range opts.BuildOptions.SubRepositories { - id := branchVersions[br.Name][path] - repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ - Name: br.Name, - Version: id.String(), - }) - } - } - - builder, 
err := build.NewBuilder(opts.BuildOptions) - if err != nil { - return err - } - defer builder.Finish() - - var names []string - fileKeys := map[string][]fileKey{} - for key := range repos { - n := key.FullPath() - fileKeys[n] = append(fileKeys[n], key) - names = append(names, n) - } - - sort.Strings(names) - names = uniq(names) - - for _, name := range names { - keys := fileKeys[name] - - for _, key := range keys { - brs := branchMap[key] - blob, err := repos[key].Repo.BlobObject(key.ID) - if err != nil { - return err - } - - if blob.Size > int64(opts.BuildOptions.SizeMax) && !opts.BuildOptions.IgnoreSizeMax(key.FullPath()) { - if err := builder.Add(zoekt.Document{ - SkipReason: fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.BuildOptions.SizeMax), - Name: key.FullPath(), - Branches: brs, - SubRepositoryPath: key.SubRepoPath, - }); err != nil { - return err - } - continue - } - - contents, err := blobContents(blob) - if err != nil { - return err - } - if err := builder.Add(zoekt.Document{ - SubRepositoryPath: key.SubRepoPath, - Name: key.FullPath(), - Content: contents, - Branches: brs, - }); err != nil { - return err - } - } - } - return builder.Finish() -} - -func blobContents(blob *object.Blob) ([]byte, error) { - r, err := blob.Reader() - if err != nil { - return nil, err - } - defer r.Close() - - var buf bytes.Buffer - buf.Grow(int(blob.Size)) - _, err = buf.ReadFrom(r) - if err != nil { - return nil, err - } - return buf.Bytes(), nil -} - -func uniq(ss []string) []string { - result := ss[:0] - var last string - for i, s := range ss { - if i == 0 || s != last { - result = append(result, s) - } - last = s - } - return result -}
diff --git a/gitindex/index_test.go b/gitindex/index_test.go deleted file mode 100644 index bc24796..0000000 --- a/gitindex/index_test.go +++ /dev/null
@@ -1,56 +0,0 @@ -// Copyright 2021 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "io/ioutil" - "os" - "os/exec" - "path/filepath" - "testing" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" -) - -func TestIndexEmptyRepo(t *testing.T) { - tmp, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir %v", err) - } - defer os.RemoveAll(tmp) - - cmd := exec.Command("git", "init", "-b", "master", "repo") - cmd.Dir = tmp - - if err := cmd.Run(); err != nil { - t.Fatalf("cmd.Run: %v", err) - } - - desc := zoekt.Repository{ - Name: "repo", - } - opts := Options{ - RepoDir: filepath.Join(tmp, "repo", ".git"), - BuildOptions: build.Options{ - RepositoryDescription: desc, - IndexDir: tmp, - }, - } - - if err := IndexGitRepo(opts); err != nil { - t.Fatalf("IndexGitRepo: %v", err) - } -}
diff --git a/gitindex/repocache.go b/gitindex/repocache.go deleted file mode 100644 index a8528f5..0000000 --- a/gitindex/repocache.go +++ /dev/null
@@ -1,117 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "net/url" - "os" - "path/filepath" - "strings" - "sync" - - git "github.com/go-git/go-git/v5" -) - -// RepoCache is a set of repositories on the file system, named and -// stored by URL. -type RepoCache struct { - baseDir string - - reposMu sync.Mutex - repos map[string]*git.Repository -} - -// NewRepoCache creates a new RepoCache rooted at the given directory. -func NewRepoCache(dir string) *RepoCache { - return &RepoCache{ - baseDir: dir, - repos: make(map[string]*git.Repository), - } -} - -func repoKeyStr(key string) string { - if !strings.HasSuffix(key, ".git") { - key += ".git" - } - return key -} - -func repoKey(u *url.URL) string { - return repoKeyStr(filepath.Join(u.Host, u.Path)) -} - -// Path returns the absolute path of the bare repository. -func Path(baseDir string, name string) string { - key := repoKeyStr(name) - return filepath.Join(baseDir, key) -} - -func (rc *RepoCache) Path(u *url.URL) string { - key := repoKey(u) - return filepath.Join(rc.baseDir, key) -} - -// Open opens a git repository. The cache retains a pointer to the -// repository. 
-func (rc *RepoCache) Open(u *url.URL) (*git.Repository, error) { - dir := rc.Path(u) - rc.reposMu.Lock() - defer rc.reposMu.Unlock() - - key := repoKey(u) - r := rc.repos[key] - if r != nil { - return r, nil - } - - repo, err := git.PlainOpen(dir) - if err == nil { - rc.repos[key] = repo - } - return repo, err -} - -// ListRepos returns paths to repos on disk that start with the given -// URL prefix. The paths are relative to baseDir, and typically -// include a ".git" suffix. -func ListRepos(baseDir string, u *url.URL) ([]string, error) { - key := filepath.Join(u.Host, u.Path) - - var paths []string - walk := func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if !info.IsDir() { - return nil - } - if strings.HasSuffix(path, ".git") && !strings.HasSuffix(path, "/.git") { - _, err := git.PlainOpen(path) - if err == nil { - p, err := filepath.Rel(baseDir, path) - if err == nil { - paths = append(paths, p) - } - } - return filepath.SkipDir - } - return nil - } - - if err := filepath.Walk(filepath.Join(baseDir, key), walk); err != nil { - return nil, err - } - return paths, nil -}
diff --git a/gitindex/repocache_test.go b/gitindex/repocache_test.go deleted file mode 100644 index 8d22d3a..0000000 --- a/gitindex/repocache_test.go +++ /dev/null
@@ -1,67 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "io/ioutil" - "net/url" - "os" - "reflect" - "sort" - "testing" -) - -func TestListReposNonExistent(t *testing.T) { - u, err := url.Parse("https://gerrit.googlesource.com/") - if err != nil { - t.Fatalf("url.Parse: %v", err) - } - - rs, err := ListRepos("/doesnotexist", u) - if err == nil { - t.Fatalf("ListRepos(/doesnotexist): %v", rs) - } -} - -func TestListRepos(t *testing.T) { - tmp, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir %v", err) - } - defer os.RemoveAll(tmp) - if err := createSubmoduleRepo(tmp); err != nil { - t.Fatalf("createSubmoduleRepo %v", err) - } - - u, err := url.Parse("https://gerrit.googlesource.com/") - if err != nil { - t.Fatalf("url.Parse: %v", err) - } - rs, err := ListRepos(tmp, u) - if err != nil { - t.Fatalf("ListRepos(%s): %v", u, err) - } - - want := []string{ - "gerrit.googlesource.com/adir.git", - "gerrit.googlesource.com/bdir.git", - "gerrit.googlesource.com/sub/bdir.git", - } - sort.Strings(rs) - - if !reflect.DeepEqual(rs, want) { - t.Fatalf("got %v, want %v", rs, want) - } -}
diff --git a/gitindex/submodule.go b/gitindex/submodule.go deleted file mode 100644 index 0836c89..0000000 --- a/gitindex/submodule.go +++ /dev/null
@@ -1,64 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "bytes" - - "github.com/go-git/go-git/v5/plumbing/format/config" -) - -// SubmoduleEntry represent one entry in a .gitmodules file -type SubmoduleEntry struct { - Path string - URL string - Branch string -} - -// ParseGitModules parses the contents of a .gitmodules file. -func ParseGitModules(content []byte) (map[string]*SubmoduleEntry, error) { - dec := config.NewDecoder(bytes.NewBuffer(content)) - cfg := &config.Config{} - - if err := dec.Decode(cfg); err != nil { - return nil, err - } - - result := map[string]*SubmoduleEntry{} - for _, s := range cfg.Sections { - if s.Name != "submodule" { - continue - } - - for _, ss := range s.Subsections { - name := ss.Name - e := &SubmoduleEntry{} - for _, o := range ss.Options { - switch o.Key { - case "branch": - e.Branch = o.Value - case "path": - e.Path = o.Value - case "url": - e.URL = o.Value - } - } - - result[name] = e - } - } - - return result, nil -}
diff --git a/gitindex/submodule_test.go b/gitindex/submodule_test.go deleted file mode 100644 index ca7fef3..0000000 --- a/gitindex/submodule_test.go +++ /dev/null
@@ -1,43 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "reflect" - "testing" -) - -func TestParseGitModules(t *testing.T) { - testData := `[submodule "plugins/abc"] - path = plugins/abc - url = ../plugins/abc - branch = .` - - got, err := ParseGitModules([]byte(testData)) - if err != nil { - t.Fatalf("ParseGitModules: %T", err) - } - - want := map[string]*SubmoduleEntry{ - "plugins/abc": { - Path: "plugins/abc", - URL: "../plugins/abc", - Branch: ".", - }, - } - if !reflect.DeepEqual(got, want) { - t.Fatalf("got %v, want %v", got, want) - } -}
diff --git a/gitindex/tree.go b/gitindex/tree.go deleted file mode 100644 index 145cfd6..0000000 --- a/gitindex/tree.go +++ /dev/null
@@ -1,217 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "fmt" - "io" - "log" - "net/url" - "path" - "path/filepath" - "strings" - - "github.com/go-git/go-git/v5/plumbing" - "github.com/go-git/go-git/v5/plumbing/filemode" - "github.com/go-git/go-git/v5/plumbing/object" - - git "github.com/go-git/go-git/v5" -) - -// repoWalker walks a tree, recursing into submodules. -type repoWalker struct { - repo *git.Repository - - repoURL *url.URL - tree map[fileKey]BlobLocation - - // Path => SubmoduleEntry - submodules map[string]*SubmoduleEntry - - // Path => commit SHA1 - subRepoVersions map[string]plumbing.Hash - repoCache *RepoCache -} - -// subURL returns the URL for a submodule. -func (w *repoWalker) subURL(relURL string) (*url.URL, error) { - if w.repoURL == nil { - return nil, fmt.Errorf("no URL for base repo") - } - if strings.HasPrefix(relURL, "../") { - u := *w.repoURL - u.Path = path.Join(u.Path, relURL) - return &u, nil - } - - return url.Parse(relURL) -} - -// newRepoWalker creates a new repoWalker. -func newRepoWalker(r *git.Repository, repoURL string, repoCache *RepoCache) *repoWalker { - u, _ := url.Parse(repoURL) - return &repoWalker{ - repo: r, - repoURL: u, - tree: map[fileKey]BlobLocation{}, - repoCache: repoCache, - subRepoVersions: map[string]plumbing.Hash{}, - } -} - -// parseModuleMap initializes rw.submodules. 
-func (rw *repoWalker) parseModuleMap(t *object.Tree) error { - modEntry, _ := t.File(".gitmodules") - if modEntry != nil { - c, err := blobContents(&modEntry.Blob) - if err != nil { - return err - } - mods, err := ParseGitModules(c) - if err != nil { - return err - } - rw.submodules = map[string]*SubmoduleEntry{} - for _, entry := range mods { - rw.submodules[entry.Path] = entry - } - } - return nil -} - -// TreeToFiles fetches the blob SHA1s for a tree. If repoCache is -// non-nil, recurse into submodules. In addition, it returns a mapping -// that indicates in which repo each SHA1 can be found. -func TreeToFiles(r *git.Repository, t *object.Tree, - repoURL string, repoCache *RepoCache) (map[fileKey]BlobLocation, map[string]plumbing.Hash, error) { - rw := newRepoWalker(r, repoURL, repoCache) - - if err := rw.parseModuleMap(t); err != nil { - return nil, nil, err - } - - tw := object.NewTreeWalker(t, true, make(map[plumbing.Hash]bool)) - defer tw.Close() - for { - name, entry, err := tw.Next() - if err == io.EOF { - break - } - if err := rw.handleEntry(name, &entry); err != nil { - return nil, nil, err - } - } - return rw.tree, rw.subRepoVersions, nil -} - -func (r *repoWalker) tryHandleSubmodule(p string, id *plumbing.Hash) error { - if err := r.handleSubmodule(p, id); err != nil { - log.Printf("submodule %s: ignoring error %v", p, err) - } - return nil -} - -func (r *repoWalker) handleSubmodule(p string, id *plumbing.Hash) error { - submod := r.submodules[p] - if submod == nil { - return fmt.Errorf("no entry for submodule path %q", r.repoURL) - } - - subURL, err := r.subURL(submod.URL) - if err != nil { - return err - } - - subRepo, err := r.repoCache.Open(subURL) - if err != nil { - return err - } - - obj, err := subRepo.CommitObject(*id) - if err != nil { - return err - } - tree, err := subRepo.TreeObject(obj.TreeHash) - if err != nil { - return err - } - - r.subRepoVersions[p] = *id - - subTree, subVersions, err := TreeToFiles(subRepo, tree, subURL.String(), 
r.repoCache) - if err != nil { - return err - } - for k, repo := range subTree { - r.tree[fileKey{ - SubRepoPath: filepath.Join(p, k.SubRepoPath), - Path: k.Path, - ID: k.ID, - }] = repo - } - for k, v := range subVersions { - r.subRepoVersions[filepath.Join(p, k)] = v - } - return nil -} - -func (r *repoWalker) handleEntry(p string, e *object.TreeEntry) error { - if e.Mode == filemode.Submodule && r.repoCache != nil { - if err := r.tryHandleSubmodule(p, &e.Hash); err != nil { - return fmt.Errorf("submodule %s: %v", p, err) - } - } - - switch e.Mode { - case filemode.Regular, filemode.Executable: - default: - return nil - } - - r.tree[fileKey{ - Path: p, - ID: e.Hash, - }] = BlobLocation{ - Repo: r.repo, - URL: r.repoURL, - } - return nil -} - -// fileKey describes a blob at a location in the final tree. We also -// record the subrepository from where it came. -type fileKey struct { - SubRepoPath string - Path string - ID plumbing.Hash -} - -func (k *fileKey) FullPath() string { - return filepath.Join(k.SubRepoPath, k.Path) -} - -// BlobLocation holds data where a blob can be found. -type BlobLocation struct { - Repo *git.Repository - URL *url.URL -} - -func (l *BlobLocation) Blob(id *plumbing.Hash) ([]byte, error) { - blob, err := l.Repo.BlobObject(*id) - if err != nil { - return nil, err - } - return blobContents(blob) -}
diff --git a/gitindex/tree_test.go b/gitindex/tree_test.go deleted file mode 100644 index eff31b3..0000000 --- a/gitindex/tree_test.go +++ /dev/null
@@ -1,486 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitindex - -import ( - "bytes" - "context" - "fmt" - "io/ioutil" - "net/url" - "os" - "os/exec" - "path/filepath" - "reflect" - "sort" - "testing" - - "github.com/google/zoekt" - "github.com/google/zoekt/build" - "github.com/google/zoekt/query" - "github.com/google/zoekt/shards" -) - -func createSubmoduleRepo(dir string) error { - if err := os.MkdirAll(dir, 0o755); err != nil { - return err - } - script := `mkdir adir bdir -cd adir -git init -b master -mkdir subdir -echo acont > afile -echo sub-cont > subdir/sub-file -git add afile subdir/sub-file -git config user.email "you@example.com" -git config user.name "Your Name" -git commit -am amsg - -cd .. -cd bdir -git init -b master -echo bcont > bfile -git add bfile -git config user.email "you@example.com" -git config user.name "Your Name" -git commit -am bmsg - -cd ../adir -git submodule add --name bname -- ../bdir bname -git commit -am bmodmsg -cat .gitmodules -cd .. 
-mkdir gerrit.googlesource.com -git clone --bare adir gerrit.googlesource.com/adir.git -git clone --bare bdir gerrit.googlesource.com/bdir.git - -mkdir gerrit.googlesource.com/bogus.git -mkdir gerrit.googlesource.com/sub -git clone --bare bdir gerrit.googlesource.com/sub/bdir.git - -cat << EOF > gerrit.googlesource.com/adir.git/config -[core] - repositoryformatversion = 0 - filemode = true - bare = true -[remote "origin"] - url = http://gerrit.googlesource.com/adir -[branch "master"] - remote = origin - merge = refs/heads/master -EOF -` - cmd := exec.Command("/bin/sh", "-euxc", script) - cmd.Dir = dir - if out, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("execution error: %v, output %s", err, out) - } - return nil -} - -func TestFindGitRepos(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - if err := createSubmoduleRepo(dir); err != nil { - t.Error("createSubmoduleRepo", err) - } - repos, err := FindGitRepos(dir) - if err != nil { - t.Error("FindGitRepos", err) - } - - got := map[string]bool{} - for _, r := range repos { - p, err := filepath.Rel(dir, r) - if err != nil { - t.Fatalf("Relative: %v", err) - } - - got[p] = true - } - - want := map[string]bool{ - "gerrit.googlesource.com/bdir.git": true, - "gerrit.googlesource.com/sub/bdir.git": true, - "adir/.git": true, - "bdir/.git": true, - "gerrit.googlesource.com/adir.git": true, - } - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v want %v", got, want) - } -} - -func TestTreeToFiles(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - if err := createSubmoduleRepo(dir); err != nil { - t.Fatalf("TempDir: %v", err) - } - - cache := NewRepoCache(dir) - - aURL, _ := url.Parse("http://gerrit.googlesource.com/adir") - repo, err := cache.Open(aURL) - if err != nil { - t.Fatalf("Open: %v", err) - } - - headRef, err := repo.Head() - 
if err != nil { - t.Fatalf("HEAD tree: %v", err) - } - commit, err := repo.CommitObject(headRef.Hash()) - if err != nil { - t.Fatalf("commit obj HEAD: %v", err) - } - - tree, err := repo.TreeObject(commit.TreeHash) - if err != nil { - t.Fatalf("AsTree: %v", err) - } - - files, versions, err := TreeToFiles(repo, tree, aURL.String(), cache) - if err != nil { - t.Fatalf("TreeToFiles: %v", err) - } - - bnameHash := versions["bname"] - if entry, err := tree.FindEntry("bname"); err != nil { - t.Fatalf("FindEntry %v", err) - } else if !bytes.Equal(bnameHash[:], entry.Hash[:]) { - t.Fatalf("got 'bname' versions %v, want %v", bnameHash, entry.Hash) - } - - var paths []string - for k := range files { - paths = append(paths, k.FullPath()) - } - sort.Strings(paths) - - want := []string{".gitmodules", "afile", "bname/bfile", "subdir/sub-file"} - if !reflect.DeepEqual(paths, want) { - t.Errorf("got %v, want %v", paths, want) - } -} - -func TestSubmoduleIndex(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - if err := createSubmoduleRepo(dir); err != nil { - t.Fatalf("createSubmoduleRepo: %v", err) - } - - indexDir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(indexDir) - - buildOpts := build.Options{ - IndexDir: indexDir, - } - opts := Options{ - RepoDir: filepath.Join(dir, "gerrit.googlesource.com", "adir.git"), - BuildOptions: buildOpts, - BranchPrefix: "refs/heads/", - Branches: []string{"master"}, - Submodules: true, - Incremental: true, - RepoCacheDir: dir, - } - if err := IndexGitRepo(opts); err != nil { - t.Fatalf("IndexGitRepo: %v", err) - } - - searcher, err := shards.NewDirectorySearcher(indexDir) - if err != nil { - t.Fatal("NewDirectorySearcher", err) - } - defer searcher.Close() - - results, err := searcher.Search(context.Background(), - &query.Substring{Pattern: "bcont"}, - &zoekt.SearchOptions{}) - if err != nil { - t.Fatal("Search", 
err) - } - - if len(results.Files) != 1 { - t.Fatalf("got search result %v, want 1 file", results.Files) - } - - file := results.Files[0] - if got, want := file.SubRepositoryName, "gerrit.googlesource.com/bdir"; got != want { - t.Errorf("got subrepo name %q, want %q", got, want) - } - if got, want := file.SubRepositoryPath, "bname"; got != want { - t.Errorf("got subrepo path %q, want %q", got, want) - } - - subVersion := file.Version - if len(subVersion) != 40 { - t.Fatalf("got %q, want hex sha1", subVersion) - } - - if results, err := searcher.Search(context.Background(), &query.Substring{Pattern: "acont"}, &zoekt.SearchOptions{}); err != nil { - t.Fatalf("Search('acont'): %v", err) - } else if len(results.Files) != 1 { - t.Errorf("got %v, want 1 result", results.Files) - } else if f := results.Files[0]; f.Version == subVersion { - t.Errorf("version in super repo matched version is subrepo.") - } -} - -func TestAllowMissingBranch(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - if err := createSubmoduleRepo(dir); err != nil { - t.Fatalf("createSubmoduleRepo: %v", err) - } - - indexDir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(indexDir) - - buildOpts := build.Options{ - IndexDir: indexDir, - } - - opts := Options{ - RepoDir: filepath.Join(dir, "gerrit.googlesource.com", "adir.git"), - BuildOptions: buildOpts, - BranchPrefix: "refs/heads/", - Branches: []string{"master", "nonexist"}, - Submodules: true, - Incremental: true, - RepoCacheDir: dir, - } - if err := IndexGitRepo(opts); err == nil { - t.Fatalf("IndexGitRepo(nonexist) succeeded") - } - opts.AllowMissingBranch = true - if err := IndexGitRepo(opts); err != nil { - t.Fatalf("IndexGitRepo(nonexist, allow): %v", err) - } -} - -func createMultibranchRepo(dir string) error { - if err := os.MkdirAll(dir, 0o755); err != nil { - return err - } - script := `mkdir repo -cd repo -git init 
-b master -mkdir subdir -echo acont > afile -echo sub-cont > subdir/sub-file -git add afile subdir/sub-file -git config user.email "you@example.com" -git config user.name "Your Name" -git commit -am amsg - -git branch branchdir/a - -echo acont >> afile -git add afile subdir/sub-file -git commit -am amsg - -git branch branchdir/b - -git branch c - -git update-ref refs/meta/config HEAD -` - cmd := exec.Command("/bin/sh", "-euxc", script) - cmd.Dir = dir - if out, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("execution error: %v, output %s", err, out) - } - return nil -} - -func TestBranchWildcard(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - if err := createMultibranchRepo(dir); err != nil { - t.Fatalf("createMultibranchRepo: %v", err) - } - - indexDir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(indexDir) - - buildOpts := build.Options{ - IndexDir: indexDir, - RepositoryDescription: zoekt.Repository{ - Name: "repo", - }, - } - buildOpts.SetDefaults() - - opts := Options{ - RepoDir: filepath.Join(dir + "/repo"), - BuildOptions: buildOpts, - BranchPrefix: "refs/heads", - Branches: []string{"branchdir/*"}, - Submodules: true, - Incremental: true, - } - if err := IndexGitRepo(opts); err != nil { - t.Fatalf("IndexGitRepo: %v", err) - } - - searcher, err := shards.NewDirectorySearcher(indexDir) - if err != nil { - t.Fatal("NewDirectorySearcher", err) - } - defer searcher.Close() - - if rlist, err := searcher.List(context.Background(), &query.Repo{Pattern: ""}); err != nil { - t.Fatalf("List(): %v", err) - } else if len(rlist.Repos) != 1 { - t.Errorf("got %v, want 1 result", rlist.Repos) - } else if repo := rlist.Repos[0]; len(repo.Repository.Branches) != 2 { - t.Errorf("got branches %v, want 2", repo.Repository.Branches) - } else if repo := rlist.Repos[0]; repo.Stats.Documents != 3 { - t.Errorf("got document count %d, want 
3", repo.Stats.Documents) - } -} - -func TestSkipSubmodules(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - if err := createSubmoduleRepo(dir); err != nil { - t.Fatalf("createMultibranchRepo: %v", err) - } - - indexDir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(indexDir) - - buildOpts := build.Options{ - IndexDir: indexDir, - RepositoryDescription: zoekt.Repository{ - Name: "gerrit.googlesource.com/adir", - }, - } - if err := os.Rename(dir+"/gerrit.googlesource.com/bdir.git", - dir+"/gerrit.googlesource.com/notexist.git"); err != nil { - t.Fatalf("Rename: %v", err) - } - - opts := Options{ - RepoDir: filepath.Join(dir, "gerrit.googlesource.com", "adir.git"), - BuildOptions: buildOpts, - BranchPrefix: "refs/heads", - Branches: []string{"master"}, - Submodules: false, - } - if err := IndexGitRepo(opts); err != nil { - t.Fatalf("IndexGitRepo: %v", err) - } -} - -func TestFullAndShortRefNames(t *testing.T) { - dir, err := ioutil.TempDir("", "git") - if err != nil { - t.Fatalf("TempDir: %v", err) - } - defer os.RemoveAll(dir) - - if err := createMultibranchRepo(dir); err != nil { - t.Fatalf("createMultibranchRepo: %v", err) - } - - indexDir, err := ioutil.TempDir("", "index-") - if err != nil { - t.Fatal(err) - } - os.RemoveAll(indexDir) - - buildOpts := build.Options{ - IndexDir: indexDir, - RepositoryDescription: zoekt.Repository{ - Name: "repo", - }, - } - buildOpts.SetDefaults() - - opts := Options{ - RepoDir: filepath.Join(dir + "/repo"), - BuildOptions: buildOpts, - BranchPrefix: "refs/heads", - Branches: []string{"refs/heads/master", "branchdir/a", "refs/meta/config"}, - Submodules: false, - Incremental: false, - AllowMissingBranch: false, - } - if err := IndexGitRepo(opts); err != nil { - t.Fatalf("IndexGitRepo: %v", err) - } - - searcher, err := shards.NewDirectorySearcher(indexDir) - if err != nil { - 
t.Fatal("NewDirectorySearcher", err) - } - defer searcher.Close() - - if rlist, err := searcher.List(context.Background(), &query.Repo{Pattern: ""}); err != nil { - t.Fatalf("List(): %v", err) - } else if len(rlist.Repos) != 1 { - t.Errorf("got %v, want 1 result", rlist.Repos) - } else if repo := rlist.Repos[0]; len(repo.Repository.Branches) != 3 { - t.Errorf("got branches %v, want 3", repo.Repository.Branches) - } -} - -func TestUniq(t *testing.T) { - in := []string{"a", "b", "b", "c", "c"} - want := []string{"a", "b", "c"} - got := uniq(in) - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v, want %v", got, want) - } -}
diff --git a/go.mod b/go.mod deleted file mode 100644 index 90ae146..0000000 --- a/go.mod +++ /dev/null
@@ -1,27 +0,0 @@ -module github.com/google/zoekt - -go 1.13 - -require ( - github.com/andygrunwald/go-gerrit v0.0.0-20191101112536-3f5e365ccf57 - github.com/bmatcuk/doublestar v1.3.4 - github.com/fsnotify/fsnotify v1.4.7 - github.com/gfleury/go-bitbucket-v1 v0.0.0-20200104105711-ddbafbb02522 - github.com/go-git/go-git/v5 v5.0.0 - github.com/golang/protobuf v1.3.3 // indirect - github.com/google/go-cmp v0.5.5 - github.com/google/go-github/v27 v27.0.6 - github.com/google/slothfs v0.0.0-20190417171004-6b42407d9230 - github.com/kylelemons/godebug v1.1.0 - github.com/mitchellh/mapstructure v1.1.2 // indirect - github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_golang v1.5.1 - github.com/xanzy/go-gitlab v0.25.0 - go.uber.org/automaxprocs v1.3.0 - golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6 - golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d - golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e - google.golang.org/appengine v1.6.5 // indirect - gopkg.in/yaml.v2 v2.2.8 // indirect - humungus.tedunangst.com/r/gerc v0.1.2 -)
diff --git a/go.sum b/go.sum deleted file mode 100644 index 4b1a482..0000000 --- a/go.sum +++ /dev/null
@@ -1,212 +0,0 @@ -cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7 h1:uSoVVbwJiQipAclBbw+8quDsfcvFjOpI5iCf4p/cqCs= -github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7/go.mod h1:6zEj6s6u/ghQa61ZWa/C2Aw3RkjiTBOix7dkqa1VLIs= -github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= -github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= -github.com/andygrunwald/go-gerrit v0.0.0-20191101112536-3f5e365ccf57 h1:wtSQ14h8qAUezER6QPfYmCh5+W5Ly1lVruhm/QeOVUE= -github.com/andygrunwald/go-gerrit v0.0.0-20191101112536-3f5e365ccf57/go.mod h1:0iuRQp6WJ44ts+iihy5E/WlPqfg5RNeQxOmzRkxCdtk= -github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 h1:kFOfPq6dUM1hTo4JG6LR5AXSUEsOjtdm0kw0FtQtMJA= -github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= -github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= -github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= -github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= -github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= -github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= -github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 
-github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0= -github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE= -github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= -github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emirpasic/gods v1.12.0 h1:QAUIPSaCu4G+POclxeqb3F+WPpdKqFGlw36+yOzGlrg= -github.com/emirpasic/gods v1.12.0/go.mod h1:YfzfFFoVP/catgzJb4IKIqXjX78Ha8FMSDh3ymbK86o= -github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ= -github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= -github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= -github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/gfleury/go-bitbucket-v1 v0.0.0-20200104105711-ddbafbb02522 h1:hrfa10FjSX7jqupn/o8vyEsJ807SyKvuf+iFiEytTN8= -github.com/gfleury/go-bitbucket-v1 v0.0.0-20200104105711-ddbafbb02522/go.mod h1:Se0U4YUmRkRAOh8kD7KXz+3VCUBmvTFcdWP2QYYRjjc= -github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= -github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= -github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4= -github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E= -github.com/go-git/go-billy/v5 v5.0.0 h1:7NQHvd9FVid8VL4qVUMm8XifBK+2xCoZ2lSk0agRrHM= -github.com/go-git/go-billy/v5 v5.0.0/go.mod 
h1:pmpqyWchKfYfrkb/UVH4otLvyi/5gJlGI4Hb3ZqZ3W0= -github.com/go-git/go-git-fixtures/v4 v4.0.1 h1:q+IFMfLx200Q3scvt2hN79JsEzy4AmBTp/pqnefH+Bc= -github.com/go-git/go-git-fixtures/v4 v4.0.1/go.mod h1:m+ICp2rF3jDhFgEZ/8yziagdT1C+ZpZcrJjappBCDSw= -github.com/go-git/go-git/v5 v5.0.0 h1:k5RWPm4iJwYtfWoxIJy4wJX9ON7ihPeZZYC1fLYDnpg= -github.com/go-git/go-git/v5 v5.0.0/go.mod h1:oYD8y9kWsGINPFJoLdaScGCN6dlKg23blmClfZwtUVA= -github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= -github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= -github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= -github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= -github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.3 h1:gyjaxf+svBWX08ZjK86iN9geUJF0H6gp2IRKX6Nf6/I= -github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-github/v27 v27.0.6 h1:oiOZuBmGHvrGM1X9uNUAUlLgp5r1UUO/M/KnbHnLRlQ= -github.com/google/go-github/v27 v27.0.6/go.mod 
h1:/0Gr8pJ55COkmv+S/yPKCczSkUPIM/LnFyubufRNIS0= -github.com/google/go-querystring v1.0.0 h1:Xkwi/a1rcvNg1PPYe5vI8GbeBY/jrVuDX5ASuANWTrk= -github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= -github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/slothfs v0.0.0-20190417171004-6b42407d9230 h1:iBLrJ79cF90CZmpskySqhPvzrWr9njBYEsOZubXLZlc= -github.com/google/slothfs v0.0.0-20190417171004-6b42407d9230/go.mod h1:kzvK/MFjZSNdFgc1tCZML3E1nVvnB4/npSKEuvMoECU= -github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A= -github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= -github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= -github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= -github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= -github.com/kevinburke/ssh_config v0.0.0-20190725054713-01f96b0aa0cd h1:Coekwdh0v2wtGp9Gmz1Ze3eVRAWJMLokvN3QjdzCHLY= -github.com/kevinburke/ssh_config v0.0.0-20190725054713-01f96b0aa0cd/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod 
h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= -github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= -github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= -github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= -github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= -github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE= -github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/errors v0.9.1 
h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= -github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= -github.com/prometheus/client_golang v1.5.1 h1:bdHYieyGlH+6OLEk2YQha8THib30KP0/yD0YH9m6xcA= -github.com/prometheus/client_golang v1.5.1/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= -github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= -github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M= -github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= -github.com/prometheus/common v0.9.1 h1:KOMtN28tlbam3/7ZKEYKHhKoJZYYj3gMH4uc62x7X7U= -github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4= -github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= -github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= -github.com/prometheus/procfs v0.0.8 h1:+fpWZdT24pJBiqJdAwYBjPSk+5YmQzYNPYzQsdzLkt8= -github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= -github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= -github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= 
-github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= -github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= -github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/xanzy/go-gitlab v0.25.0 h1:G5aTZeqZd66Q6qMVieBfmHBsPpF0jY92zCLAMpULe3I= -github.com/xanzy/go-gitlab v0.25.0/go.mod h1:t4Bmvnxj7k37S4Y17lfLx+nLqkf/oQwT2HagfWKv5Og= -github.com/xanzy/ssh-agent v0.2.1 h1:TCbipTQL2JiiCprBWx9frJ2eJlCYT00NmctrHxVAr70= -github.com/xanzy/ssh-agent v0.2.1/go.mod h1:mLlQY/MoOhWBj+gOGMQkOeiEvkx+8pJSI+0Bx9h2kr4= -go.uber.org/automaxprocs v1.3.0 h1:II28aZoGdaglS5vVNnspf28lnZpXScxtIozx1lAjdb0= -go.uber.org/automaxprocs v1.3.0/go.mod h1:9CWT6lKIep8U41DDaPiH6eFscnTyjfTANNQNx6LrIcA= -golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto v0.0.0-20190219172222-a4c6cb3142f2/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200302210943-78000ba7a073 h1:xMPOj6Pz6UipU1wXLkrtqpHbR0AVFnyPEQq/wRWz9lM= -golang.org/x/crypto v0.0.0-20200302210943-78000ba7a073/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/lint 
v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= -golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= -golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181108082009-03003ca0c849/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= -golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6 h1:0PC75Fz/kyMGhL0e1QnypqK2kQMqKt9csD1GnMJR+Zk= -golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= -golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.0.0-20181106182150-f42d05182288/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d h1:TzXSXBo42m9gQenoE3b9BGiEpg5IG2JkU5FkPIawgtw= -golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e h1:vcxGaoTs7kV8m5Np9uUNQin4BrLOthgV7252N8V+FwY= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190221075227-b4e8571b14e0/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da h1:b3NXsE2LusjYGGjL5bxEVZZORm/YEFFrWFjR8eFrw/c= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod 
h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= -golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= -google.golang.org/appengine v1.3.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.6.5 h1:tycE03LOZYQNhDpS27tcQdAzLCVMaj7QT2SXxebnpCM= -google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/errgo.v2 v2.1.0/go.mod 
h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= -gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= -gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= -gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= -humungus.tedunangst.com/r/gerc v0.1.2 h1:eW5yTbRLFFWSu/RpTdkLxaVPlNlFUzxc02VQsftzg64= -humungus.tedunangst.com/r/gerc v0.1.2/go.mod h1:tuYnDVV3WEGI9NEX5/3Iz5xVNimFzN4+83qZvFf/GUg=
diff --git a/hititer.go b/hititer.go deleted file mode 100644 index 356c505..0000000 --- a/hititer.go +++ /dev/null
@@ -1,259 +0,0 @@ -// Copyright 2018 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "encoding/binary" - "fmt" -) - -// hitIterator finds potential search matches, measured in offsets of -// the concatenation of all documents. -type hitIterator interface { - // Return the first hit, or maxUInt32 if none. - first() uint32 - - // Skip until past limit. The argument maxUInt32 should be - // treated specially. - next(limit uint32) - - // Return how many bytes were read. - updateStats(s *Stats) -} - -// distanceHitIterator looks for hits at a fixed distance apart. 
-type distanceHitIterator struct { - started bool - distance uint32 - i1 hitIterator - i2 hitIterator -} - -func (i *distanceHitIterator) String() string { - return fmt.Sprintf("dist(%d, %v, %v)", i.distance, i.i1, i.i2) -} - -func (i *distanceHitIterator) findNext() { - for { - var p1, p2 uint32 - p1 = i.i1.first() - p2 = i.i2.first() - if p1 == maxUInt32 || p2 == maxUInt32 { - i.i1.next(maxUInt32) - break - } - - if p1+i.distance < p2 { - i.i1.next(p2 - i.distance - 1) - } else if p1+i.distance > p2 { - i.i2.next(p1 + i.distance - 1) - } else { - break - } - } -} - -func (i *distanceHitIterator) first() uint32 { - if !i.started { - i.findNext() - i.started = true - } - return i.i1.first() -} - -func (i *distanceHitIterator) updateStats(s *Stats) { - i.i1.updateStats(s) - i.i2.updateStats(s) -} - -func (i *distanceHitIterator) next(limit uint32) { - i.i1.next(limit) - l2 := limit + i.distance - - if l2 < limit { // overflow. - l2 = maxUInt32 - } - i.i2.next(l2) - i.findNext() -} - -func (d *indexData) newDistanceTrigramIter(ng1, ng2 ngram, dist uint32, caseSensitive, fileName bool) (hitIterator, error) { - if dist == 0 { - return nil, fmt.Errorf("d == 0") - } - - i1, err := d.trigramHitIterator(ng1, caseSensitive, fileName) - if err != nil { - return nil, err - } - i2, err := d.trigramHitIterator(ng2, caseSensitive, fileName) - if err != nil { - return nil, err - } - return &distanceHitIterator{ - i1: i1, - i2: i2, - distance: dist, - }, nil -} - -func (d *indexData) trigramHitIterator(ng ngram, caseSensitive, fileName bool) (hitIterator, error) { - variants := []ngram{ng} - if !caseSensitive { - variants = generateCaseNgrams(ng) - } - - iters := make([]hitIterator, 0, len(variants)) - for _, v := range variants { - if fileName { - blob := d.fileNameNgrams[v] - if len(blob) > 0 { - iters = append(iters, &inMemoryIterator{ - d.fileNameNgrams[v], - v, - }) - } - continue - } - - sec := d.ngrams[v] - blob, err := d.readSectionBlob(sec) - if err != nil { - return nil, 
err - } - if len(blob) > 0 { - iters = append(iters, newCompressedPostingIterator(blob, v)) - } - } - - if len(iters) == 1 { - return iters[0], nil - } - return &mergingIterator{ - iters: iters, - }, nil -} - -// inMemoryIterator is hitIterator that goes over an in-memory uint32 posting list. -type inMemoryIterator struct { - postings []uint32 - what ngram -} - -func (i *inMemoryIterator) String() string { - return fmt.Sprintf("mem(%s):%v", i.what, i.postings) -} - -func (i *inMemoryIterator) first() uint32 { - if len(i.postings) > 0 { - return i.postings[0] - } - return maxUInt32 -} - -func (i *inMemoryIterator) updateStats(s *Stats) { -} - -func (i *inMemoryIterator) next(limit uint32) { - if limit == maxUInt32 { - i.postings = nil - } - - for len(i.postings) > 0 && i.postings[0] <= limit { - i.postings = i.postings[1:] - } -} - -// compressedPostingIterator goes over a delta varint encoded posting -// list. -type compressedPostingIterator struct { - blob, orig []byte - _first uint32 - what ngram -} - -func newCompressedPostingIterator(b []byte, w ngram) *compressedPostingIterator { - d, sz := binary.Uvarint(b) - return &compressedPostingIterator{ - _first: uint32(d), - blob: b[sz:], - orig: b, - what: w, - } -} - -func (i *compressedPostingIterator) String() string { - return fmt.Sprintf("compressed(%s, %d, [%d bytes])", i.what, i._first, len(i.blob)) -} - -func (i *compressedPostingIterator) first() uint32 { - return i._first -} - -func (i *compressedPostingIterator) next(limit uint32) { - if limit == maxUInt32 { - i.blob = nil - i._first = maxUInt32 - return - } - - for i._first <= limit && len(i.blob) > 0 { - delta, sz := binary.Uvarint(i.blob) - i._first += uint32(delta) - i.blob = i.blob[sz:] - } - - if i._first <= limit && len(i.blob) == 0 { - i._first = maxUInt32 - } -} - -func (i *compressedPostingIterator) updateStats(s *Stats) { - s.IndexBytesLoaded += int64(len(i.orig) - len(i.blob)) -} - -// mergingIterator forms the merge of a set of hitIterators, 
to -// implement an OR operation at the hit level. -type mergingIterator struct { - iters []hitIterator -} - -func (i *mergingIterator) String() string { - return fmt.Sprintf("merge:%v", i.iters) -} - -func (i *mergingIterator) updateStats(s *Stats) { - for _, j := range i.iters { - j.updateStats(s) - } -} - -func (i *mergingIterator) first() uint32 { - r := uint32(maxUInt32) - for _, j := range i.iters { - f := j.first() - if f < r { - r = f - } - } - - return r -} - -func (i *mergingIterator) next(limit uint32) { - for _, j := range i.iters { - j.next(limit) - } -}
diff --git a/hititer_test.go b/hititer_test.go deleted file mode 100644 index 0c276c1..0000000 --- a/hititer_test.go +++ /dev/null
@@ -1,111 +0,0 @@ -// Copyright 2019 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "fmt" - "math/rand" - "reflect" - "sort" - "testing" - "testing/quick" - - "github.com/google/go-cmp/cmp" -) - -func TestCompressedPostingIterator_limit(t *testing.T) { - f := func(nums, limits []uint32) bool { - if len(nums) == 0 || len(limits) == 0 { - return true - } - - nums = sortedUnique(nums) - sort.Slice(limits, func(i, j int) bool { return limits[i] < limits[j] }) - - want := doHitIterator(&inMemoryIterator{postings: nums}, limits) - - it := newCompressedPostingIterator(toDeltas(nums), stringToNGram("abc")) - got := doHitIterator(it, limits) - if !reflect.DeepEqual(want, got) { - t.Log(cmp.Diff(want, got)) - return false - } - return true - } - if err := quick.Check(f, nil); err != nil { - t.Error(err) - } -} - -func doHitIterator(it hitIterator, limits []uint32) []uint32 { - var nums []uint32 - for _, limit := range limits { - it.next(limit) - nums = append(nums, it.first()) - } - return nums -} - -func BenchmarkCompressedPostingIterator(b *testing.B) { - cases := []struct{ size, limitSize int }{ - {100, 50}, - {10000, 100}, - {10000, 1000}, - {10000, 10000}, - {100000, 100}, - {100000, 1000}, - {100000, 10000}, - {100000, 100000}, - } - for _, tt := range cases { - b.Run(fmt.Sprintf("%d_%d", tt.size, tt.limitSize), func(b *testing.B) { - benchmarkCompressedPostingIterator(b, tt.size, tt.limitSize) - 
}) - } -} - -func benchmarkCompressedPostingIterator(b *testing.B, size, limitsSize int) { - nums := genUints32(size) - limits := genUints32(limitsSize) - - nums = sortedUnique(nums) - sort.Slice(limits, func(i, j int) bool { return limits[i] < limits[j] }) - - ng := stringToNGram("abc") - deltas := toDeltas(nums) - - b.ResetTimer() - - for n := 0; n < b.N; n++ { - it := newCompressedPostingIterator(deltas, ng) - for _, limit := range limits { - it.next(limit) - _ = it.first() - } - var s Stats - it.updateStats(&s) - b.SetBytes(s.IndexBytesLoaded) - } -} - -func genUints32(size int) []uint32 { - // Deterministic for benchmarks - r := rand.New(rand.NewSource(int64(size))) - nums := make([]uint32, size) - for i := range nums { - nums[i] = r.Uint32() - } - return nums -}
diff --git a/index_test.go b/index_test.go deleted file mode 100644 index 0f84197..0000000 --- a/index_test.go +++ /dev/null
@@ -1,1938 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "bytes" - "context" - "fmt" - "reflect" - "regexp/syntax" - "strings" - "testing" - - "github.com/kylelemons/godebug/pretty" - - "github.com/google/zoekt/query" -) - -func clearScores(r *SearchResult) { - for i := range r.Files { - r.Files[i].Score = 0.0 - for j := range r.Files[i].LineMatches { - r.Files[i].LineMatches[j].Score = 0.0 - } - r.Files[i].Checksum = nil - r.Files[i].Debug = "" - } -} - -func testIndexBuilder(t *testing.T, repo *Repository, docs ...Document) *IndexBuilder { - b, err := NewIndexBuilder(repo) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - for i, d := range docs { - if err := b.Add(d); err != nil { - t.Fatalf("Add %d: %v", i, err) - } - } - return b -} - -func TestBoundary(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: []byte("x the")}, - Document{Name: "f1", Content: []byte("reader")}) - res := searchForTest(t, b, &query.Substring{Pattern: "there"}) - if len(res.Files) > 0 { - t.Fatalf("got %v, want no matches", res.Files) - } -} - -func TestDocSectionInvalid(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - doc := Document{ - Name: "f1", - Content: []byte("01234567890123"), - Symbols: []DocumentSection{{5, 8}, {7, 9}}, - } - - if err := b.Add(doc); err == nil { - 
t.Errorf("overlapping doc sections should fail") - } - - doc = Document{ - Name: "f1", - Content: []byte("01234567890123"), - Symbols: []DocumentSection{{0, 20}}, - } - - if err := b.Add(doc); err == nil { - t.Errorf("doc sections beyond EOF should fail") - } -} - -func TestBasic(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{ - Name: "f2", - Content: []byte("to carry water in the no later bla"), - // ------------- 0123456789012345678901234567890123456789 - }) - - res := searchForTest(t, b, &query.Substring{ - Pattern: "water", - CaseSensitive: true, - }) - fmatches := res.Files - if len(fmatches) != 1 || len(fmatches[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 matches", fmatches) - } - - got := fmt.Sprintf("%s:%d", fmatches[0].FileName, fmatches[0].LineMatches[0].LineFragments[0].Offset) - want := "f2:9" - if got != want { - t.Errorf("1: got %s, want %s", got, want) - } -} - -func TestEmptyIndex(t *testing.T) { - b := testIndexBuilder(t, nil) - searcher := searcherForTest(t, b) - - var opts SearchOptions - if _, err := searcher.Search(context.Background(), &query.Substring{}, &opts); err != nil { - t.Fatalf("Search: %v", err) - } - - if _, err := searcher.List(context.Background(), &query.Repo{}); err != nil { - t.Fatalf("List: %v", err) - } - - if _, err := searcher.Search(context.Background(), &query.Substring{Pattern: "java", FileName: true}, &opts); err != nil { - t.Fatalf("Search: %v", err) - } -} - -type memSeeker struct { - data []byte -} - -func (s *memSeeker) Name() string { - return "memseeker" -} - -func (s *memSeeker) Close() {} -func (s *memSeeker) Read(off, sz uint32) ([]byte, error) { - return s.data[off : off+sz], nil -} - -func (s *memSeeker) Size() (uint32, error) { - return uint32(len(s.data)), nil -} - -func TestNewlines(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{Name: "filename", Content: []byte("line1\nline2\nbla")}) - - sres := searchForTest(t, b, &query.Substring{Pattern: "ne2"}) - - matches := sres.Files - 
want := []FileMatch{{ - FileName: "filename", - LineMatches: []LineMatch{ - { - LineFragments: []LineFragmentMatch{{ - Offset: 8, - LineOffset: 2, - MatchLength: 3, - }}, - Line: []byte("line2"), - LineStart: 6, - LineEnd: 11, - LineNumber: 2, - }, - }, - }} - - if !reflect.DeepEqual(matches, want) { - t.Errorf("got %v, want %v", matches, want) - } -} - -// A result spanning multiple lines should have LineMatches that only cover -// single lines. -func TestQueryNewlines(t *testing.T) { - text := "line1\nline2\nbla" - b := testIndexBuilder(t, nil, - Document{Name: "filename", Content: []byte(text)}) - sres := searchForTest(t, b, &query.Substring{Pattern: "ine2\nbla"}) - matches := sres.Files - if len(matches) != 1 { - t.Fatalf("got %d file matches, want exactly one", len(matches)) - } - m := matches[0] - if len(m.LineMatches) != 2 { - t.Fatalf("got %d line matches, want exactly two", len(m.LineMatches)) - } -} - -func searchForTest(t *testing.T, b *IndexBuilder, q query.Q, o ...SearchOptions) *SearchResult { - searcher := searcherForTest(t, b) - var opts SearchOptions - if len(o) > 0 { - opts = o[0] - } - res, err := searcher.Search(context.Background(), q, &opts) - if err != nil { - t.Fatalf("Search(%s): %v", q, err) - } - clearScores(res) - return res -} - -func searcherForTest(t *testing.T, b *IndexBuilder) Searcher { - var buf bytes.Buffer - b.Write(&buf) - f := &memSeeker{buf.Bytes()} - - searcher, err := NewSearcher(f) - if err != nil { - t.Fatalf("NewSearcher: %v", err) - } - - return searcher -} - -func TestFileBasedSearch(t *testing.T) { - c1 := []byte("I love bananas without skin") - // -----------0123456789012345678901234567890123456789 - c2 := []byte("In Dutch, ananas means pineapple") - // -----------0123456789012345678901234567890123456789 - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: c1}, - Document{Name: "f2", Content: c2}, - ) - sres := searchForTest(t, b, &query.Substring{ - CaseSensitive: false, - Pattern: "ananas", - }) - - 
matches := sres.Files - if len(matches) != 2 { - t.Fatalf("got %v, want 2 matches", matches) - } - if matches[0].FileName != "f2" || matches[1].FileName != "f1" { - t.Fatalf("got %v, want matches {f1,f2}", matches) - } - if matches[0].LineMatches[0].LineFragments[0].Offset != 10 || matches[1].LineMatches[0].LineFragments[0].Offset != 8 { - t.Fatalf("got %#v, want offsets 10,8", matches) - } -} - -func TestCaseFold(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: []byte("I love BaNaNAS.")}, - // ---------- 012345678901234567890123456 - ) - sres := searchForTest(t, b, &query.Substring{ - Pattern: "bananas", - CaseSensitive: true, - }) - matches := sres.Files - if len(matches) != 0 { - t.Errorf("foldcase: got %#v, want 0 matches", matches) - } - - sres = searchForTest(t, b, - &query.Substring{ - Pattern: "BaNaNAS", - CaseSensitive: true, - }) - matches = sres.Files - if len(matches) != 1 { - t.Errorf("no foldcase: got %v, want 1 matches", matches) - } else if matches[0].LineMatches[0].LineFragments[0].Offset != 7 { - t.Errorf("foldcase: got %v, want offsets 7", matches) - } -} - -func TestAndSearch(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("f1", []byte("x banana y")) - b.AddFile("f2", []byte("x apple y")) - b.AddFile("f3", []byte("x banana apple y")) - // ---------------------0123456789012345 - sres := searchForTest(t, b, query.NewAnd( - &query.Substring{ - Pattern: "banana", - }, - &query.Substring{ - Pattern: "apple", - }, - )) - matches := sres.Files - if len(matches) != 1 || len(matches[0].LineMatches) != 1 || len(matches[0].LineMatches[0].LineFragments) != 2 { - t.Fatalf("got %#v, want 1 match with 2 fragments", matches) - } - - if matches[0].LineMatches[0].LineFragments[0].Offset != 2 || matches[0].LineMatches[0].LineFragments[1].Offset != 9 { - t.Fatalf("got %#v, want offsets 2,9", matches) - } - - wantStats := Stats{ - FilesLoaded: 1, - 
ContentBytesLoaded: 18, - IndexBytesLoaded: 8, - NgramMatches: 3, // we look at doc 1, because it's max(0,1) due to AND - MatchCount: 1, - FileCount: 1, - FilesConsidered: 2, - } - if diff := pretty.Compare(wantStats, sres.Stats); diff != "" { - t.Errorf("got stats diff %s", diff) - } -} - -func TestAndNegateSearch(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("f1", []byte("x banana y")) - b.AddFile("f4", []byte("x banana apple y")) - // ---------------------0123456789012345 - sres := searchForTest(t, b, query.NewAnd( - &query.Substring{ - Pattern: "banana", - }, - &query.Not{Child: &query.Substring{ - Pattern: "apple", - }})) - - matches := sres.Files - - if len(matches) != 1 || len(matches[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 match", matches) - } - if matches[0].FileName != "f1" { - t.Fatalf("got match %#v, want FileName: f1", matches[0]) - } - if matches[0].LineMatches[0].LineFragments[0].Offset != 2 { - t.Fatalf("got %v, want offsets 2,9", matches) - } -} - -func TestNegativeMatchesOnlyShortcut(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("f1", []byte("x banana y")) - b.AddFile("f2", []byte("x appelmoes y")) - b.AddFile("f3", []byte("x appelmoes y")) - b.AddFile("f3", []byte("x appelmoes y")) - - sres := searchForTest(t, b, query.NewAnd( - &query.Substring{ - Pattern: "banana", - }, - &query.Not{Child: &query.Substring{ - Pattern: "appel", - }})) - - if sres.Stats.FilesConsidered != 1 { - t.Errorf("got %#v, want FilesConsidered: 1", sres.Stats) - } -} - -func TestFileSearch(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("banzana", []byte("x orange y")) - // --------0123456 - b.AddFile("banana", []byte("x apple y")) - // --------789012 - sres := searchForTest(t, b, &query.Substring{ - Pattern: "anan", - FileName: true, 
- }) - - matches := sres.Files - if len(matches) != 1 || len(matches[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 match", matches) - } - - got := matches[0].LineMatches[0] - want := LineMatch{ - Line: []byte("banana"), - LineFragments: []LineFragmentMatch{{ - Offset: 1, - LineOffset: 1, - MatchLength: 4, - }}, - FileName: true, - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("got %#v, want %#v", got, want) - } -} - -func TestFileCase(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("BANANA", []byte("x orange y")) - sres := searchForTest(t, b, &query.Substring{ - Pattern: "banana", - FileName: true, - }) - - matches := sres.Files - if len(matches) != 1 || matches[0].FileName != "BANANA" { - t.Fatalf("got %v, want 1 match 'BANANA'", matches) - } -} - -func TestFileRegexpSearchBruteForce(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("banzana", []byte("x orange y")) - // --------------------------0123456879 - b.AddFile("banana", []byte("x apple y")) - sres := searchForTest(t, b, &query.Regexp{ - Regexp: mustParseRE("[qn][zx]"), - FileName: true, - }) - - matches := sres.Files - if len(matches) != 1 || matches[0].FileName != "banzana" { - t.Fatalf("got %v, want 1 match on 'banzana'", matches) - } -} - -func TestFileRegexpSearchShortString(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("banana.py", []byte("x orange y")) - sres := searchForTest(t, b, &query.Regexp{ - Regexp: mustParseRE("ana.py"), - FileName: true, - }) - - matches := sres.Files - if len(matches) != 1 || matches[0].FileName != "banana.py" { - t.Fatalf("got %v, want 1 match on 'banana.py'", matches) - } -} - -func TestFileSubstringSearchBruteForce(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - 
b.AddFile("BANZANA", []byte("x orange y")) - b.AddFile("banana", []byte("x apple y")) - - q := &query.Substring{ - Pattern: "z", - FileName: true, - } - - res := searchForTest(t, b, q) - if len(res.Files) != 1 || res.Files[0].FileName != "BANZANA" { - t.Fatalf("got %v, want 1 match on 'BANZANA''", res.Files) - } -} - -func TestFileSubstringSearchBruteForceEnd(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("BANZANA", []byte("x orange y")) - b.AddFile("bananaq", []byte("x apple y")) - - q := &query.Substring{ - Pattern: "q", - FileName: true, - } - - res := searchForTest(t, b, q) - if want := "bananaq"; len(res.Files) != 1 || res.Files[0].FileName != want { - t.Fatalf("got %v, want 1 match in %q", res.Files, want) - } -} - -func TestSearchMatchAll(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("banzana", []byte("x orange y")) - // --------------------------0123456879 - b.AddFile("banana", []byte("x apple y")) - sres := searchForTest(t, b, &query.Const{Value: true}) - - matches := sres.Files - if len(matches) != 2 { - t.Fatalf("got %v, want 2 matches", matches) - } -} - -func TestSearchNewline(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("banzana", []byte("abcd\ndefg")) - sres := searchForTest(t, b, &query.Substring{Pattern: "d\nd"}) - - // Just check that we don't crash. 
- - matches := sres.Files - if len(matches) != 1 { - t.Fatalf("got %v, want 1 matches", matches) - } -} - -func TestSearchMatchAllRegexp(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("banzana", []byte("abcd")) - // --------------------------0123456879 - b.AddFile("banana", []byte("pqrs")) - sres := searchForTest(t, b, &query.Regexp{Regexp: mustParseRE(".")}) - - matches := sres.Files - if len(matches) != 2 || sres.Stats.MatchCount != 2 { - t.Fatalf("got %v, want 2 matches", matches) - } - if len(matches[0].LineMatches[0].Line) != 4 || len(matches[1].LineMatches[0].Line) != 4 { - t.Fatalf("want 4 chars in every file, got %#v", matches) - } -} - -func TestFileRestriction(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - b.AddFile("banana1", []byte("x orange y")) - // --------------------------0123456879 - b.AddFile("banana2", []byte("x apple y")) - b.AddFile("orange", []byte("x apple y")) - sres := searchForTest(t, b, query.NewAnd( - &query.Substring{ - Pattern: "banana", - FileName: true, - }, - &query.Substring{ - Pattern: "apple", - })) - - matches := sres.Files - if len(matches) != 1 || len(matches[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 match", matches) - } - - match := matches[0].LineMatches[0] - got := string(match.Line) - want := "x apple y" - if got != want { - t.Errorf("got match %#v, want line %q", match, want) - } -} - -func TestFileNameBoundary(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{Name: "banana2", Content: []byte("x apple y")}, - Document{Name: "helpers.go", Content: []byte("x apple y")}, - Document{Name: "foo", Content: []byte("x apple y")}) - sres := searchForTest(t, b, &query.Substring{ - Pattern: "helpers.go", - FileName: true, - }) - - matches := sres.Files - if len(matches) != 1 || len(matches[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 match", matches) - } -} - -func 
TestWordBoundaryRanking(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: []byte("xbytex xbytex")}, - Document{Name: "f2", Content: []byte("xbytex\nbytex\nbyte bla")}, - // -----------------------------------0123456 789012 34567890 - Document{Name: "f3", Content: []byte("xbytex ybytex")}) - - sres := searchForTest(t, b, &query.Substring{ - Pattern: "byte", - }) - - if len(sres.Files) != 3 { - t.Fatalf("got %#v, want 3 files", sres.Files) - } - - file0 := sres.Files[0] - if file0.FileName != "f2" || len(file0.LineMatches) != 3 { - t.Fatalf("got file %s, num matches %d (%#v), want 3 matches in file f2", file0.FileName, len(file0.LineMatches), file0) - } - - if file0.LineMatches[0].LineFragments[0].Offset != 13 { - t.Fatalf("got first match %#v, want full word match", sres.Files[0].LineMatches[0]) - } - if file0.LineMatches[1].LineFragments[0].Offset != 7 { - t.Fatalf("got second match %#v, want partial word match", sres.Files[0].LineMatches[1]) - } -} - -func TestDocumentOrder(t *testing.T) { - var docs []Document - for i := 0; i < 3; i++ { - docs = append(docs, Document{Name: fmt.Sprintf("f%d", i), Content: []byte("needle")}) - } - - b := testIndexBuilder(t, nil, docs...) 
- - sres := searchForTest(t, b, query.NewAnd( - &query.Substring{ - Pattern: "needle", - })) - - want := []string{"f0", "f1", "f2"} - var got []string - for _, f := range sres.Files { - got = append(got, f.FileName) - } - if !reflect.DeepEqual(got, want) { - t.Fatalf("got %v, want %v", got, want) - } -} - -func TestBranchMask(t *testing.T) { - b := testIndexBuilder(t, &Repository{ - Branches: []RepositoryBranch{ - {"master", "v-master"}, - {"stable", "v-stable"}, - {"bonzai", "v-bonzai"}, - }, - }, Document{Name: "f1", Content: []byte("needle"), Branches: []string{"master"}}, - Document{Name: "f2", Content: []byte("needle"), Branches: []string{"stable", "master"}}, - Document{Name: "f3", Content: []byte("needle"), Branches: []string{"stable", "master"}}, - Document{Name: "f4", Content: []byte("needle"), Branches: []string{"bonzai"}}, - ) - - sres := searchForTest(t, b, query.NewAnd( - &query.Substring{ - Pattern: "needle", - }, - &query.Branch{ - Pattern: "table", - })) - - if len(sres.Files) != 2 || sres.Files[0].FileName != "f2" || sres.Files[1].FileName != "f3" { - t.Fatalf("got %v, want 2 result from [f2,f3]", sres.Files) - } - - if len(sres.Files[0].Branches) != 1 || sres.Files[0].Branches[0] != "stable" { - t.Fatalf("got %v, want 1 branch 'stable'", sres.Files[0].Branches) - } -} - -func TestBranchLimit(t *testing.T) { - for limit := 64; limit <= 65; limit++ { - r := &Repository{} - for i := 0; i < limit; i++ { - s := fmt.Sprintf("b%d", i) - r.Branches = append(r.Branches, RepositoryBranch{ - s, "v-" + s, - }) - } - _, err := NewIndexBuilder(r) - if limit == 64 && err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } else if limit == 65 && err == nil { - t.Fatalf("NewIndexBuilder succeeded") - } - } -} - -func TestBranchReport(t *testing.T) { - branches := []string{"stable", "master"} - b := testIndexBuilder(t, &Repository{ - Branches: []RepositoryBranch{ - {"stable", "vs"}, - {"master", "vm"}, - }, - }, - Document{Name: "f2", Content: []byte("needle"), 
Branches: branches}) - sres := searchForTest(t, b, &query.Substring{ - Pattern: "needle", - }) - if len(sres.Files) != 1 { - t.Fatalf("got %v, want 1 result from f2", sres.Files) - } - - f := sres.Files[0] - if !reflect.DeepEqual(f.Branches, branches) { - t.Fatalf("got branches %q, want %q", f.Branches, branches) - } -} - -func TestBranchVersions(t *testing.T) { - b := testIndexBuilder(t, &Repository{ - Branches: []RepositoryBranch{ - {"stable", "v-stable"}, - {"master", "v-master"}, - }, - }, Document{Name: "f2", Content: []byte("needle"), Branches: []string{"master"}}) - - sres := searchForTest(t, b, &query.Substring{ - Pattern: "needle", - }) - if len(sres.Files) != 1 { - t.Fatalf("got %v, want 1 result from f2", sres.Files) - } - - f := sres.Files[0] - if f.Version != "v-master" { - t.Fatalf("got file %#v, want version 'v-master'", f) - } -} - -func mustParseRE(s string) *syntax.Regexp { - r, err := syntax.Parse(s, 0) - if err != nil { - panic(err) - } - - return r -} - -func TestRegexp(t *testing.T) { - content := []byte("needle the bla") - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - }) - // ------------------------------01234567890123 - - sres := searchForTest(t, b, - &query.Regexp{ - Regexp: mustParseRE("dle.*bla"), - }) - - if len(sres.Files) != 1 || len(sres.Files[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 match in 1 file", sres.Files) - } - - got := sres.Files[0].LineMatches[0] - want := LineMatch{ - LineFragments: []LineFragmentMatch{{ - LineOffset: 3, - Offset: 3, - MatchLength: 11, - }}, - Line: content, - FileName: false, - LineNumber: 1, - LineStart: 0, - LineEnd: 14, - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("got %#v, want %#v", got, want) - } -} - -func TestRegexpFile(t *testing.T) { - content := []byte("needle the bla") - // ----------------01234567890123 - - name := "let's play: find the mussel" - b := testIndexBuilder(t, nil, - Document{Name: name, Content: content}, - Document{Name: 
"play.txt", Content: content}) - - sres := searchForTest(t, b, - &query.Regexp{ - Regexp: mustParseRE("play.*mussel"), - FileName: true, - }) - - if len(sres.Files) != 1 || len(sres.Files[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 match in 1 file", sres.Files) - } - - if sres.Files[0].FileName != name { - t.Errorf("got match %#v, want name %q", sres.Files[0], name) - } -} - -func TestRegexpOrder(t *testing.T) { - content := []byte("bla the needle") - // ----------------01234567890123 - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: content}) - - sres := searchForTest(t, b, - &query.Regexp{ - Regexp: mustParseRE("dle.*bla"), - }) - - if len(sres.Files) != 0 { - t.Fatalf("got %v, want 0 matches", sres.Files) - } -} - -func TestRepoName(t *testing.T) { - content := []byte("bla the needle") - // ----------------01234567890123 - b := testIndexBuilder(t, &Repository{Name: "bla"}, - Document{Name: "f1", Content: content}) - - sres := searchForTest(t, b, - query.NewAnd( - &query.Substring{Pattern: "needle"}, - &query.Repo{Pattern: "foo"}, - )) - - if len(sres.Files) != 0 { - t.Fatalf("got %v, want 0 matches", sres.Files) - } - - if sres.Stats.FilesConsidered > 0 { - t.Fatalf("got FilesConsidered %d, should have short circuited", sres.Stats.FilesConsidered) - } - - sres = searchForTest(t, b, - query.NewAnd( - &query.Substring{Pattern: "needle"}, - &query.Repo{Pattern: "bla"}, - )) - if len(sres.Files) != 1 { - t.Fatalf("got %v, want 1 match", sres.Files) - } -} - -func TestMergeMatches(t *testing.T) { - content := []byte("blablabla") - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: content}) - - sres := searchForTest(t, b, - &query.Substring{Pattern: "bla"}) - if len(sres.Files) != 1 || len(sres.Files[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 match", sres.Files) - } -} - -func TestRepoURL(t *testing.T) { - content := []byte("blablabla") - b := testIndexBuilder(t, &Repository{ - Name: "name", - URL: "URL", - CommitURLTemplate: 
"commit", - FileURLTemplate: "file-url", - LineFragmentTemplate: "fragment", - }, Document{Name: "f1", Content: content}) - - sres := searchForTest(t, b, &query.Substring{Pattern: "bla"}) - - if sres.RepoURLs["name"] != "file-url" { - t.Errorf("got RepoURLs %v, want {name: URL}", sres.RepoURLs) - } - if sres.LineFragments["name"] != "fragment" { - t.Errorf("got URLs %v, want {name: URL}", sres.LineFragments) - } -} - -func TestRegexpCaseSensitive(t *testing.T) { - content := []byte("bla\nfunc unmarshalGitiles\n") - b := testIndexBuilder(t, nil, Document{ - Name: "f1", - Content: content, - }) - - res := searchForTest(t, b, - &query.Regexp{ - Regexp: mustParseRE("func.*Gitiles"), - CaseSensitive: true, - }) - - if len(res.Files) != 1 { - t.Fatalf("got %v, want one match", res.Files) - } -} - -func TestRegexpCaseFolding(t *testing.T) { - content := []byte("bla\nfunc unmarshalGitiles\n") - - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: content}) - res := searchForTest(t, b, - &query.Regexp{ - Regexp: mustParseRE("func.*GITILES"), - CaseSensitive: false, - }) - - if len(res.Files) != 1 { - t.Fatalf("got %v, want one match", res.Files) - } -} - -func TestCaseRegexp(t *testing.T) { - content := []byte("BLABLABLA") - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: content}) - res := searchForTest(t, b, - &query.Regexp{ - Regexp: mustParseRE("[xb][xl][xa]"), - CaseSensitive: true, - }) - - if len(res.Files) > 0 { - t.Fatalf("got %v, want no matches", res.Files) - } -} - -func TestNegativeRegexp(t *testing.T) { - content := []byte("BLABLABLA needle bla") - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: content}) - res := searchForTest(t, b, - query.NewAnd( - &query.Substring{ - Pattern: "needle", - }, - &query.Not{ - Child: &query.Regexp{ - Regexp: mustParseRE(".cs"), - }, - })) - - if len(res.Files) != 1 { - t.Fatalf("got %v, want 1 match", res.Files) - } -} - -func TestSymbolRank(t *testing.T) { - content := []byte("func bla() 
blubxxxxx") - // ----------------01234567890123456789 - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - }, Document{ - Name: "f2", - Content: content, - Symbols: []DocumentSection{{5, 8}}, - }, Document{ - Name: "f3", - Content: content, - }) - - res := searchForTest(t, b, - &query.Substring{ - CaseSensitive: false, - Pattern: "bla", - }) - - if len(res.Files) != 3 { - t.Fatalf("got %d files, want 3 files. Full data: %v", len(res.Files), res.Files) - } - if res.Files[0].FileName != "f2" { - t.Errorf("got %#v, want 'f2' as top match", res.Files[0]) - } -} - -func TestSymbolRankRegexpUTF8(t *testing.T) { - prefix := strings.Repeat(string([]rune{kelvinCodePoint}), 100) + "\n" - content := []byte(prefix + - "func bla() blub") - // ------012345678901234 - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - }, Document{ - Name: "f2", - Content: content, - Symbols: []DocumentSection{{uint32(len(prefix) + 5), uint32(len(prefix) + 8)}}, - }, Document{ - Name: "f3", - Content: content, - }) - - res := searchForTest(t, b, - &query.Regexp{ - Regexp: mustParseRE("b.a"), - }) - - if len(res.Files) != 3 { - t.Fatalf("got %#v, want 3 files", res.Files) - } - if res.Files[0].FileName != "f2" { - t.Errorf("got %#v, want 'f2' as top match", res.Files[0]) - } -} - -func TestPartialSymbolRank(t *testing.T) { - content := []byte("func bla() blub") - // ----------------012345678901234 - - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - Symbols: []DocumentSection{{4, 9}}, - }, Document{ - Name: "f2", - Content: content, - Symbols: []DocumentSection{{4, 8}}, - }, Document{ - Name: "f3", - Content: content, - Symbols: []DocumentSection{{4, 9}}, - }) - - res := searchForTest(t, b, - &query.Substring{ - Pattern: "bla", - }) - - if len(res.Files) != 3 { - t.Fatalf("got %#v, want 3 files", res.Files) - } - if res.Files[0].FileName != "f2" { - t.Errorf("got %#v, want 'f2' as top match", res.Files[0]) - } -} - 
-func TestNegativeRepo(t *testing.T) { - content := []byte("bla the needle") - // ----------------01234567890123 - b := testIndexBuilder(t, &Repository{ - Name: "bla", - }, Document{Name: "f1", Content: content}) - - sres := searchForTest(t, b, - query.NewAnd( - &query.Substring{Pattern: "needle"}, - &query.Not{Child: &query.Repo{Pattern: "bla"}}, - )) - - if len(sres.Files) != 0 { - t.Fatalf("got %v, want 0 matches", sres.Files) - } -} - -func TestListRepos(t *testing.T) { - content := []byte("bla the needle") - // ----------------01234567890123 - b := testIndexBuilder(t, &Repository{ - Name: "reponame", - }, - Document{Name: "f1", Content: content}, - Document{Name: "f2", Content: content}) - - searcher := searcherForTest(t, b) - q := &query.Repo{Pattern: "epo"} - res, err := searcher.List(context.Background(), q) - if err != nil { - t.Fatalf("List(%v): %v", q, err) - } - if len(res.Repos) != 1 || res.Repos[0].Repository.Name != "reponame" { - t.Fatalf("got %v, want 1 matches", res) - } - if got := res.Repos[0].Stats.Shards; got != 1 { - t.Fatalf("got %d, want 1 shard", got) - } - q = &query.Repo{Pattern: "bla"} - res, err = searcher.List(context.Background(), q) - if err != nil { - t.Fatalf("List(%v): %v", q, err) - } - if len(res.Repos) != 0 { - t.Fatalf("got %v, want 0 matches", res) - } -} - -func TestMetadata(t *testing.T) { - content := []byte("bla the needle") - // ----------------01234567890123 - b := testIndexBuilder(t, &Repository{ - Name: "reponame", - }, Document{Name: "f1", Content: content}, - Document{Name: "f2", Content: content}) - - var buf bytes.Buffer - b.Write(&buf) - f := &memSeeker{buf.Bytes()} - - rd, _, err := ReadMetadata(f) - if err != nil { - t.Fatalf("ReadMetadata: %v", err) - } - - if got, want := rd.Name, "reponame"; got != want { - t.Fatalf("got %q want %q", got, want) - } -} - -func TestOr(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: []byte("needle")}, - Document{Name: "f2", Content: 
[]byte("banana")}) - sres := searchForTest(t, b, query.NewOr( - &query.Substring{Pattern: "needle"}, - &query.Substring{Pattern: "banana"})) - - if len(sres.Files) != 2 { - t.Fatalf("got %v, want 2 files", sres.Files) - } -} - -func TestAtomCountScore(t *testing.T) { - b := testIndexBuilder(t, - &Repository{ - Branches: []RepositoryBranch{ - {"branches", "v1"}, - {"needle", "v2"}, - }, - }, - Document{Name: "f1", Content: []byte("needle the bla"), Branches: []string{"branches"}}, - Document{Name: "needle-file-branch", Content: []byte("needle content"), Branches: []string{"needle"}}, - Document{Name: "needle-file", Content: []byte("needle content"), Branches: []string{"branches"}}) - - sres := searchForTest(t, b, - query.NewOr( - &query.Substring{Pattern: "needle"}, - &query.Substring{Pattern: "needle", FileName: true}, - &query.Branch{Pattern: "needle"}, - )) - var got []string - for _, f := range sres.Files { - got = append(got, f.FileName) - } - want := []string{"needle-file-branch", "needle-file", "f1"} - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v, want %v", got, want) - } -} - -func TestImportantCutoff(t *testing.T) { - content := []byte("func bla() blub") - // ----------------012345678901234 - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - Symbols: []DocumentSection{{5, 8}}, - }, Document{ - Name: "f2", - Content: content, - }) - opts := SearchOptions{ - ShardMaxImportantMatch: 1, - } - - sres := searchForTest(t, b, &query.Substring{Pattern: "bla"}, opts) - if len(sres.Files) != 1 || sres.Files[0].FileName != "f1" { - t.Errorf("got %v, wanted 1 match 'f1'", sres.Files) - } -} - -func TestFrequency(t *testing.T) { - content := []byte("sla _Py_HashDouble(double v sla las las shd dot dot") - // ----------------012345678901234 - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - }) - - sres := searchForTest(t, b, &query.Substring{Pattern: "slashdot"}) - if len(sres.Files) != 0 { - t.Errorf("got 
%v, wanted 0 matches", sres.Files) - } -} - -func TestMatchNewline(t *testing.T) { - re, err := syntax.Parse("[^a]a", syntax.ClassNL) - if err != nil { - t.Fatalf("syntax.Parse: %v", err) - } - - content := []byte("pqr\nalex") - // ----------------0123 4567 - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - }) - - sres := searchForTest(t, b, &query.Regexp{Regexp: re, CaseSensitive: true}) - if len(sres.Files) != 1 { - t.Errorf("got %v, wanted 1 matches", sres.Files) - } else if l := sres.Files[0].LineMatches[0].Line; !bytes.Equal(l, content[len("pqr\n"):]) { - t.Errorf("got match line %q, want %q", l, content) - } -} - -func TestSubRepo(t *testing.T) { - subRepos := map[string]*Repository{ - "sub": { - Name: "sub-name", - LineFragmentTemplate: "sub-line", - }, - } - - content := []byte("pqr\nalex") - // ----------------0123 4567 - - b := testIndexBuilder(t, &Repository{ - SubRepoMap: subRepos, - }, Document{ - Name: "sub/f1", - Content: content, - SubRepositoryPath: "sub", - }) - - sres := searchForTest(t, b, &query.Substring{Pattern: "alex"}) - if len(sres.Files) != 1 { - t.Fatalf("got %v, wanted 1 matches", sres.Files) - } - - f := sres.Files[0] - if f.SubRepositoryPath != "sub" || f.SubRepositoryName != "sub-name" { - t.Errorf("got %#v, want SubRepository{Path,Name} = {'sub', 'sub-name'}", f) - } - - if sres.LineFragments["sub-name"] != "sub-line" { - t.Errorf("got LineFragmentTemplate %v, want {'sub':'sub-line'}", sres.LineFragments) - } -} - -func TestSearchEither(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: []byte("bla needle bla")}, - Document{Name: "needle-file-branch", Content: []byte("bla content")}) - - sres := searchForTest(t, b, &query.Substring{Pattern: "needle"}) - if len(sres.Files) != 2 { - t.Fatalf("got %v, wanted 2 matches", sres.Files) - } - - sres = searchForTest(t, b, &query.Substring{Pattern: "needle", Content: true}) - if len(sres.Files) != 1 { - t.Fatalf("got %v, wanted 1 
match", sres.Files) - } - - if got, want := sres.Files[0].FileName, "f1"; got != want { - t.Errorf("got %q, want %q", got, want) - } -} - -func TestUnicodeExactMatch(t *testing.T) { - needle := "néédlÉ" - content := []byte("blá blá " + needle + " blâ") - // ----------------01234567 8 - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: content}) - - if res := searchForTest(t, b, &query.Substring{Pattern: needle, CaseSensitive: true}); len(res.Files) != 1 { - t.Fatalf("case sensitive: got %v, wanted 1 match", res.Files) - } -} - -func TestUnicodeCoverContent(t *testing.T) { - needle := "néédlÉ" - content := []byte("blá blá " + needle + " blâ") - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: content}) - - if res := searchForTest(t, b, &query.Substring{Pattern: "NÉÉDLÉ", CaseSensitive: true}); len(res.Files) != 0 { - t.Fatalf("case sensitive: got %v, wanted 0 match", res.Files) - } - - res := searchForTest(t, b, &query.Substring{Pattern: "NÉÉDLÉ"}) - if len(res.Files) != 1 { - t.Fatalf("case insensitive: got %v, wanted 1 match", res.Files) - } - - if got, want := res.Files[0].LineMatches[0].LineFragments[0].Offset, uint32(strings.Index(string(content), needle)); got != want { - t.Errorf("got %d want %d", got, want) - } -} - -func TestUnicodeNonCoverContent(t *testing.T) { - needle := "nééáádlÉ" - //---------01234567 - content := []byte("blá blá " + needle + " blâ") - // ----------------01234567 8901234 5678 - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: content}) - - res := searchForTest(t, b, &query.Substring{Pattern: "NÉÉÁÁDLÉ", Content: true}) - if len(res.Files) != 1 { - t.Fatalf("got %v, wanted 1 match", res.Files) - } - - if got, want := res.Files[0].LineMatches[0].LineFragments[0].Offset, uint32(strings.Index(string(content), needle)); got != want { - t.Errorf("got %d want %d", got, want) - } -} - -const kelvinCodePoint = 8490 - -func TestUnicodeVariableLength(t *testing.T) { - lower := 'k' - upper := 
rune(kelvinCodePoint) - - needle := "nee" + string([]rune{lower}) + "eed" - corpus := []byte("nee" + string([]rune{upper}) + "eed" + - " ee" + string([]rune{lower}) + "ee" + - " ee" + string([]rune{upper}) + "ee") - - b := testIndexBuilder(t, nil, - Document{Name: "f1", Content: []byte(corpus)}) - - res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}) - if len(res.Files) != 1 { - t.Fatalf("got %v, wanted 1 match", res.Files) - } -} - -func TestUnicodeFileStartOffsets(t *testing.T) { - unicode := "世界" - wat := "waaaaaat" - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: []byte(unicode), - }, - Document{ - Name: "f2", - Content: []byte(wat), - }, - ) - q := &query.Substring{Pattern: wat, Content: true} - res := searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Fatalf("got %v, wanted 1 match", res.Files) - } -} - -func TestLongFileUTF8(t *testing.T) { - needle := "neeedle" - - // 6 bytes. - unicode := "世界" - content := []byte(strings.Repeat(unicode, 100) + needle) - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: []byte(strings.Repeat("a", 50)), - }, - Document{ - Name: "f2", - Content: content, - }) - - q := &query.Substring{Pattern: needle, Content: true} - res := searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Errorf("got %v, want 1 result", res) - } -} - -func TestEstimateDocCount(t *testing.T) { - content := []byte("bla needle bla") - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f1", Content: content}, - Document{Name: "f2", Content: content}, - ) - - if sres := searchForTest(t, b, - query.NewAnd( - &query.Substring{Pattern: "needle"}, - &query.Repo{Pattern: "reponame"}, - ), SearchOptions{ - EstimateDocCount: true, - }); sres.Stats.ShardFilesConsidered != 2 { - t.Errorf("got FilesConsidered = %d, want 2", sres.Stats.FilesConsidered) - } - if sres := searchForTest(t, b, - query.NewAnd( - &query.Substring{Pattern: "needle"}, - &query.Repo{Pattern: "nomatch"}, - ), 
SearchOptions{ - EstimateDocCount: true, - }); sres.Stats.ShardFilesConsidered != 0 { - t.Errorf("got FilesConsidered = %d, want 0", sres.Stats.FilesConsidered) - } -} - -func TestUTF8CorrectCorpus(t *testing.T) { - needle := "neeedle" - - // 6 bytes. - unicode := "世界" - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: []byte(strings.Repeat(unicode, 100)), - }, - Document{ - Name: "xxxxxneeedle", - Content: []byte("hello"), - }) - - q := &query.Substring{Pattern: needle, FileName: true} - res := searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Errorf("got %v, want 1 result", res) - } -} - -func TestBuilderStats(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: []byte(strings.Repeat("abcd", 1024)), - }) - var buf bytes.Buffer - b.Write(&buf) - - if got, want := b.ContentSize(), uint32(2+4*1024); got != want { - t.Errorf("got %d, want %d", got, want) - } -} - -func TestIOStats(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: []byte(strings.Repeat("abcd", 1024)), - }) - - q := &query.Substring{Pattern: "abc", CaseSensitive: true, Content: true} - res := searchForTest(t, b, q) - - // 4096 (content) + 2 (overhead: newlines or doc sections) - if got, want := res.Stats.ContentBytesLoaded, int64(4098); got != want { - t.Errorf("got content I/O %d, want %d", got, want) - } - - // 1024 entries, each 4 bytes apart. 4 fits into single byte - // delta encoded. 
- if got, want := res.Stats.IndexBytesLoaded, int64(1024); got != want { - t.Errorf("got index I/O %d, want %d", got, want) - } -} - -func TestStartLineAnchor(t *testing.T) { - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: []byte( - `hello -start of middle of line -`), - }) - - q, err := query.Parse("^start") - if err != nil { - t.Errorf("parse: %v", err) - } - - res := searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Errorf("got %v, want 1 file", res.Files) - } - - q, err = query.Parse("^middle") - if err != nil { - t.Errorf("parse: %v", err) - } - res = searchForTest(t, b, q) - if len(res.Files) != 0 { - t.Errorf("got %v, want 0 files", res.Files) - } -} - -func TestAndOrUnicode(t *testing.T) { - q, err := query.Parse("orange.*apple") - if err != nil { - t.Errorf("parse: %v", err) - } - finalQ := query.NewAnd(q, - query.NewOr(query.NewAnd(&query.Repo{Pattern: "name"}, - query.NewOr(&query.Branch{Pattern: "master"})))) - - b := testIndexBuilder(t, &Repository{ - Name: "name", - Branches: []RepositoryBranch{{"master", "master-version"}}, - }, Document{ - Name: "f2", - Content: []byte("orange\u2318apple"), - // --------------0123456 78901 - Branches: []string{"master"}, - }) - - res := searchForTest(t, b, finalQ) - if len(res.Files) != 1 { - t.Errorf("got %v, want 1 result", res.Files) - } -} - -func TestAndShort(t *testing.T) { - content := []byte("bla needle at orange bla") - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f1", Content: content}, - Document{Name: "f2", Content: []byte("xx at xx")}, - Document{Name: "f3", Content: []byte("yy orange xx")}, - ) - - q := query.NewAnd(&query.Substring{Pattern: "at"}, - &query.Substring{Pattern: "orange"}) - - res := searchForTest(t, b, q) - if len(res.Files) != 1 || res.Files[0].FileName != "f1" { - t.Errorf("got %v, want 1 result", res.Files) - } -} - -func TestNoCollectRegexpSubstring(t *testing.T) { - content := []byte("bla final bla\nfoo final, foo") - b := 
testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f1", Content: content}, - ) - - q := &query.Regexp{ - Regexp: mustParseRE("final[,.]"), - } - - res := searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Fatalf("got %v, want 1 result", res.Files) - } - if f := res.Files[0]; len(f.LineMatches) != 1 { - t.Fatalf("got line matches %v, want 1 line match", printLineMatches(f.LineMatches)) - } -} - -func printLineMatches(ms []LineMatch) string { - var ss []string - for _, m := range ms { - ss = append(ss, fmt.Sprintf("%d:%q %v", m.LineNumber, m.Line, m.LineFragments)) - } - - return strings.Join(ss, ", ") -} - -func TestLang(t *testing.T) { - content := []byte("bla needle bla") - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f1", Content: content}, - Document{Name: "f2", Language: "java", Content: content}, - Document{Name: "f3", Language: "cpp", Content: content}, - ) - - q := query.NewAnd(&query.Substring{Pattern: "needle"}, - &query.Language{Language: "cpp"}) - - res := searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Fatalf("got %v, want 1 result in f3", res.Files) - } - f := res.Files[0] - if f.FileName != "f3" || f.Language != "cpp" { - t.Fatalf("got %v, want 1 match with language cpp", f) - } -} - -func TestLangShortcut(t *testing.T) { - content := []byte("bla needle bla") - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f2", Language: "java", Content: content}, - Document{Name: "f3", Language: "cpp", Content: content}, - ) - - q := query.NewAnd(&query.Substring{Pattern: "needle"}, - &query.Language{Language: "fortran"}) - - res := searchForTest(t, b, q) - if len(res.Files) != 0 { - t.Fatalf("got %v, want 0 results", res.Files) - } - if res.Stats.IndexBytesLoaded > 0 { - t.Errorf("got IndexBytesLoaded %d, want 0", res.Stats.IndexBytesLoaded) - } -} - -func TestNoTextMatchAtoms(t *testing.T) { - content := []byte("bla needle bla") - b := testIndexBuilder(t, &Repository{Name: 
"reponame"}, - Document{Name: "f1", Content: content}, - Document{Name: "f2", Language: "java", Content: content}, - Document{Name: "f3", Language: "cpp", Content: content}, - ) - q := query.NewAnd(&query.Language{Language: "java"}) - res := searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Fatalf("got %v, want 1 result in f3", res.Files) - } -} - -func TestNoPositiveAtoms(t *testing.T) { - content := []byte("bla needle bla") - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f1", Content: content}, - Document{Name: "f2", Content: content}, - ) - - q := query.NewAnd( - &query.Not{Child: &query.Substring{Pattern: "xyz"}}, - &query.Repo{Pattern: "reponame"}) - res := searchForTest(t, b, q) - if len(res.Files) != 2 { - t.Fatalf("got %v, want 2 results in f3", res.Files) - } -} - -func TestSymbolBoundaryStart(t *testing.T) { - content := []byte("start\nbla bla\nend") - // ----------------012345 67890123 456 - - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{ - Name: "f1", - Content: content, - Symbols: []DocumentSection{{0, 5}, {14, 17}}, - }, - ) - q := &query.Symbol{ - Atom: &query.Substring{Pattern: "start"}, - } - res := searchForTest(t, b, q) - if len(res.Files) != 1 || len(res.Files[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 line in 1 file", res.Files) - } - m := res.Files[0].LineMatches[0].LineFragments[0] - if m.Offset != 0 { - t.Fatalf("got offset %d want 0", m.Offset) - } -} - -func TestSymbolBoundaryEnd(t *testing.T) { - content := []byte("start\nbla bla\nend") - // ----------------012345 67890123 456 - - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{ - Name: "f1", - Content: content, - Symbols: []DocumentSection{{14, 17}}, - }, - ) - q := &query.Symbol{ - Atom: &query.Substring{Pattern: "end"}, - } - res := searchForTest(t, b, q) - if len(res.Files) != 1 || len(res.Files[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 line in 1 file", res.Files) - } - m := 
res.Files[0].LineMatches[0].LineFragments[0] - if m.Offset != 14 { - t.Fatalf("got offset %d want 0", m.Offset) - } -} - -func TestSymbolAtom(t *testing.T) { - content := []byte("bla\nsymblabla\nbla") - // ----------------0123 456789012 - - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{ - Name: "f1", - Content: content, - Symbols: []DocumentSection{{4, 12}}, - }, - ) - q := &query.Symbol{ - Atom: &query.Substring{Pattern: "bla"}, - } - res := searchForTest(t, b, q) - if len(res.Files) != 1 || len(res.Files[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 line in 1 file", res.Files) - } - m := res.Files[0].LineMatches[0].LineFragments[0] - if m.Offset != 7 || m.MatchLength != 3 { - t.Fatalf("got offset %d, size %d want 7 size 3", m.Offset, m.MatchLength) - } -} - -func TestSymbolAtomExact(t *testing.T) { - content := []byte("bla\nsym\nbla\nsym\nasymb") - // ----------------0123 4567 89012 - - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{ - Name: "f1", - Content: content, - Symbols: []DocumentSection{{4, 7}}, - }, - ) - q := &query.Symbol{ - Atom: &query.Substring{Pattern: "sym"}, - } - res := searchForTest(t, b, q) - if len(res.Files) != 1 || len(res.Files[0].LineMatches) != 1 { - t.Fatalf("got %v, want 1 line in 1 file", res.Files) - } - m := res.Files[0].LineMatches[0].LineFragments[0] - if m.Offset != 4 { - t.Fatalf("got offset %d, want 7", m.Offset) - } -} - -func TestHitIterTerminate(t *testing.T) { - // contrived input: trigram frequencies forces selecting abc + - // def for the distance iteration. There is no match, so this - // will advance the compressedPostingIterator to beyond the - // end. 
- content := []byte("abc bcdbcd cdecde abcabc def efg") - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - }, - ) - searchForTest(t, b, &query.Substring{Pattern: "abcdef"}) -} - -func TestDistanceHitIterBailLast(t *testing.T) { - content := []byte("AST AST AST UASH") - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: content, - }, - ) - res := searchForTest(t, b, &query.Substring{Pattern: "UAST"}) - if len(res.Files) != 0 { - t.Fatalf("got %v, want no results", res.Files) - } -} - -func TestDocumentSectionRuneBoundary(t *testing.T) { - content := string([]rune{kelvinCodePoint, kelvinCodePoint, kelvinCodePoint}) - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - for i, sec := range []DocumentSection{ - {2, 6}, - {3, 7}, - } { - if err := b.Add(Document{ - Name: "f1", - Content: []byte(content), - Symbols: []DocumentSection{sec}, - }); err == nil { - t.Errorf("%d: Add succeeded", i) - } - } -} - -func TestUnicodeQuery(t *testing.T) { - content := string([]rune{kelvinCodePoint, kelvinCodePoint, kelvinCodePoint}) - b := testIndexBuilder(t, nil, - Document{ - Name: "f1", - Content: []byte(content), - }, - ) - - q := &query.Substring{Pattern: content} - res := searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Fatalf("want 1 match, got %v", res.Files) - } - - f := res.Files[0] - if len(f.LineMatches) != 1 { - t.Fatalf("want 1 line, got %v", f.LineMatches) - } - l := f.LineMatches[0] - - if len(l.LineFragments) != 1 { - t.Fatalf("want 1 line fragment, got %v", l.LineFragments) - } - fr := l.LineFragments[0] - if fr.MatchLength != len(content) { - t.Fatalf("got MatchLength %d want %d", fr.MatchLength, len(content)) - } -} - -func TestSkipInvalidContent(t *testing.T) { - for _, content := range []string{ - // Binary - "abc def \x00 abc", - } { - - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - if err := b.Add(Document{ - Name: 
"f1", - Content: []byte(content), - }); err != nil { - t.Fatal(err) - } - - q := &query.Substring{Pattern: "abc def"} - res := searchForTest(t, b, q) - if len(res.Files) != 0 { - t.Fatalf("got %v, want no results", res.Files) - } - - q = &query.Substring{Pattern: "NOT-INDEXED"} - res = searchForTest(t, b, q) - if len(res.Files) != 1 { - t.Fatalf("got %v, want 1 result", res.Files) - } - } -} - -func TestCheckText(t *testing.T) { - for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} { - if err := CheckText([]byte(text), 20000); err != nil { - t.Errorf("CheckText(%q): %v", text, err) - } - } - for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} { - if err := CheckText([]byte(text), 15); err == nil { - t.Errorf("CheckText(%q) succeeded", text) - } - } -} - -func TestLineAnd(t *testing.T) { - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f1", Content: []byte("apple\nbanana\napple banana chocolate apple pudding banana\ngrape")}, - Document{Name: "f2", Content: []byte("apple orange\nbanana")}, - Document{Name: "f3", Content: []byte("banana grape")}, - ) - pattern := "(apple)(?-s:.)*?(banana)" - r, _ := syntax.Parse(pattern, syntax.Perl) - - q := query.Regexp{ - Regexp: r, - Content: true, - } - res := searchForTest(t, b, &q) - wantRegexpCount := 1 - if gotRegexpCount := res.RegexpsConsidered; gotRegexpCount != wantRegexpCount { - t.Errorf("got %d, wanted %d", gotRegexpCount, wantRegexpCount) - } - if len(res.Files) != 1 || res.Files[0].FileName != "f1" { - t.Errorf("got %v, want 1 result", res.Files) - } -} - -func TestLineAndFileName(t *testing.T) { - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f1", Content: []byte("apple banana\ngrape")}, - Document{Name: "f2", Content: []byte("apple banana\norange")}, - Document{Name: "apple banana", Content: []byte("banana grape")}, - ) - pattern := 
"(apple)(?-s:.)*?(banana)" - r, _ := syntax.Parse(pattern, syntax.Perl) - - q := query.Regexp{ - Regexp: r, - FileName: true, - } - res := searchForTest(t, b, &q) - wantRegexpCount := 1 - if gotRegexpCount := res.RegexpsConsidered; gotRegexpCount != wantRegexpCount { - t.Errorf("got %d, wanted %d", gotRegexpCount, wantRegexpCount) - } - if len(res.Files) != 1 || res.Files[0].FileName != "apple banana" { - t.Errorf("got %v, want 1 result", res.Files) - } -} - -func TestMultiLineRegex(t *testing.T) { - b := testIndexBuilder(t, &Repository{Name: "reponame"}, - Document{Name: "f1", Content: []byte("apple banana\ngrape")}, - Document{Name: "f2", Content: []byte("apple orange")}, - Document{Name: "f3", Content: []byte("grape apple")}, - ) - pattern := "(apple).*?[[:space:]].*?(grape)" - r, _ := syntax.Parse(pattern, syntax.Perl) - - q := query.Regexp{ - Regexp: r, - } - res := searchForTest(t, b, &q) - wantRegexpCount := 2 - if gotRegexpCount := res.RegexpsConsidered; gotRegexpCount != wantRegexpCount { - t.Errorf("got %d, wanted %d", gotRegexpCount, wantRegexpCount) - } - if len(res.Files) != 1 || res.Files[0].FileName != "f1" { - t.Errorf("got %v, want 1 result", res.Files) - } -}
diff --git a/indexbuilder.go b/indexbuilder.go deleted file mode 100644 index b5c87aa..0000000 --- a/indexbuilder.go +++ /dev/null
@@ -1,414 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "bytes" - "encoding/binary" - "fmt" - "hash/crc64" - "html/template" - "log" - "path/filepath" - "sort" - "unicode/utf8" -) - -var _ = log.Println - -const ngramSize = 3 - -type searchableString struct { - data []byte -} - -// Filled by the linker (see build-deploy.sh) -var Version string - -// Store character (unicode codepoint) offset (in bytes) this often. -const runeOffsetFrequency = 100 - -type postingsBuilder struct { - postings map[ngram][]byte - lastOffsets map[ngram]uint32 - - // To support UTF-8 searching, we must map back runes to byte - // offsets. As a first attempt, we sample regularly. The - // precise offset can be found by walking from the recorded - // offset to the desired rune. - runeOffsets []uint32 - runeCount uint32 - - isPlainASCII bool - - endRunes []uint32 - endByte uint32 -} - -func newPostingsBuilder() *postingsBuilder { - return &postingsBuilder{ - postings: map[ngram][]byte{}, - lastOffsets: map[ngram]uint32{}, - isPlainASCII: true, - } -} - -// Store trigram offsets for the given UTF-8 data. The -// DocumentSections must correspond to rune boundaries in the UTF-8 -// data. 
-func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { - dest := searchableString{ - data: data, - } - var buf [8]byte - var runeGram [3]rune - - var runeIndex uint32 - byteCount := 0 - dataSz := uint32(len(data)) - - byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) - for _, s := range byteSections { - byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) - } - var runeSectionBoundaries []uint32 - - endRune := s.runeCount - for ; len(data) > 0; runeIndex++ { - c, sz := utf8.DecodeRune(data) - if sz > 1 { - s.isPlainASCII = false - } - data = data[sz:] - - runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c - - if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { - s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) - } - for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { - runeSectionBoundaries = append(runeSectionBoundaries, - endRune+uint32(runeIndex)) - byteSectionBoundaries = byteSectionBoundaries[1:] - } - - byteCount += sz - - if runeIndex < 2 { - continue - } - - ng := runesToNGram(runeGram) - lastOff := s.lastOffsets[ng] - newOff := endRune + uint32(runeIndex) - 2 - - m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) - s.postings[ng] = append(s.postings[ng], buf[:m]...) - s.lastOffsets[ng] = newOff - } - s.runeCount += runeIndex - - for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { - return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) - } - - // Handle symbol definition that ends at file end. This can - // happen for labels at the end of .bat files. 
- - for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { - runeSectionBoundaries = append(runeSectionBoundaries, - endRune+runeIndex) - byteSectionBoundaries = byteSectionBoundaries[1:] - } - runeSecs := make([]DocumentSection, 0, len(byteSections)) - for i := 0; i < len(runeSectionBoundaries); i += 2 { - runeSecs = append(runeSecs, DocumentSection{ - Start: runeSectionBoundaries[i], - End: runeSectionBoundaries[i+1], - }) - } - - s.endRunes = append(s.endRunes, s.runeCount) - s.endByte += dataSz - return &dest, runeSecs, nil -} - -// IndexBuilder builds a single index shard. -type IndexBuilder struct { - contentStrings []*searchableString - nameStrings []*searchableString - docSections [][]DocumentSection - runeDocSections []DocumentSection - - checksums []byte - - branchMasks []uint64 - subRepos []uint32 - - contentPostings *postingsBuilder - namePostings *postingsBuilder - - // root repository - repo Repository - - // name to index. - subRepoIndices map[string]uint32 - - // language => language code - languageMap map[string]byte - - // languages codes - languages []byte -} - -func (d *Repository) verify() error { - for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} { - if _, err := template.New("").Parse(t); err != nil { - return err - } - } - return nil -} - -// ContentSize returns the number of content bytes so far ingested. -func (b *IndexBuilder) ContentSize() uint32 { - // Add the name too so we don't skip building index if we have - // lots of empty files. - return b.contentPostings.endByte + b.namePostings.endByte -} - -// NewIndexBuilder creates a fresh IndexBuilder. The passed in -// Repository contains repo metadata, and may be set to nil. 
-func NewIndexBuilder(r *Repository) (*IndexBuilder, error) { - b := &IndexBuilder{ - contentPostings: newPostingsBuilder(), - namePostings: newPostingsBuilder(), - languageMap: map[string]byte{}, - } - - if r == nil { - r = &Repository{} - } - if err := b.setRepository(r); err != nil { - return nil, err - } - return b, nil -} - -func (b *IndexBuilder) setRepository(desc *Repository) error { - if len(b.contentStrings) > 0 { - return fmt.Errorf("setRepository called after adding files") - } - if err := desc.verify(); err != nil { - return err - } - - if len(desc.Branches) > 64 { - return fmt.Errorf("too many branches") - } - - b.repo = *desc - - // copy subrepomap without root - b.repo.SubRepoMap = map[string]*Repository{} - for k, v := range desc.SubRepoMap { - if k != "" { - b.repo.SubRepoMap[k] = v - } - } - - b.populateSubRepoIndices() - return nil -} - -type DocumentSection struct { - Start, End uint32 -} - -// Document holds a document (file) to index. -type Document struct { - Name string - Content []byte - Branches []string - SubRepositoryPath string - Language string - - // If set, something is wrong with the file contents, and this - // is the reason it wasn't indexed. - SkipReason string - - // Document sections for symbols. Offsets should use bytes. - Symbols []DocumentSection -} - -type docSectionSlice []DocumentSection - -func (m docSectionSlice) Len() int { return len(m) } -func (m docSectionSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } -func (m docSectionSlice) Less(i, j int) bool { return m[i].Start < m[j].Start } - -// AddFile is a convenience wrapper for Add -func (b *IndexBuilder) AddFile(name string, content []byte) error { - return b.Add(Document{Name: name, Content: content}) -} - -// CheckText returns a reason why the given contents are probably not source texts. 
-func CheckText(content []byte, maxTrigramCount int) error { - if len(content) == 0 { - return nil - } - - if len(content) < ngramSize { - return fmt.Errorf("file size smaller than %d", ngramSize) - } - - trigrams := map[ngram]struct{}{} - - var cur [3]rune - byteCount := 0 - for len(content) > 0 { - if content[0] == 0 { - return fmt.Errorf("binary data at byte offset %d", byteCount) - } - - r, sz := utf8.DecodeRune(content) - content = content[sz:] - byteCount += sz - - cur[0], cur[1], cur[2] = cur[1], cur[2], r - if cur[0] == 0 { - // start of file. - continue - } - - trigrams[runesToNGram(cur)] = struct{}{} - if len(trigrams) > maxTrigramCount { - // probably not text. - return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) - } - } - return nil -} - -func (b *IndexBuilder) populateSubRepoIndices() { - if b.subRepoIndices != nil { - return - } - paths := []string{""} - for k := range b.repo.SubRepoMap { - paths = append(paths, k) - } - sort.Strings(paths) - b.subRepoIndices = make(map[string]uint32, len(paths)) - for i, p := range paths { - b.subRepoIndices[p] = uint32(i) - } -} - -const notIndexedMarker = "NOT-INDEXED: " - -// Add a file which only occurs in certain branches. 
-func (b *IndexBuilder) Add(doc Document) error { - hasher := crc64.New(crc64.MakeTable(crc64.ISO)) - - if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { - doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) - doc.Language = "binary" - } - - if doc.SkipReason != "" { - doc.Content = []byte(notIndexedMarker + doc.SkipReason) - doc.Symbols = nil - if doc.Language == "" { - doc.Language = "skipped" - } - } - - sort.Sort(docSectionSlice(doc.Symbols)) - var last DocumentSection - for i, s := range doc.Symbols { - if i > 0 { - if last.End > s.Start { - return fmt.Errorf("sections overlap") - } - } - last = s - } - if last.End > uint32(len(doc.Content)) { - return fmt.Errorf("section goes past end of content") - } - - if doc.SubRepositoryPath != "" { - rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) - if err != nil || rel == doc.Name { - return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) - } - } - docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) - if err != nil { - return err - } - nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) - if err != nil { - return err - } - - subRepoIdx, ok := b.subRepoIndices[doc.SubRepositoryPath] - if !ok { - return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) - } - - var mask uint64 - for _, br := range doc.Branches { - m := b.branchMask(br) - if m == 0 { - return fmt.Errorf("no branch found for %s", br) - } - mask |= m - } - - b.subRepos = append(b.subRepos, subRepoIdx) - - hasher.Write(doc.Content) - - b.contentStrings = append(b.contentStrings, docStr) - b.runeDocSections = append(b.runeDocSections, runeSecs...) - - b.nameStrings = append(b.nameStrings, nameStr) - b.docSections = append(b.docSections, doc.Symbols) - b.branchMasks = append(b.branchMasks, mask) - b.checksums = append(b.checksums, hasher.Sum(nil)...) 
- - langCode, ok := b.languageMap[doc.Language] - if !ok { - if len(b.languageMap) >= 255 { - return fmt.Errorf("too many languages") - } - langCode = byte(len(b.languageMap)) - b.languageMap[doc.Language] = langCode - } - b.languages = append(b.languages, langCode) - - return nil -} - -func (b *IndexBuilder) branchMask(br string) uint64 { - for i, b := range b.repo.Branches { - if b.Name == br { - return uint64(1) << uint(i) - } - } - return 0 -}
diff --git a/indexdata.go b/indexdata.go deleted file mode 100644 index 4262116..0000000 --- a/indexdata.go +++ /dev/null
@@ -1,275 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "fmt" - "hash/crc64" - "unicode/utf8" - - "github.com/google/zoekt/query" -) - -// indexData holds the pattern-independent data that we have to have -// in memory to search. Most of the memory is taken up by the ngram => -// offset index. -type indexData struct { - file IndexFile - - ngrams map[ngram]simpleSection - - newlinesStart uint32 - newlinesIndex []uint32 - - docSectionsStart uint32 - docSectionsIndex []uint32 - - runeDocSections []DocumentSection - - // rune offset=>byte offset mapping, relative to the start of the content corpus - runeOffsets []uint32 - - // offsets of file contents; includes end of last file - boundariesStart uint32 - boundaries []uint32 - - // rune offsets for the file content boundaries - fileEndRunes []uint32 - - fileNameContent []byte - fileNameIndex []uint32 - fileNameNgrams map[ngram][]uint32 - - // rune offset=>byte offset mapping, relative to the start of the filename corpus - fileNameRuneOffsets []uint32 - - // rune offsets for the file name boundaries - fileNameEndRunes []uint32 - - fileBranchMasks []uint64 - - // mask (power of 2) => name - branchNames map[uint]string - - // name => mask (power of 2) - branchIDs map[string]uint - - metaData IndexMetadata - repoMetaData Repository - - subRepos []uint32 - subRepoPaths []string - - // Checksums for all the files, at 8-byte 
intervals - checksums []byte - - // languages for all the files. - languages []byte - - // inverse of LanguageMap in metaData - languageMap map[byte]string - - repoListEntry RepoListEntry -} - -func (d *indexData) getChecksum(idx uint32) []byte { - start := crc64.Size * idx - return d.checksums[start : start+crc64.Size] -} - -func (d *indexData) calculateStats() { - var last uint32 - if len(d.boundaries) > 0 { - last += d.boundaries[len(d.boundaries)-1] - } - - lastFN := last - if len(d.fileNameIndex) > 0 { - lastFN = d.fileNameIndex[len(d.fileNameIndex)-1] - } - - stats := RepoStats{ - IndexBytes: int64(d.memoryUse()), - ContentBytes: int64(int(last) + int(lastFN)), - Documents: len(d.newlinesIndex) - 1, - Shards: 1, - } - d.repoListEntry = RepoListEntry{ - Repository: d.repoMetaData, - IndexMetadata: d.metaData, - Stats: stats, - } -} - -func (d *indexData) String() string { - return fmt.Sprintf("shard(%s)", d.file.Name()) -} - -func (d *indexData) memoryUse() int { - sz := 0 - for _, a := range [][]uint32{ - d.newlinesIndex, d.docSectionsIndex, - d.boundaries, d.fileNameIndex, - d.runeOffsets, d.fileNameRuneOffsets, - d.fileEndRunes, d.fileNameEndRunes, - } { - sz += 4 * len(a) - } - sz += 8 * len(d.runeDocSections) - sz += 8 * len(d.fileBranchMasks) - sz += 12 * len(d.ngrams) - for _, v := range d.fileNameNgrams { - sz += 4*len(v) + 4 - } - return sz -} - -const maxUInt32 = 0xffffffff - -func firstMinarg(xs []uint32) uint32 { - m := uint32(maxUInt32) - j := len(xs) - for i, x := range xs { - if x < m { - m = x - j = i - } - } - return uint32(j) -} - -func lastMinarg(xs []uint32) uint32 { - m := uint32(maxUInt32) - j := len(xs) - for i, x := range xs { - if x <= m { - m = x - j = i - } - } - return uint32(j) -} - -func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { - if filename { - return uint32(len(data.fileNameNgrams[ng])) - } - - return data.ngrams[ng].sz -} - -type ngramIterationResults struct { - matchIterator - - caseSensitive bool - 
fileName bool - substrBytes []byte - substrLowered []byte -} - -func (r *ngramIterationResults) String() string { - return fmt.Sprintf("wrapper(%v)", r.matchIterator) -} - -func (r *ngramIterationResults) candidates() []*candidateMatch { - cs := r.matchIterator.candidates() - for _, c := range cs { - c.caseSensitive = r.caseSensitive - c.fileName = r.fileName - c.substrBytes = r.substrBytes - c.substrLowered = r.substrLowered - } - return cs -} - -func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { - str := query.Pattern - - // Find the 2 least common ngrams from the string. - ngramOffs := splitNGrams([]byte(query.Pattern)) - frequencies := make([]uint32, 0, len(ngramOffs)) - for _, o := range ngramOffs { - var freq uint32 - if query.CaseSensitive { - freq = d.ngramFrequency(o.ngram, query.FileName) - } else { - for _, v := range generateCaseNgrams(o.ngram) { - freq += d.ngramFrequency(v, query.FileName) - } - } - - if freq == 0 { - return &ngramIterationResults{ - matchIterator: &noMatchTree{ - Why: "freq=0", - }, - }, nil - } - - frequencies = append(frequencies, freq) - } - firstI := firstMinarg(frequencies) - frequencies[firstI] = maxUInt32 - lastI := lastMinarg(frequencies) - if firstI > lastI { - lastI, firstI = firstI, lastI - } - - firstNG := ngramOffs[firstI].ngram - lastNG := ngramOffs[lastI].ngram - iter := &ngramDocIterator{ - leftPad: firstI, - rightPad: uint32(utf8.RuneCountInString(str)) - firstI, - } - if query.FileName { - iter.ends = d.fileNameEndRunes - } else { - iter.ends = d.fileEndRunes - } - - if firstI != lastI { - i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName) - if err != nil { - return nil, err - } - - iter.iter = i - } else { - hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName) - if err != nil { - return nil, err - } - iter.iter = hitIter - } - - patBytes := []byte(query.Pattern) - lowerPatBytes := 
toLower(patBytes) - - return &ngramIterationResults{ - matchIterator: iter, - caseSensitive: query.CaseSensitive, - fileName: query.FileName, - substrBytes: patBytes, - substrLowered: lowerPatBytes, - }, nil -} - -func (d *indexData) fileName(i uint32) []byte { - return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]] -} - -func (s *indexData) Close() { - s.file.Close() -}
diff --git a/indexfile_other.go b/indexfile_other.go deleted file mode 100644 index 1c03fe8..0000000 --- a/indexfile_other.go +++ /dev/null
@@ -1,61 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build !linux,!darwin - -package zoekt - -import ( - "fmt" - "os" -) - -// NewIndexFile returns a new index file. The index file takes -// ownership of the passed in file, and may close it. -func NewIndexFile(f *os.File) (IndexFile, error) { - return &indexFileFromOS{f}, nil -} - -type indexFileFromOS struct { - f *os.File -} - -func (f *indexFileFromOS) Read(off, sz uint32) ([]byte, error) { - r := make([]byte, sz) - _, err := f.f.ReadAt(r, int64(off)) - return r, err -} - -func (f indexFileFromOS) Size() (uint32, error) { - fi, err := f.f.Stat() - if err != nil { - return 0, err - } - - sz := fi.Size() - - if sz >= maxUInt32 { - return 0, fmt.Errorf("overflow") - } - - return uint32(sz), nil -} - -func (f indexFileFromOS) Close() { - f.f.Close() -} - -func (f indexFileFromOS) Name() string { - return f.f.Name() -}
diff --git a/indexfile_unix.go b/indexfile_unix.go deleted file mode 100644 index d7e9402..0000000 --- a/indexfile_unix.go +++ /dev/null
@@ -1,76 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build linux darwin - -package zoekt - -import ( - "fmt" - "os" - "syscall" -) - -type mmapedIndexFile struct { - name string - size uint32 - data []byte -} - -func (f *mmapedIndexFile) Read(off, sz uint32) ([]byte, error) { - if off+sz > uint32(len(f.data)) { - return nil, fmt.Errorf("out of bounds: %d, len %d", off+sz, len(f.data)) - } - return f.data[off : off+sz], nil -} - -func (f *mmapedIndexFile) Name() string { - return f.name -} - -func (f *mmapedIndexFile) Size() (uint32, error) { - return f.size, nil -} - -func (f *mmapedIndexFile) Close() { - syscall.Munmap(f.data) -} - -// NewIndexFile returns a new index file. The index file takes -// ownership of the passed in file, and may close it. -func NewIndexFile(f *os.File) (IndexFile, error) { - defer f.Close() - - fi, err := f.Stat() - if err != nil { - return nil, err - } - - sz := fi.Size() - if sz >= maxUInt32 { - return nil, fmt.Errorf("file %s too large: %d", f.Name(), sz) - } - r := &mmapedIndexFile{ - name: f.Name(), - size: uint32(sz), - } - - rounded := (r.size + 4095) &^ 4095 - r.data, err = syscall.Mmap(int(f.Fd()), 0, int(rounded), syscall.PROT_READ, syscall.MAP_SHARED) - if err != nil { - return nil, err - } - - return r, err -}
diff --git a/matchiter.go b/matchiter.go deleted file mode 100644 index c76e1b8..0000000 --- a/matchiter.go +++ /dev/null
@@ -1,280 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "bytes" - "fmt" - "sort" - "unicode/utf8" - - "github.com/google/zoekt/query" -) - -// candidateMatch is a candidate match for a substring. -type candidateMatch struct { - caseSensitive bool - fileName bool - - substrBytes []byte - substrLowered []byte - - file uint32 - - // Offsets are relative to the start of the filename or file contents. - runeOffset uint32 - byteOffset uint32 - byteMatchSz uint32 -} - -// Matches content against the substring, and populates byteMatchSz on success -func (m *candidateMatch) matchContent(content []byte) bool { - if m.caseSensitive { - comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))]) - - m.byteMatchSz = uint32(len(m.substrBytes)) - return comp - } else { - // It is tempting to try a simple ASCII based - // comparison if possible, but we need more - // information. Simple ASCII chars have unicode upper - // case variants (the ASCII 'k' has the Kelvin symbol - // as upper case variant). We can only degrade to - // ASCII if we are sure that both the corpus and the - // query is ASCII only - sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:]) - m.byteMatchSz = uint32(sz) - return ok - } -} - -// line returns the line holding the match. If the match starts with -// the newline ending line M, we return M. 
The line is characterized -// by its linenumber (base-1, byte index of line start, byte index of -// line end). The line end is the index of a newline, or the filesize -// (if matching the last line of the file.) -func (m *candidateMatch) line(newlines []uint32, fileSize uint32) (lineNum, lineStart, lineEnd int) { - idx := sort.Search(len(newlines), func(n int) bool { - return newlines[n] >= m.byteOffset - }) - - end := int(fileSize) - if idx < len(newlines) { - end = int(newlines[idx]) - } - - start := 0 - if idx > 0 { - start = int(newlines[idx-1] + 1) - } - - return idx + 1, start, end -} - -// matchIterator is a docIterator that produces candidateMatches for a given document -type matchIterator interface { - docIterator - - candidates() []*candidateMatch - updateStats(*Stats) -} - -// noMatchTree is both matchIterator and matchTree that matches nothing. -type noMatchTree struct { - Why string -} - -func (t *noMatchTree) String() string { - return fmt.Sprintf("not(%q)", t.Why) -} - -func (t *noMatchTree) candidates() []*candidateMatch { - return nil -} - -func (t *noMatchTree) nextDoc() uint32 { - return maxUInt32 -} - -func (t *noMatchTree) prepare(uint32) {} - -func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - return false, true -} - -func (t *noMatchTree) updateStats(*Stats) {} - -func (m *candidateMatch) String() string { - return fmt.Sprintf("%d:%d", m.file, m.runeOffset) -} - -type ngramDocIterator struct { - leftPad uint32 - rightPad uint32 - - iter hitIterator - ends []uint32 - - // mutable - fileIdx uint32 - matchCount int -} - -// nextFileIndex returns the smallest index j of ends such that -// ends[j] > offset, assuming ends[f] <= offset. 
-func nextFileIndex(offset, f uint32, ends []uint32) uint32 { - d := uint32(1) - for f < uint32(len(ends)) && ends[f] <= offset { - if f+d < uint32(len(ends)) && ends[f+d] <= offset { - f += d - d *= 2 - } else if d > 1 { - d = d/4 + 1 - } else { - f++ - } - } - return f -} - -func (i *ngramDocIterator) nextDoc() uint32 { - i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends) - if i.fileIdx >= uint32(len(i.ends)) { - return maxUInt32 - } - return i.fileIdx -} - -func (i *ngramDocIterator) String() string { - return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter) -} - -func (i *ngramDocIterator) prepare(nextDoc uint32) { - var start uint32 - if nextDoc > 0 { - start = i.ends[nextDoc-1] - } - if start > 0 { - i.iter.next(start + i.leftPad - 1) - } - i.fileIdx = nextDoc -} - -func (i *ngramDocIterator) updateStats(s *Stats) { - i.iter.updateStats(s) - s.NgramMatches += i.matchCount -} - -func (i *ngramDocIterator) candidates() []*candidateMatch { - if i.fileIdx >= uint32(len(i.ends)) { - return nil - } - - var fileStart uint32 - if i.fileIdx > 0 { - fileStart = i.ends[i.fileIdx-1] - } - fileEnd := i.ends[i.fileIdx] - - var candidates []*candidateMatch - for { - p1 := i.iter.first() - if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] { - break - } - i.iter.next(p1) - - if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd { - continue - } - - candidates = append(candidates, &candidateMatch{ - file: uint32(i.fileIdx), - runeOffset: p1 - fileStart - i.leftPad, - }) - } - i.matchCount += len(candidates) - return candidates -} - -type trimBySectionMatchIter struct { - matchIterator - - patternSize uint32 - fileEndRunes []uint32 - - // mutable - doc uint32 - sections []DocumentSection -} - -func (i *trimBySectionMatchIter) String() string { - return fmt.Sprintf("trimSection(sz=%d, %v)", i.patternSize, i.matchIterator) -} - -func (d *indexData) newTrimByDocSectionIter(q *query.Substring, iter matchIterator) *trimBySectionMatchIter { - return 
&trimBySectionMatchIter{ - matchIterator: iter, - patternSize: uint32(utf8.RuneCountInString(q.Pattern)), - fileEndRunes: d.fileEndRunes, - sections: d.runeDocSections, - } -} - -func (i *trimBySectionMatchIter) prepare(doc uint32) { - i.matchIterator.prepare(doc) - i.doc = doc - - var fileStart uint32 - if doc > 0 { - fileStart = i.fileEndRunes[doc-1] - } - - for len(i.sections) > 0 && i.sections[0].Start < fileStart { - i.sections = i.sections[1:] - } -} - -func (i *trimBySectionMatchIter) candidates() []*candidateMatch { - var fileStart uint32 - if i.doc > 0 { - fileStart = i.fileEndRunes[i.doc-1] - } - - ms := i.matchIterator.candidates() - trimmed := ms[:0] - for len(i.sections) > 0 && len(ms) > 0 { - start := fileStart + ms[0].runeOffset - end := start + i.patternSize - if start >= i.sections[0].End { - i.sections = i.sections[1:] - continue - } - - if start < i.sections[0].Start { - ms = ms[1:] - continue - } - - // here we have: sec.Start <= start < sec.End - if end <= i.sections[0].End { - // complete match falls inside section. - trimmed = append(trimmed, ms[0]) - } - - ms = ms[1:] - } - return trimmed -}
diff --git a/matchtree.go b/matchtree.go deleted file mode 100644 index 66d9372..0000000 --- a/matchtree.go +++ /dev/null
@@ -1,743 +0,0 @@ -// Copyright 2018 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "fmt" - "log" - "regexp" - "strings" - "unicode/utf8" - - "github.com/google/zoekt/query" -) - -// A docIterator iterates over documents in order. -type docIterator interface { - // provide the next document where we can may find something - // interesting. - nextDoc() uint32 - - // clears any per-document state of the docIterator, and - // prepares for evaluating the given doc. The argument is - // strictly increasing over time. - prepare(nextDoc uint32) -} - -const ( - costConst = 0 - costMemory = 1 - costContent = 2 - costRegexp = 3 -) - -const ( - costMin = costConst - costMax = costRegexp -) - -// An expression tree coupled with matches. The matchtree has two -// functions: -// -// * it implements boolean combinations (and, or, not) -// -// * it implements shortcuts, where we skip documents (for example: if -// there are no trigram matches, we can be sure there are no substring -// matches). The matchtree iterates over the documents as they are -// ordered in the shard. -// -// The general process for a given (shard, query) is -// -// - construct matchTree for the query -// -// - find all different leaf matchTrees (substring, regexp, etc.) 
-// -// in a loop: -// -// - find next doc to process using nextDoc -// -// - evaluate atoms (leaf expressions that match text) -// -// - evaluate the tree using matches(), storing the result in map. -// -// - if the complete tree returns (matches() == true) for the document, -// collect all text matches by looking at leaf matchTrees -// -type matchTree interface { - docIterator - - // returns whether this matches, and if we are sure. - matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool) -} - -// docMatchTree iterates over documents for which predicate(docID) returns true. -type docMatchTree struct { - // the number of documents in a shard. - numDocs uint32 - - predicate func(docID uint32) bool - - // provides additional information about the reason why the docMatchTree was - // created. - reason string - - // mutable - firstDone bool - docID uint32 -} - -type bruteForceMatchTree struct { - // mutable - firstDone bool - docID uint32 -} - -type andLineMatchTree struct { - andMatchTree -} - -type andMatchTree struct { - children []matchTree -} - -type orMatchTree struct { - children []matchTree -} - -type notMatchTree struct { - child matchTree -} - -// Don't visit this subtree for collecting matches. -type noVisitMatchTree struct { - matchTree -} - -type regexpMatchTree struct { - regexp *regexp.Regexp - - fileName bool - - // mutable - reEvaluated bool - found []*candidateMatch - - // nextDoc, prepare. 
- bruteForceMatchTree -} - -type substrMatchTree struct { - matchIterator - - query *query.Substring - caseSensitive bool - fileName bool - - // mutable - current []*candidateMatch - contEvaluated bool -} - -type branchQueryMatchTree struct { - fileMasks []uint64 - mask uint64 - - // mutable - firstDone bool - docID uint32 -} - -// all prepare methods - -func (t *bruteForceMatchTree) prepare(doc uint32) { - t.docID = doc - t.firstDone = true -} - -func (t *docMatchTree) prepare(doc uint32) { - t.docID = doc - t.firstDone = true -} - -func (t *andMatchTree) prepare(doc uint32) { - for _, c := range t.children { - c.prepare(doc) - } -} - -func (t *regexpMatchTree) prepare(doc uint32) { - t.found = t.found[:0] - t.reEvaluated = false - t.bruteForceMatchTree.prepare(doc) -} - -func (t *orMatchTree) prepare(doc uint32) { - for _, c := range t.children { - c.prepare(doc) - } -} - -func (t *notMatchTree) prepare(doc uint32) { - t.child.prepare(doc) -} - -func (t *substrMatchTree) prepare(nextDoc uint32) { - t.matchIterator.prepare(nextDoc) - t.current = t.matchIterator.candidates() - t.contEvaluated = false -} - -func (t *branchQueryMatchTree) prepare(doc uint32) { - t.firstDone = true - t.docID = doc -} - -// nextDoc - -func (t *docMatchTree) nextDoc() uint32 { - var start uint32 - if t.firstDone { - start = t.docID + 1 - } - for i := start; i < t.numDocs; i++ { - if t.predicate(i) { - return i - } - } - return maxUInt32 -} - -func (t *bruteForceMatchTree) nextDoc() uint32 { - if !t.firstDone { - return 0 - } - return t.docID + 1 -} - -func (t *andMatchTree) nextDoc() uint32 { - var max uint32 - for _, c := range t.children { - m := c.nextDoc() - if m > max { - max = m - } - } - return max -} - -func (t *orMatchTree) nextDoc() uint32 { - min := uint32(maxUInt32) - for _, c := range t.children { - m := c.nextDoc() - if m < min { - min = m - } - } - return min -} - -func (t *notMatchTree) nextDoc() uint32 { - return 0 -} - -func (t *branchQueryMatchTree) nextDoc() uint32 { 
- var start uint32 - if t.firstDone { - start = t.docID + 1 - } - - for i := start; i < uint32(len(t.fileMasks)); i++ { - if (t.mask & t.fileMasks[i]) != 0 { - return i - } - } - return maxUInt32 -} - -// all String methods - -func (t *bruteForceMatchTree) String() string { - return "all" -} - -func (t *docMatchTree) String() string { - return fmt.Sprintf("doc(%s)", t.reason) -} - -func (t *andMatchTree) String() string { - return fmt.Sprintf("and%v", t.children) -} - -func (t *regexpMatchTree) String() string { - return fmt.Sprintf("re(%s)", t.regexp) -} - -func (t *orMatchTree) String() string { - return fmt.Sprintf("or%v", t.children) -} - -func (t *notMatchTree) String() string { - return fmt.Sprintf("not(%v)", t.child) -} - -func (t *substrMatchTree) String() string { - f := "" - if t.fileName { - f = "f" - } - - return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) -} - -func (t *branchQueryMatchTree) String() string { - return fmt.Sprintf("branch(%x)", t.mask) -} - -// visitMatches visits all atoms in matchTree. Note: This visits -// noVisitMatchTree. For collecting matches use visitMatches. -func visitMatchTree(t matchTree, f func(matchTree)) { - switch s := t.(type) { - case *andMatchTree: - for _, ch := range s.children { - visitMatchTree(ch, f) - } - case *orMatchTree: - for _, ch := range s.children { - visitMatchTree(ch, f) - } - case *andLineMatchTree: - visitMatchTree(&s.andMatchTree, f) - case *noVisitMatchTree: - visitMatchTree(s.matchTree, f) - case *notMatchTree: - visitMatchTree(s.child, f) - default: - f(t) - } -} - -// visitMatches visits all atoms which can contribute matches. Note: This -// skips noVisitMatchTree. 
-func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { - switch s := t.(type) { - case *andMatchTree: - for _, ch := range s.children { - if known[ch] { - visitMatches(ch, known, f) - } - } - case *andLineMatchTree: - visitMatches(&s.andMatchTree, known, f) - case *orMatchTree: - for _, ch := range s.children { - if known[ch] { - visitMatches(ch, known, f) - } - } - case *notMatchTree: - case *noVisitMatchTree: - // don't collect into negative trees. - default: - f(s) - } -} - -// all matches() methods. - -func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - return t.predicate(cp.idx), true -} - -func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - return true, true -} - -// andLineMatchTree is a performance optimization of andMatchTree. For content -// searches we don't want to run the regex engine if there is no line that -// contains matches from all terms. 
-func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - matches, sure := t.andMatchTree.matches(cp, cost, known) - if !(sure && matches) { - return matches, sure - } - - // find child with fewest candidates - min := maxUInt32 - fewestChildren := 0 - for ix, child := range t.children { - v, ok := child.(*substrMatchTree) - // make sure we are running a content search and that all candidates are a - // substrMatchTree - if !ok || v.fileName { - return matches, sure - } - if len(v.current) < min { - min = len(v.current) - fewestChildren = ix - } - } - - type lineRange struct { - start int - end int - } - lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) - prev := -1 - for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { - line, byteStart, byteEnd := candidate.line(cp.newlines(), cp.fileSize) - if line == prev { - continue - } - prev = line - lines = append(lines, lineRange{byteStart, byteEnd}) - } - - // children keeps track of the children's candidates we have already seen. 
- children := make([][]*candidateMatch, 0, len(t.children)-1) - for j, child := range t.children { - if j == fewestChildren { - continue - } - children = append(children, child.(*substrMatchTree).current) - } - -nextLine: - for i := 0; i < len(lines); i++ { - hits := 1 - nextChild: - for j := range children { - nextCandidate: - for len(children[j]) > 0 { - candidate := children[j][0] - bo := int(cp.findOffset(false, candidate.runeOffset)) - if bo < lines[i].start { - children[j] = children[j][1:] - continue nextCandidate - } - if bo <= lines[i].end { - hits++ - continue nextChild - } - // move the `lines` iterator forward until bo <= line.end - for i < len(lines) && bo > lines[i].end { - i++ - } - i-- - continue nextLine - } - } - // return early once we found any line that contains matches from all children - if hits == len(t.children) { - return matches, true - } - } - return false, true -} - -func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - sure := true - - for _, ch := range t.children { - v, ok := evalMatchTree(cp, cost, known, ch) - if ok && !v { - return false, true - } - if !ok { - sure = false - } - } - - return true, sure -} - -func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - matches := false - sure := true - for _, ch := range t.children { - v, ok := evalMatchTree(cp, cost, known, ch) - if ok { - // we could short-circuit, but we want to use - // the other possibilities as a ranking - // signal. 
- matches = matches || v - } else { - sure = false - } - } - return matches, sure -} - -func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - return t.fileMasks[t.docID]&t.mask != 0, true -} - -func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - if t.reEvaluated { - return len(t.found) > 0, true - } - - if cost < costRegexp { - return false, false - } - - cp.stats.RegexpsConsidered++ - idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) - found := t.found[:0] - for _, idx := range idxs { - cm := &candidateMatch{ - byteOffset: uint32(idx[0]), - byteMatchSz: uint32(idx[1] - idx[0]), - fileName: t.fileName, - } - - found = append(found, cm) - } - t.found = found - t.reEvaluated = true - - return len(t.found) > 0, true -} - -// breakMatchesOnNewlines returns matches resulting from breaking each element -// of cms on newlines within text. -func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { - var lineCMs []*candidateMatch - for _, cm := range cms { - lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) - } - return lineCMs -} - -// breakOnNewlines returns matches resulting from breaking cm on newlines -// within text. 
-func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { - var cms []*candidateMatch - addMe := &candidateMatch{} - *addMe = *cm - for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { - if text[i] == '\n' { - addMe.byteMatchSz = i - addMe.byteOffset - if addMe.byteMatchSz != 0 { - cms = append(cms, addMe) - } - - addMe = &candidateMatch{} - *addMe = *cm - addMe.byteOffset = i + 1 - } - } - addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset - if addMe.byteMatchSz != 0 { - cms = append(cms, addMe) - } - return cms -} - -func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) { - if v, ok := known[mt]; ok { - return v, true - } - - v, ok := mt.matches(cp, cost, known) - if ok { - known[mt] = v - } - - return v, ok -} - -func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - v, ok := evalMatchTree(cp, cost, known, t.child) - return !v, ok -} - -func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - if t.contEvaluated { - return len(t.current) > 0, true - } - - if len(t.current) == 0 { - return false, true - } - - if t.fileName && cost < costMemory { - return false, false - } - - if !t.fileName && cost < costContent { - return false, false - } - - pruned := t.current[:0] - for _, m := range t.current { - if m.byteOffset == 0 && m.runeOffset > 0 { - m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) - } - if m.matchContent(cp.data(m.fileName)) { - pruned = append(pruned, m) - } - } - t.current = pruned - t.contEvaluated = true - - return len(t.current) > 0, true -} - -func (d *indexData) newMatchTree(q query.Q) (matchTree, error) { - if q == nil { - return nil, fmt.Errorf("got nil (sub)query") - } - switch s := q.(type) { - case *query.Regexp: - // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a - // superset of the regexp. 
If the returned matchTree is equivalent to the - // original regexp, it returns true. An equivalent matchTree has the same - // behaviour as the original regexp and can be used instead. - // - subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) - if err != nil { - return nil, err - } - // if the query can be used in place of the regexp - // return the subtree - if isEq { - return subMT, nil - } - - prefix := "" - if !s.CaseSensitive { - prefix = "(?i)" - } - - tr := &regexpMatchTree{ - regexp: regexp.MustCompile(prefix + s.Regexp.String()), - fileName: s.FileName, - } - - return &andMatchTree{ - children: []matchTree{ - tr, &noVisitMatchTree{subMT}, - }, - }, nil - case *query.And: - var r []matchTree - for _, ch := range s.Children { - ct, err := d.newMatchTree(ch) - if err != nil { - return nil, err - } - r = append(r, ct) - } - return &andMatchTree{r}, nil - case *query.Or: - var r []matchTree - for _, ch := range s.Children { - ct, err := d.newMatchTree(ch) - if err != nil { - return nil, err - } - r = append(r, ct) - } - return &orMatchTree{r}, nil - case *query.Not: - ct, err := d.newMatchTree(s.Child) - return &notMatchTree{ - child: ct, - }, err - - case *query.Substring: - return d.newSubstringMatchTree(s) - - case *query.Branch: - mask := uint64(0) - if s.Pattern == "HEAD" { - mask = 1 - } else { - for nm, m := range d.branchIDs { - if strings.Contains(nm, s.Pattern) { - mask |= uint64(m) - } - } - } - return &branchQueryMatchTree{ - mask: mask, - fileMasks: d.fileBranchMasks, - }, nil - case *query.Const: - if s.Value { - return &bruteForceMatchTree{}, nil - } else { - return &noMatchTree{"const"}, nil - } - case *query.Language: - code, ok := d.metaData.LanguageMap[s.Language] - if !ok { - return &noMatchTree{"lang"}, nil - } - return &docMatchTree{ - reason: "language", - numDocs: uint32(len(d.languages)), - predicate: func(docID uint32) bool { - return d.languages[docID] == code - }, - }, nil - - case
*query.Symbol: - mt, err := d.newSubstringMatchTree(s.Atom) - if err != nil { - return nil, err - } - - if _, ok := mt.(*regexpMatchTree); ok { - return nil, fmt.Errorf("regexps and short queries not implemented for symbol search") - } - subMT, ok := mt.(*substrMatchTree) - if !ok { - return nil, fmt.Errorf("found %T inside query.Symbol", mt) - } - - subMT.matchIterator = d.newTrimByDocSectionIter(s.Atom, subMT.matchIterator) - return subMT, nil - } - log.Panicf("type %T", q) - return nil, nil -} - -func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { - st := &substrMatchTree{ - query: s, - caseSensitive: s.CaseSensitive, - fileName: s.FileName, - } - - if utf8.RuneCountInString(s.Pattern) < ngramSize { - prefix := "" - if !s.CaseSensitive { - prefix = "(?i)" - } - t := &regexpMatchTree{ - regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), - fileName: s.FileName, - } - return t, nil - } - - result, err := d.iterateNgrams(s) - if err != nil { - return nil, err - } - st.matchIterator = result - return st, nil -}
diff --git a/matchtree_test.go b/matchtree_test.go deleted file mode 100644 index 07aa4a5..0000000 --- a/matchtree_test.go +++ /dev/null
@@ -1,191 +0,0 @@ -// Copyright 2018 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "reflect" - "testing" - - "github.com/google/zoekt/query" -) - -func Test_breakOnNewlines(t *testing.T) { - type args struct { - cm *candidateMatch - text []byte - } - tests := []struct { - name string - args args - want []*candidateMatch - }{ - { - name: "trivial case", - args: args{ - cm: &candidateMatch{ - byteOffset: 0, - byteMatchSz: 0, - }, - text: nil, - }, - want: nil, - }, - { - name: "no newlines", - args: args{ - cm: &candidateMatch{ - byteOffset: 0, - byteMatchSz: 1, - }, - text: []byte("a"), - }, - want: []*candidateMatch{ - { - byteOffset: 0, - byteMatchSz: 1, - }, - }, - }, - { - name: "newline at start", - args: args{ - cm: &candidateMatch{ - byteOffset: 0, - byteMatchSz: 2, - }, - text: []byte("\na"), - }, - want: []*candidateMatch{ - { - byteOffset: 1, - byteMatchSz: 1, - }, - }, - }, - { - name: "newline at end", - args: args{ - cm: &candidateMatch{ - byteOffset: 0, - byteMatchSz: 2, - }, - text: []byte("a\n"), - }, - want: []*candidateMatch{ - { - byteOffset: 0, - byteMatchSz: 1, - }, - }, - }, - { - name: "newline in middle", - args: args{ - cm: &candidateMatch{ - byteOffset: 0, - byteMatchSz: 3, - }, - text: []byte("a\nb"), - }, - want: []*candidateMatch{ - { - byteOffset: 0, - byteMatchSz: 1, - }, - { - byteOffset: 2, - byteMatchSz: 1, - }, - }, - }, - { - name: "two newlines", - args: 
args{ - cm: &candidateMatch{ - byteOffset: 0, - byteMatchSz: 5, - }, - text: []byte("a\nb\nc"), - }, - want: []*candidateMatch{ - { - byteOffset: 0, - byteMatchSz: 1, - }, - { - byteOffset: 2, - byteMatchSz: 1, - }, - { - byteOffset: 4, - byteMatchSz: 1, - }, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := breakOnNewlines(tt.args.cm, tt.args.text); !reflect.DeepEqual(got, tt.want) { - type PrintableCm struct { - byteOffset uint32 - byteMatchSz uint32 - } - var got2, want2 []PrintableCm - for _, g := range got { - got2 = append(got2, PrintableCm{byteOffset: g.byteOffset, byteMatchSz: g.byteMatchSz}) - } - for _, w := range tt.want { - want2 = append(want2, PrintableCm{byteOffset: w.byteOffset, byteMatchSz: w.byteMatchSz}) - } - t.Errorf("breakMatchOnNewlines() = %+v, want %+v", got2, want2) - } - }) - } -} - -func TestEquivalentQuerySkipRegexpTree(t *testing.T) { - tests := []struct { - query string - skip bool - }{ - {query: "^foo", skip: false}, - {query: "foo", skip: true}, - {query: "thread|needle|haystack", skip: true}, - {query: "contain(er|ing)", skip: false}, - {query: "thread (needle|haystack)", skip: true}, - {query: "thread (needle|)", skip: false}, - } - - for _, tt := range tests { - q, err := query.Parse(tt.query) - if err != nil { - t.Errorf("Error parsing query: %s", "sym:"+tt.query) - continue - } - - d := &indexData{} - mt, err := d.newMatchTree(q) - if err != nil { - t.Errorf("Error creating match tree from query: %s", q) - continue - } - - visitMatchTree(mt, func(m matchTree) { - if _, ok := m.(*regexpMatchTree); ok && tt.skip { - t.Errorf("Expected regexpMatchTree to be skipped for query: %s", q) - } - }) - } -}
diff --git a/query/bits.go b/query/bits.go deleted file mode 100644 index 8b199d2..0000000 --- a/query/bits.go +++ /dev/null
@@ -1,26 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package query - -func toLower(in []byte) []byte { - out := make([]byte, len(in)) - for i, c := range in { - if c >= 'A' && c <= 'Z' { - c = c - 'A' + 'a' - } - out[i] = c - } - return out -}
diff --git a/query/parse.go b/query/parse.go deleted file mode 100644 index ecbebf7..0000000 --- a/query/parse.go +++ /dev/null
@@ -1,474 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package query - -import ( - "bytes" - "fmt" - "log" - "regexp/syntax" -) - -var _ = log.Printf - -type SuggestQueryError struct { - Message string - Suggestion string -} - -func (e *SuggestQueryError) Error() string { - return fmt.Sprintf("%s. Suggestion: %s", e.Message, e.Suggestion) -} - -// parseStringLiteral parses a string literal, consumes the starting -// quote too. -func parseStringLiteral(in []byte) (lit []byte, n int, err error) { - left := in[1:] - found := false - -loop: - for len(left) > 0 { - c := left[0] - left = left[1:] - switch c { - case '"': - found = true - break loop - case '\\': - // TODO - other escape sequences. - if len(left) == 0 { - return nil, 0, fmt.Errorf("query: missing char after \\") - } - c = left[0] - left = left[1:] - - lit = append(lit, c) - default: - lit = append(lit, c) - } - } - if !found { - return nil, 0, fmt.Errorf("query: unterminated quoted string") - } - return lit, len(in) - len(left), nil -} - -// orOperator is a placeholder intermediate so we can represent [A, -// or, B] before we convert it to Or{A, B} -type orOperator struct{} - -func (o *orOperator) String() string { - return "orOp" -} - -func isSpace(c byte) bool { - return c == ' ' || c == '\t' -} - -// Parse parses a string into a query. 
-func Parse(qStr string) (Q, error) { - b := []byte(qStr) - - qs, _, err := parseExprList(b) - if err != nil { - return nil, err - } - - q, err := parseOperators(qs) - if err != nil { - return nil, err - } - - return Simplify(q), nil -} - -// parseExpr parses a single expression, returning the result, and the -// number of bytes consumed. -func parseExpr(in []byte) (Q, int, error) { - b := in[:] - var expr Q - for len(b) > 0 && isSpace(b[0]) { - b = b[1:] - } - - tok, err := nextToken(b) - if err != nil { - return nil, 0, err - } - if tok == nil { - return nil, 0, nil - } - b = b[len(tok.Input):] - - text := string(tok.Text) - switch tok.Type { - case tokCase: - switch text { - case "yes": - case "no": - case "auto": - default: - return nil, 0, fmt.Errorf("query: unknown case argument %q, want {yes,no,auto}", text) - } - expr = &caseQ{text} - case tokRepo: - expr = &Repo{Pattern: text} - case tokBranch: - expr = &Branch{Pattern: text} - case tokText, tokRegex: - q, err := regexpQuery(text, false, false) - if err != nil { - return nil, 0, err - } - expr = q - case tokFile: - q, err := regexpQuery(text, false, true) - if err != nil { - return nil, 0, err - } - expr = q - - case tokContent: - q, err := regexpQuery(text, true, false) - if err != nil { - return nil, 0, err - } - expr = q - case tokLang: - expr = &Language{Language: text} - - case tokSym: - if text == "" { - return nil, 0, fmt.Errorf("the sym: atom must have an argument") - } - expr = &Symbol{&Substring{Pattern: text}} - - case tokParenClose: - // Caller must consume paren. 
- expr = nil - - case tokParenOpen: - qs, n, err := parseExprList(b) - b = b[n:] - if err != nil { - return nil, 0, err - } - - pTok, err := nextToken(b) - if err != nil { - return nil, 0, err - } - if pTok == nil || pTok.Type != tokParenClose { - return nil, 0, fmt.Errorf("query: missing close paren, got token %v", pTok) - } - - b = b[len(pTok.Input):] - expr, err = parseOperators(qs) - if err != nil { - return nil, 0, err - } - case tokNegate: - subQ, n, err := parseExpr(b) - if err != nil { - return nil, 0, err - } - if subQ == nil { - return nil, 0, fmt.Errorf("query: '-' operator needs an argument") - } - b = b[n:] - expr = &Not{subQ} - - } - - return expr, len(in) - len(b), nil -} - -// regexpQuery parses an atom into either a regular expression, or a -// simple substring atom. -func regexpQuery(text string, content, file bool) (Q, error) { - var expr Q - - r, err := syntax.Parse(text, syntax.ClassNL|syntax.PerlX|syntax.UnicodeGroups) - if err != nil { - return nil, err - } - - if r.Op == syntax.OpLiteral { - expr = &Substring{ - Pattern: string(r.Rune), - FileName: file, - Content: content, - } - } else { - expr = &Regexp{ - Regexp: r, - FileName: file, - Content: content, - } - } - - return expr, nil -} - -// parseOperators interprets the orOperator in a list of queries. -func parseOperators(in []Q) (Q, error) { - top := &Or{} - cur := &And{} - - seenOr := false - for _, q := range in { - if _, ok := q.(*orOperator); ok { - seenOr = true - if len(cur.Children) == 0 { - return nil, fmt.Errorf("query: OR operator should have operand") - } - top.Children = append(top.Children, cur) - cur = &And{} - } else { - cur.Children = append(cur.Children, q) - } - } - - if seenOr && len(cur.Children) == 0 { - return nil, fmt.Errorf("query: OR operator should have operand") - } - top.Children = append(top.Children, cur) - return top, nil -} - -// parseExprList parses a list of query expressions. It is the -// workhorse of the Parse function. 
-func parseExprList(in []byte) ([]Q, int, error) { - b := in[:] - var qs []Q - for len(b) > 0 { - for len(b) > 0 && isSpace(b[0]) { - b = b[1:] - } - tok, _ := nextToken(b) - if tok != nil && tok.Type == tokParenClose { - break - } else if tok != nil && tok.Type == tokOr { - qs = append(qs, &orOperator{}) - b = b[len(tok.Input):] - continue - } - - q, n, err := parseExpr(b) - if err != nil { - return nil, 0, err - } - - if q == nil { - // eof or a ')' - break - } - qs = append(qs, q) - b = b[n:] - } - - setCase := "auto" - newQS := qs[:0] - for _, q := range qs { - if sc, ok := q.(*caseQ); ok { - setCase = sc.Flavor - } else { - newQS = append(newQS, q) - } - } - qs = mapQueryList(newQS, func(q Q) Q { - if sc, ok := q.(setCaser); ok { - sc.setCase(setCase) - } - return q - }) - return qs, len(in) - len(b), nil -} - -type token struct { - Type int - // The value of the token - Text []byte - - // The input that we consumed to form the token. - Input []byte -} - -func (t *token) String() string { - return fmt.Sprintf("%s:%q", tokNames[t.Type], t.Text) -} - -// token types. 
-const ( - tokText = 0 - tokFile = 1 - tokRepo = 2 - tokCase = 3 - tokBranch = 4 - tokParenOpen = 5 - tokParenClose = 6 - tokError = 7 - tokNegate = 8 - tokRegex = 9 - tokOr = 10 - tokContent = 11 - tokLang = 12 - tokSym = 13 -) - -var tokNames = map[int]string{ - tokBranch: "Branch", - tokCase: "Case", - tokError: "Error", - tokFile: "File", - tokNegate: "Negate", - tokOr: "Or", - tokParenClose: "ParenClose", - tokParenOpen: "ParenOpen", - tokRegex: "Regex", - tokRepo: "Repo", - tokText: "Text", - tokLang: "Language", - tokSym: "Symbol", -} - -var prefixes = map[string]int{ - "b:": tokBranch, - "branch:": tokBranch, - "c:": tokContent, - "case:": tokCase, - "content:": tokContent, - "f:": tokFile, - "file:": tokFile, - "r:": tokRepo, - "regex:": tokRegex, - "repo:": tokRepo, - "lang:": tokLang, - "sym:": tokSym, -} - -var reservedWords = map[string]int{ - "or": tokOr, -} - -func (t *token) setType() { - // After we consumed the input, we have to interpret some of the text, - // eg. to distinguish between ")" the text and ) the query grouping - // parenthesis. - if len(t.Text) == 1 && t.Text[0] == '(' { - t.Type = tokParenOpen - } - if len(t.Text) == 1 && t.Text[0] == ')' { - t.Type = tokParenClose - } - - for w, typ := range reservedWords { - if string(t.Text) == w && string(t.Input) == w { - t.Type = typ - break - } - } - - for pref, typ := range prefixes { - if !bytes.HasPrefix(t.Input, []byte(pref)) { - continue - } - - t.Text = t.Text[len(pref):] - t.Type = typ - break - } -} - -// nextToken returns the next token from the given input. 
-func nextToken(in []byte) (*token, error) { - left := in[:] - parenCount := 0 - var cur token - if len(left) == 0 { - return nil, nil - } - - if left[0] == '-' { - return &token{ - Type: tokNegate, - Text: []byte{'-'}, - Input: in[:1], - }, nil - } - - foundSpace := false - -loop: - for len(left) > 0 { - c := left[0] - switch c { - case '(': - parenCount++ - cur.Text = append(cur.Text, c) - left = left[1:] - case ')': - if parenCount == 0 { - if len(cur.Text) == 0 { - cur.Text = []byte{')'} - left = left[1:] - } - break loop - } - - cur.Text = append(cur.Text, c) - left = left[1:] - parenCount-- - - case '"': - t, n, err := parseStringLiteral(left) - if err != nil { - return nil, err - } - cur.Text = append(cur.Text, t...) - left = left[n:] - case '\\': - left = left[1:] - if len(left) == 0 { - return nil, fmt.Errorf("query: lone \\ at end") - } - c := left[0] - cur.Text = append(cur.Text, '\\', c) - left = left[1:] - - case ' ', '\n', '\t': - if parenCount > 0 { - foundSpace = true - } - break loop - default: - cur.Text = append(cur.Text, c) - left = left[1:] - } - } - - if len(cur.Text) == 0 { - return nil, nil - } - - if foundSpace && cur.Text[0] == '(' { - cur.Text = cur.Text[:1] - cur.Input = in[:1] - } else { - cur.Input = in[:len(in)-len(left)] - } - cur.setType() - return &cur, nil -}
diff --git a/query/parse_test.go b/query/parse_test.go deleted file mode 100644 index 8acd719..0000000 --- a/query/parse_test.go +++ /dev/null
@@ -1,164 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package query - -import ( - "log" - "reflect" - "regexp/syntax" - "testing" -) - -func mustParseRE(s string) *syntax.Regexp { - r, err := syntax.Parse(s, syntax.ClassNL|syntax.PerlX|syntax.UnicodeGroups) - if err != nil { - log.Panicf("parsing %q: %v", s, err) - } - return r -} - -func TestParseQuery(t *testing.T) { - type testcase struct { - in string - want Q - } - - for _, c := range []testcase{ - {`\bword\b`, &Regexp{Regexp: mustParseRE(`\bword\b`)}}, - {"fi\"le:bla\"", &Substring{Pattern: "file:bla"}}, - {"abc or def", NewOr(&Substring{Pattern: "abc"}, &Substring{Pattern: "def"})}, - {"(abc or def)", NewOr(&Substring{Pattern: "abc"}, &Substring{Pattern: "def"})}, - {"(ppp qqq or rrr sss)", NewOr( - NewAnd(&Substring{Pattern: "ppp"}, &Substring{Pattern: "qqq"}), - NewAnd(&Substring{Pattern: "rrr"}, &Substring{Pattern: "sss"}))}, - {"((x) ora b(z(d)))", NewAnd( - &Regexp{Regexp: mustParseRE("(x)")}, - &Substring{Pattern: "ora"}, - &Regexp{Regexp: mustParseRE("b(z(d))")})}, - {"( )", &Const{Value: true}}, - {"(abc)(de)", &Regexp{Regexp: mustParseRE("(abc)(de)")}}, - {"sub-pixel", &Substring{Pattern: "sub-pixel"}}, - {"abc", &Substring{Pattern: "abc"}}, - {"ABC", &Substring{Pattern: "ABC", CaseSensitive: true}}, - {"\"abc bcd\"", &Substring{Pattern: "abc bcd"}}, - {"abc bcd", NewAnd( - &Substring{Pattern: "abc"}, - &Substring{Pattern: 
"bcd"})}, - {"f:fs", &Substring{Pattern: "fs", FileName: true}}, - {"fs", &Substring{Pattern: "fs"}}, - {"-abc", &Not{&Substring{Pattern: "abc"}}}, - {"abccase:yes", &Substring{Pattern: "abccase:yes"}}, - {"file:abc", &Substring{Pattern: "abc", FileName: true}}, - {"branch:pqr", &Branch{Pattern: "pqr"}}, - {"((x) )", &Regexp{Regexp: mustParseRE("(x)")}}, - {"file:helpers\\.go byte", NewAnd( - &Substring{Pattern: "helpers.go", FileName: true}, - &Substring{Pattern: "byte"})}, - {"(abc def)", NewAnd( - &Substring{Pattern: "abc"}, - &Substring{Pattern: "def"})}, - {"(abc def", nil}, - {"regex:abc[p-q]", &Regexp{Regexp: mustParseRE("abc[p-q]")}}, - {"aBc[p-q]", &Regexp{Regexp: mustParseRE("aBc[p-q]"), CaseSensitive: true}}, - {"aBc[p-q] case:auto", &Regexp{Regexp: mustParseRE("aBc[p-q]"), CaseSensitive: true}}, - {"repo:go", &Repo{"go"}}, - - {"file:\"\"", &Const{true}}, - {"abc.*def", &Regexp{Regexp: mustParseRE("abc.*def")}}, - {"abc\\.\\*def", &Substring{Pattern: "abc.*def"}}, - {"(abc)", &Regexp{Regexp: mustParseRE("(abc)")}}, - - {"c:abc", &Substring{Pattern: "abc", Content: true}}, - {"content:abc", &Substring{Pattern: "abc", Content: true}}, - - {"lang:c++", &Language{"c++"}}, - {"sym:pqr", &Symbol{&Substring{Pattern: "pqr"}}}, - {"sym:Pqr", &Symbol{&Substring{Pattern: "Pqr", CaseSensitive: true}}}, - - // case - {"abc case:yes", &Substring{Pattern: "abc", CaseSensitive: true}}, - {"abc case:auto", &Substring{Pattern: "abc", CaseSensitive: false}}, - {"ABC case:auto", &Substring{Pattern: "ABC", CaseSensitive: true}}, - {"ABC case:\"auto\"", &Substring{Pattern: "ABC", CaseSensitive: true}}, - {"abc -f:def case:yes", NewAnd( - &Substring{Pattern: "abc", CaseSensitive: true}, - &Not{Child: &Substring{Pattern: "def", FileName: true, CaseSensitive: true}}, - )}, - - // errors. 
- {"--", nil}, - {"\"abc", nil}, - {"\"a\\", nil}, - {"case:foo", nil}, - - {"sym:", nil}, - {"abc or", nil}, - {"or abc", nil}, - {"def or or abc", nil}, - - {"", &Const{Value: true}}, - } { - got, err := Parse(c.in) - if (c.want == nil) != (err != nil) { - t.Errorf("Parse(%q): error %v, want %v", c.in, err, c.want) - } else if got != nil { - if !reflect.DeepEqual(got, c.want) { - t.Errorf("Parse(%s): got %v want %v", c.in, got, c.want) - } - } - } -} - -func TestTokenize(t *testing.T) { - type testcase struct { - in string - typ int - text string - } - - cases := []testcase{ - {"file:bla", tokFile, "bla"}, - {"file:bla ", tokFile, "bla"}, - {"f:bla ", tokFile, "bla"}, - {"(abc def) ", tokParenOpen, "("}, - {"(abcdef)", tokText, "(abcdef)"}, - {"(abc)(de)", tokText, "(abc)(de)"}, - {"(ab(c)def) ", tokText, "(ab(c)def)"}, - {"(ab\\ def) ", tokText, "(ab\\ def)"}, - {") ", tokParenClose, ")"}, - {"a(bc))", tokText, "a(bc)"}, - {"abc) ", tokText, "abc"}, - {"file:\"bla\"", tokFile, "bla"}, - {"\"file:bla\"", tokText, "file:bla"}, - {"\\", tokError, ""}, - {"o\"r\" bla", tokText, "or"}, - {"or bla", tokOr, "or"}, - {"ar bla", tokText, "ar"}, - } - for _, c := range cases { - tok, err := nextToken([]byte(c.in)) - if err != nil { - tok = &token{Type: tokError} - } - if tok.Type != c.typ { - t.Errorf("%s: got type %d, want %d", c.in, tok.Type, c.typ) - continue - } - - if string(tok.Text) != c.text { - t.Errorf("%s: got text %q, want %q", c.in, tok.Text, c.text) - } - } -}
diff --git a/query/query.go b/query/query.go deleted file mode 100644 index 31741a0..0000000 --- a/query/query.go +++ /dev/null
@@ -1,394 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package query - -import ( - "fmt" - "log" - "reflect" - "regexp/syntax" - "strings" -) - -var _ = log.Println - -// Q is a representation for a possibly hierarchical search query. -type Q interface { - String() string -} - -// RegexpQuery is a query looking for regular expressions matches. -type Regexp struct { - Regexp *syntax.Regexp - FileName bool - Content bool - CaseSensitive bool -} - -// Symbol finds a string that is a symbol. -type Symbol struct { - Atom *Substring -} - -func (s *Symbol) String() string { - return fmt.Sprintf("sym:%s", s.Atom) -} - -func (q *Regexp) String() string { - pref := "" - if q.FileName { - pref = "file_" - } - if q.CaseSensitive { - pref = "case_" + pref - } - return fmt.Sprintf("%sregex:%q", pref, q.Regexp.String()) -} - -type caseQ struct { - Flavor string -} - -func (c *caseQ) String() string { - return "case:" + c.Flavor -} - -type Language struct { - Language string -} - -func (l *Language) String() string { - return "lang:" + l.Language -} - -type Const struct { - Value bool -} - -func (q *Const) String() string { - if q.Value { - return "TRUE" - } - return "FALSE" -} - -type Repo struct { - Pattern string -} - -func (q *Repo) String() string { - return fmt.Sprintf("repo:%s", q.Pattern) -} - -// Substring is the most basic query: a query for a substring. 
-type Substring struct { - Pattern string - CaseSensitive bool - - // Match only filename - FileName bool - - // Match only content - Content bool -} - -func (q *Substring) String() string { - s := "" - - t := "" - if q.FileName { - t = "file_" - } else if q.Content { - t = "content_" - } - - s += fmt.Sprintf("%ssubstr:%q", t, q.Pattern) - if q.CaseSensitive { - s = "case_" + s - } - return s -} - -type setCaser interface { - setCase(string) -} - -func (q *Substring) setCase(k string) { - switch k { - case "yes": - q.CaseSensitive = true - case "no": - q.CaseSensitive = false - case "auto": - // TODO - unicode - q.CaseSensitive = (q.Pattern != string(toLower([]byte(q.Pattern)))) - } -} - -func (q *Symbol) setCase(k string) { - q.Atom.setCase(k) -} - -func (q *Regexp) setCase(k string) { - switch k { - case "yes": - q.CaseSensitive = true - case "no": - q.CaseSensitive = false - case "auto": - q.CaseSensitive = (q.Regexp.String() != LowerRegexp(q.Regexp).String()) - } -} - -// Or is matched when any of its children is matched. -type Or struct { - Children []Q -} - -func (q *Or) String() string { - var sub []string - for _, ch := range q.Children { - sub = append(sub, ch.String()) - } - return fmt.Sprintf("(or %s)", strings.Join(sub, " ")) -} - -// Not inverts the meaning of its child. -type Not struct { - Child Q -} - -func (q *Not) String() string { - return fmt.Sprintf("(not %s)", q.Child) -} - -// And is matched when all its children are. -type And struct { - Children []Q -} - -func (q *And) String() string { - var sub []string - for _, ch := range q.Children { - sub = append(sub, ch.String()) - } - return fmt.Sprintf("(and %s)", strings.Join(sub, " ")) -} - -// NewAnd is syntactic sugar for constructing And queries. -func NewAnd(qs ...Q) Q { - return &And{Children: qs} -} - -// NewOr is syntactic sugar for constructing Or queries. -func NewOr(qs ...Q) Q { - return &Or{Children: qs} -} - -// Branch limits search to a specific branch. 
-type Branch struct { - Pattern string -} - -func (q *Branch) String() string { - return fmt.Sprintf("branch:%q", q.Pattern) -} - -func queryChildren(q Q) []Q { - switch s := q.(type) { - case *And: - return s.Children - case *Or: - return s.Children - } - return nil -} - -func flattenAndOr(children []Q, typ Q) ([]Q, bool) { - var flat []Q - changed := false - for _, ch := range children { - ch, subChanged := flatten(ch) - changed = changed || subChanged - if reflect.TypeOf(ch) == reflect.TypeOf(typ) { - changed = true - subChildren := queryChildren(ch) - if subChildren != nil { - flat = append(flat, subChildren...) - } - } else { - flat = append(flat, ch) - } - } - - return flat, changed -} - -// (and (and x y) z) => (and x y z) , the same for "or" -func flatten(q Q) (Q, bool) { - switch s := q.(type) { - case *And: - if len(s.Children) == 1 { - return s.Children[0], true - } - flatChildren, changed := flattenAndOr(s.Children, s) - return &And{flatChildren}, changed - case *Or: - if len(s.Children) == 1 { - return s.Children[0], true - } - flatChildren, changed := flattenAndOr(s.Children, s) - return &Or{flatChildren}, changed - case *Not: - child, changed := flatten(s.Child) - return &Not{child}, changed - default: - return q, false - } -} - -func mapQueryList(qs []Q, f func(Q) Q) []Q { - neg := make([]Q, len(qs)) - for i, sub := range qs { - neg[i] = Map(sub, f) - } - return neg -} - -func invertConst(q Q) Q { - c, ok := q.(*Const) - if ok { - return &Const{!c.Value} - } - return q -} - -func evalAndOrConstants(q Q, children []Q) Q { - _, isAnd := q.(*And) - - children = mapQueryList(children, evalConstants) - - newCH := children[:0] - for _, ch := range children { - c, ok := ch.(*Const) - if ok { - if c.Value == isAnd { - continue - } else { - return ch - } - } - newCH = append(newCH, ch) - } - if len(newCH) == 0 { - return &Const{isAnd} - } - if isAnd { - return &And{newCH} - } - return &Or{newCH} -} - -func evalConstants(q Q) Q { - switch s := q.(type) { - 
case *And: - return evalAndOrConstants(q, s.Children) - case *Or: - return evalAndOrConstants(q, s.Children) - case *Not: - ch := evalConstants(s.Child) - if _, ok := ch.(*Const); ok { - return invertConst(ch) - } - return &Not{ch} - case *Substring: - if len(s.Pattern) == 0 { - return &Const{true} - } - case *Regexp: - if s.Regexp.Op == syntax.OpEmptyMatch { - return &Const{true} - } - case *Branch: - if s.Pattern == "" { - return &Const{true} - } - } - return q -} - -func Simplify(q Q) Q { - q = evalConstants(q) - for { - var changed bool - q, changed = flatten(q) - if !changed { - break - } - } - - return q -} - -// Map runs f over the q. -func Map(q Q, f func(q Q) Q) Q { - switch s := q.(type) { - case *And: - q = &And{Children: mapQueryList(s.Children, f)} - case *Or: - q = &Or{Children: mapQueryList(s.Children, f)} - case *Not: - q = &Not{Child: Map(s.Child, f)} - } - return f(q) -} - -// Expand expands Substr queries into (OR file_substr content_substr) -// queries, and the same for Regexp queries.. -func ExpandFileContent(q Q) Q { - switch s := q.(type) { - case *Substring: - if !s.FileName && !s.Content { - f := *s - f.FileName = true - c := *s - c.Content = true - return NewOr(&f, &c) - } - case *Regexp: - if !s.FileName && !s.Content { - f := *s - f.FileName = true - c := *s - c.Content = true - return NewOr(&f, &c) - } - } - return q -} - -// VisitAtoms runs `v` on all atom queries within `q`. -func VisitAtoms(q Q, v func(q Q)) { - Map(q, func(iQ Q) Q { - switch iQ.(type) { - case *And: - case *Or: - case *Not: - default: - v(iQ) - } - return iQ - }) -}
diff --git a/query/query_test.go b/query/query_test.go deleted file mode 100644 index 0d85b29..0000000 --- a/query/query_test.go +++ /dev/null
@@ -1,111 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package query - -import ( - "log" - "reflect" - "testing" -) - -var _ = log.Println - -func TestQueryString(t *testing.T) { - q := &Or{[]Q{ - &And{[]Q{ - &Substring{Pattern: "hoi"}, - &Not{&Substring{Pattern: "hai"}}, - }}, - }} - got := q.String() - want := `(or (and substr:"hoi" (not substr:"hai")))` - - if got != want { - t.Errorf("got %s, want %s", got, want) - } -} - -func TestSimplify(t *testing.T) { - type testcase struct { - in Q - want Q - } - - cases := []testcase{ - { - in: NewOr( - NewOr( - NewAnd(&Substring{Pattern: "hoi"}, - &Not{&Substring{Pattern: "hai"}}), - NewOr( - &Substring{Pattern: "zip"}, - &Substring{Pattern: "zap"}, - ))), - want: NewOr( - NewAnd( - &Substring{Pattern: "hoi"}, - &Not{&Substring{Pattern: "hai"}}), - &Substring{Pattern: "zip"}, - &Substring{Pattern: "zap"}), - }, - {in: &And{}, want: &Const{true}}, - {in: &Or{}, want: &Const{false}}, - {in: NewAnd(&Const{true}, &Const{false}), want: &Const{false}}, - {in: NewOr(&Const{false}, &Const{true}), want: &Const{true}}, - {in: &Not{&Const{true}}, want: &Const{false}}, - { - in: NewAnd( - &Substring{Pattern: "byte"}, - &Not{NewAnd(&Substring{Pattern: "byte"})}), - want: NewAnd( - &Substring{Pattern: "byte"}, - &Not{&Substring{Pattern: "byte"}}), - }, - } - - for _, c := range cases { - got := Simplify(c.in) - if !reflect.DeepEqual(got, c.want) { - 
t.Errorf("got %s, want %s", got, c.want) - } - } -} - -func TestMap(t *testing.T) { - in := NewAnd(&Substring{Pattern: "bla"}, &Not{&Repo{"foo"}}) - out := NewAnd(&Substring{Pattern: "bla"}, &Not{&Const{false}}) - - f := func(q Q) Q { - if _, ok := q.(*Repo); ok { - return &Const{false} - } - return q - } - got := Map(in, f) - if !reflect.DeepEqual(got, out) { - t.Errorf("got %v, want %v", got, out) - } -} - -func TestVisitAtoms(t *testing.T) { - in := NewAnd(&Substring{}, &Repo{}, &Not{&Const{}}) - count := 0 - VisitAtoms(in, func(q Q) { - count++ - }) - if count != 3 { - t.Errorf("got %d, want 3", count) - } -}
diff --git a/query/regexp.go b/query/regexp.go deleted file mode 100644 index 64c9def..0000000 --- a/query/regexp.go +++ /dev/null
@@ -1,44 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package query - -import ( - "log" - "regexp/syntax" -) - -var _ = log.Println - -func LowerRegexp(r *syntax.Regexp) *syntax.Regexp { - newRE := *r - switch r.Op { - case syntax.OpLiteral, syntax.OpCharClass: - newRE.Rune = make([]rune, len(r.Rune)) - for i, c := range r.Rune { - if c >= 'A' && c <= 'Z' { - newRE.Rune[i] = c + 'a' - 'A' - } else { - newRE.Rune[i] = c - } - } - default: - newRE.Sub = make([]*syntax.Regexp, len(newRE.Sub)) - for i, s := range r.Sub { - newRE.Sub[i] = LowerRegexp(s) - } - } - - return &newRE -}
diff --git a/query/regexp_test.go b/query/regexp_test.go deleted file mode 100644 index a3a12a0..0000000 --- a/query/regexp_test.go +++ /dev/null
@@ -1,67 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package query - -import ( - "regexp/syntax" - "strings" - "testing" -) - -var opnames = map[syntax.Op]string{ - syntax.OpNoMatch: "OpNoMatch", - syntax.OpEmptyMatch: "OpEmptyMatch", - syntax.OpLiteral: "OpLiteral", - syntax.OpCharClass: "OpCharClass", - syntax.OpAnyCharNotNL: "OpAnyCharNotNL", - syntax.OpAnyChar: "OpAnyChar", - syntax.OpBeginLine: "OpBeginLine", - syntax.OpEndLine: "OpEndLine", - syntax.OpBeginText: "OpBeginText", - syntax.OpEndText: "OpEndText", - syntax.OpWordBoundary: "OpWordBoundary", - syntax.OpNoWordBoundary: "OpNoWordBoundary", - syntax.OpCapture: "OpCapture", - syntax.OpStar: "OpStar", - syntax.OpPlus: "OpPlus", - syntax.OpQuest: "OpQuest", - syntax.OpRepeat: "OpRepeat", - syntax.OpConcat: "OpConcat", - syntax.OpAlternate: "OpAlternate", -} - -func printRegexp(t *testing.T, r *syntax.Regexp, lvl int) { - t.Logf("%s%s ch: %d", strings.Repeat(" ", lvl), opnames[r.Op], len(r.Sub)) - for _, s := range r.Sub { - printRegexp(t, s, lvl+1) - } -} - -func TestLowerRegexp(t *testing.T) { - in := "[a-zA-Z]fooBAR" - re := mustParseRE(in) - in = re.String() - got := LowerRegexp(re) - want := "[a-za-z]foobar" - if got.String() != want { - printRegexp(t, re, 0) - printRegexp(t, got, 0) - t.Errorf("got %s, want %s", got, want) - } - - if re.String() != in { - t.Errorf("got mutated original %s want %s", re.String(), in) - } -}
diff --git a/read.go b/read.go deleted file mode 100644 index 8351a23..0000000 --- a/read.go +++ /dev/null
@@ -1,484 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "encoding/binary" - "encoding/json" - "fmt" - "log" - "sort" -) - -// IndexFile is a file suitable for concurrent read access. For performance -// reasons, it allows a mmap'd implementation. -type IndexFile interface { - Read(off uint32, sz uint32) ([]byte, error) - Size() (uint32, error) - Close() - Name() string -} - -// reader is a stateful file -type reader struct { - r IndexFile - off uint32 -} - -func (r *reader) seek(off uint32) { - r.off = off -} - -func (r *reader) U32() (uint32, error) { - b, err := r.r.Read(r.off, 4) - r.off += 4 - if err != nil { - return 0, err - } - return binary.BigEndian.Uint32(b), nil -} - -func (r *reader) U64() (uint64, error) { - b, err := r.r.Read(r.off, 8) - r.off += 8 - if err != nil { - return 0, err - } - return binary.BigEndian.Uint64(b), nil -} - -func (r *reader) ReadByte() (byte, error) { - b, err := r.r.Read(r.off, 1) - r.off += 1 - if err != nil { - return 0, err - } - return b[0], nil -} - -func (r *reader) Varint() (uint64, error) { - v, err := binary.ReadUvarint(r) - if err != nil { - return 0, err - } - return v, nil -} - -func (r *reader) Str() (string, error) { - slen, err := r.Varint() - if err != nil { - return "", err - } - b, err := r.r.Read(r.off, uint32(slen)) - if err != nil { - return "", err - } - r.off += uint32(slen) - return string(b), nil -} - 
-func (r *reader) readTOC(toc *indexTOC) error { - sz, err := r.r.Size() - if err != nil { - return err - } - r.off = sz - 8 - - var tocSection simpleSection - if err := tocSection.read(r); err != nil { - return err - } - - r.seek(tocSection.off) - - sectionCount, err := r.U32() - if err != nil { - return err - } - - if sectionCount == 0 { - // tagged sections are indicated by a 0 sectionCount, - // and then a list of string-tagged type-indicated sections. - secs := toc.sectionsTagged() - for r.off < tocSection.off+tocSection.sz { - tag, err := r.Str() - if err != nil { - return err - } - kind, err := r.Varint() - if err != nil { - return err - } - sec := secs[tag] - if sec != nil && sec.kind() == sectionKind(kind) { - // happy path - if err := sec.read(r); err != nil { - return err - } - continue - } - // error case: skip over unknown section - if sec == nil { - log.Printf("file %s TOC has unknown section %q", r.r.Name(), tag) - } else { - return fmt.Errorf("file %s TOC section %q expects kind %d, got kind %d", r.r.Name(), tag, - kind, sec.kind()) - } - if kind == 0 { - (&simpleSection{}).read(r) - } else if kind == 1 { - (&compoundSection{}).read(r) - } - } - } else { - // TODO: Remove this branch when ReaderMinFeatureVersion >= 10 - - secs := toc.sections() - - if len(secs) != int(sectionCount) { - return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs)) - } - - for _, s := range secs { - if err := s.read(r); err != nil { - return err - } - } - } - return nil -} - -func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) { - return r.file.Read(sec.off, sec.sz) -} - -func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) { - if sec.sz%4 != 0 { - return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz) - } - blob, err := f.Read(sec.off, sec.sz) - if err != nil { - return nil, err - } - arr := make([]uint32, 0, len(blob)/4) - for len(blob) > 0 { - arr = append(arr, binary.BigEndian.Uint32(blob)) 
- blob = blob[4:] - } - return arr, nil -} - -func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) { - if sec.sz%8 != 0 { - return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz) - } - blob, err := f.Read(sec.off, sec.sz) - if err != nil { - return nil, err - } - arr := make([]uint64, 0, len(blob)/8) - for len(blob) > 0 { - arr = append(arr, binary.BigEndian.Uint64(blob)) - blob = blob[8:] - } - return arr, nil -} - -func (r *reader) readJSON(data interface{}, sec *simpleSection) error { - blob, err := r.r.Read(sec.off, sec.sz) - if err != nil { - return err - } - - return json.Unmarshal(blob, data) -} - -func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { - d := indexData{ - file: r.r, - ngrams: map[ngram]simpleSection{}, - fileNameNgrams: map[ngram][]uint32{}, - branchIDs: map[string]uint{}, - branchNames: map[uint]string{}, - } - - blob, err := d.readSectionBlob(toc.metaData) - if err != nil { - return nil, err - } - - if err := json.Unmarshal(blob, &d.metaData); err != nil { - return nil, err - } - - if d.metaData.IndexFormatVersion != IndexFormatVersion { - return nil, fmt.Errorf("file is v%d, want v%d", d.metaData.IndexFormatVersion, IndexFormatVersion) - } - - if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion { - return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion) - } - - if d.metaData.IndexMinReaderVersion > FeatureVersion { - return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion) - } - - blob, err = d.readSectionBlob(toc.repoMetaData) - if err != nil { - return nil, err - } - if err := json.Unmarshal(blob, &d.repoMetaData); err != nil { - return nil, err - } - - d.boundariesStart = toc.fileContents.data.off - d.boundaries = toc.fileContents.relativeIndex() - d.newlinesStart = toc.newlines.data.off - d.newlinesIndex = 
toc.newlines.relativeIndex() - d.docSectionsStart = toc.fileSections.data.off - d.docSectionsIndex = toc.fileSections.relativeIndex() - - d.checksums, err = d.readSectionBlob(toc.contentChecksums) - if err != nil { - return nil, err - } - - d.languages, err = d.readSectionBlob(toc.languages) - if err != nil { - return nil, err - } - - d.ngrams, err = d.readNgrams(toc) - if err != nil { - return nil, err - } - - d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) - if err != nil { - return nil, err - } - - d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data) - if err != nil { - return nil, err - } - - d.fileNameIndex = toc.fileNames.relativeIndex() - - d.fileNameNgrams, err = d.readFileNameNgrams(toc) - if err != nil { - return nil, err - } - - for j, br := range d.repoMetaData.Branches { - id := uint(1) << uint(j) - d.branchIDs[br.Name] = id - d.branchNames[id] = br.Name - } - - blob, err = d.readSectionBlob(toc.runeDocSections) - if err != nil { - return nil, err - } - d.runeDocSections = unmarshalDocSections(blob, nil) - - for sect, dest := range map[simpleSection]*[]uint32{ - toc.subRepos: &d.subRepos, - toc.runeOffsets: &d.runeOffsets, - toc.nameRuneOffsets: &d.fileNameRuneOffsets, - toc.nameEndRunes: &d.fileNameEndRunes, - toc.fileEndRunes: &d.fileEndRunes, - } { - if blob, err := d.readSectionBlob(sect); err != nil { - return nil, err - } else { - *dest = fromSizedDeltas(blob, nil) - } - } - - keys := []string{""} - for k := range d.repoMetaData.SubRepoMap { - if k != "" { // we used to marshal "" in SubRepoMap. Prevent adding twice. 
- keys = append(keys, k) - } - } - sort.Strings(keys) - d.subRepoPaths = keys - - d.languageMap = map[byte]string{} - for k, v := range d.metaData.LanguageMap { - d.languageMap[v] = k - } - - if err := d.verify(); err != nil { - return nil, err - } - - d.calculateStats() - return &d, nil -} - -const ngramEncoding = 8 - -func (d *indexData) readNgrams(toc *indexTOC) (map[ngram]simpleSection, error) { - textContent, err := d.readSectionBlob(toc.ngramText) - if err != nil { - return nil, err - } - postingsIndex := toc.postings.relativeIndex() - - ngrams := make(map[ngram]simpleSection, len(textContent)/ngramEncoding) - for i := 0; i < len(textContent); i += ngramEncoding { - j := i / ngramEncoding - ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) - ngrams[ng] = simpleSection{ - toc.postings.data.off + postingsIndex[j], - postingsIndex[j+1] - postingsIndex[j], - } - } - - return ngrams, nil -} - -func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]uint32, error) { - nameNgramText, err := d.readSectionBlob(toc.nameNgramText) - if err != nil { - return nil, err - } - - fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data) - if err != nil { - return nil, err - } - - fileNamePostingsIndex := toc.namePostings.relativeIndex() - - fileNameNgrams := make(map[ngram][]uint32, len(nameNgramText)/ngramEncoding) - for i := 0; i < len(nameNgramText); i += ngramEncoding { - j := i / ngramEncoding - off := fileNamePostingsIndex[j] - end := fileNamePostingsIndex[j+1] - ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding])) - fileNameNgrams[ng] = fromDeltas(fileNamePostingsData[off:end], nil) - } - - return fileNameNgrams, nil -} - -func (d *indexData) verify() error { - // This is not an exhaustive check: the postings can easily - // generate OOB acccesses, and are expensive to check, but this lets us rule out - // other sources of OOB access. 
- n := len(d.fileNameIndex) - if n == 0 { - return nil - } - - n-- - for what, got := range map[string]int{ - "boundaries": len(d.boundaries) - 1, - "branch masks": len(d.fileBranchMasks), - "doc section index": len(d.docSectionsIndex) - 1, - "newlines index": len(d.newlinesIndex) - 1, - } { - if got != n { - return fmt.Errorf("got %s %d, want %d", what, got, n) - } - } - return nil -} - -func (d *indexData) readContents(i uint32) ([]byte, error) { - return d.readSectionBlob(simpleSection{ - off: d.boundariesStart + d.boundaries[i], - sz: d.boundaries[i+1] - d.boundaries[i], - }) -} - -func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) { - // TODO(hanwen): cap result if it is at the end of the content - // section. - return d.readSectionBlob(simpleSection{ - off: d.boundariesStart + off, - sz: sz, - }) -} - -func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) { - sec := simpleSection{ - off: d.newlinesStart + d.newlinesIndex[i], - sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], - } - blob, err := d.readSectionBlob(sec) - if err != nil { - return nil, 0, err - } - - return fromSizedDeltas(blob, buf), sec.sz, nil -} - -func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) { - sec := simpleSection{ - off: d.docSectionsStart + d.docSectionsIndex[i], - sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i], - } - blob, err := d.readSectionBlob(sec) - if err != nil { - return nil, 0, err - } - - return unmarshalDocSections(blob, buf), sec.sz, nil -} - -// NewSearcher creates a Searcher for a single index file. Search -// results coming from this searcher are valid only for the lifetime -// of the Searcher itself, ie. []byte members should be copied into -// fresh buffers if the result is to survive closing the shard. 
-func NewSearcher(r IndexFile) (Searcher, error) { - rd := &reader{r: r} - - var toc indexTOC - if err := rd.readTOC(&toc); err != nil { - return nil, err - } - indexData, err := rd.readIndexData(&toc) - if err != nil { - return nil, err - } - indexData.file = r - return indexData, nil -} - -// ReadMetadata returns the metadata of index shard without reading -// the index data. The IndexFile is not closed. -func ReadMetadata(inf IndexFile) (*Repository, *IndexMetadata, error) { - rd := &reader{r: inf} - var toc indexTOC - if err := rd.readTOC(&toc); err != nil { - return nil, nil, err - } - - var md IndexMetadata - if err := rd.readJSON(&md, &toc.metaData); err != nil { - return nil, nil, err - } - - var repo Repository - if err := rd.readJSON(&repo, &toc.repoMetaData); err != nil { - return nil, nil, err - } - - return &repo, &md, nil -}
diff --git a/read_test.go b/read_test.go deleted file mode 100644 index b6b57cd..0000000 --- a/read_test.go +++ /dev/null
@@ -1,178 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "bytes" - "flag" - "fmt" - "io/fs" - "os" - "path" - "reflect" - "testing" -) - -var update = flag.Bool("update", false, "update the golden files of this test") - -func TestReadWrite(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - if err := b.AddFile("filename", []byte("abcde")); err != nil { - t.Fatalf("AddFile: %v", err) - } - - var buf bytes.Buffer - b.Write(&buf) - f := &memSeeker{buf.Bytes()} - - r := reader{r: f} - - var toc indexTOC - err = r.readTOC(&toc) - - if err != nil { - t.Errorf("got read error %v", err) - } - if toc.fileContents.data.sz != 5 { - t.Errorf("got contents size %d, want 5", toc.fileContents.data.sz) - } - - data, err := r.readIndexData(&toc) - if err != nil { - t.Fatalf("readIndexData: %v", err) - } - if got := data.fileName(0); string(got) != "filename" { - t.Errorf("got filename %q, want %q", got, "filename") - } - - if len(data.ngrams) != 3 { - t.Fatalf("got ngrams %v, want 3 ngrams", data.ngrams) - } - - if _, ok := data.ngrams[stringToNGram("bcq")]; ok { - t.Errorf("found ngram bcd in %v", data.ngrams) - } -} - -func TestReadWriteNames(t *testing.T) { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - if err := b.AddFile("abCd", []byte("")); err != nil { - 
t.Fatalf("AddFile: %v", err) - } - - var buf bytes.Buffer - b.Write(&buf) - f := &memSeeker{buf.Bytes()} - - r := reader{r: f} - - var toc indexTOC - if err := r.readTOC(&toc); err != nil { - t.Errorf("got read error %v", err) - } - if toc.fileNames.data.sz != 4 { - t.Errorf("got contents size %d, want 4", toc.fileNames.data.sz) - } - - data, err := r.readIndexData(&toc) - if err != nil { - t.Fatalf("readIndexData: %v", err) - } - if !reflect.DeepEqual([]uint32{0, 4}, data.fileNameIndex) { - t.Errorf("got index %v, want {0,4}", data.fileNameIndex) - } - if got := data.fileNameNgrams[stringToNGram("bCd")]; !reflect.DeepEqual(got, []uint32{1}) { - t.Errorf("got trigram bcd at bits %v, want sz 2", data.fileNameNgrams) - } -} - -func TestBackwardsCompat(t *testing.T) { - if *update { - b, err := NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - if err := b.AddFile("filename", []byte("abcde")); err != nil { - t.Fatalf("AddFile: %v", err) - } - - var buf bytes.Buffer - b.Write(&buf) - - outname := fmt.Sprintf("testdata/backcompat/new_v%d.%05d.zoekt", IndexFormatVersion, 0) - t.Log("writing new file", outname) - - err = os.WriteFile(outname, buf.Bytes(), 0644) - if err != nil { - t.Fatalf("Creating output file: %v", err) - } - } - - compatibleFiles, err := fs.Glob(os.DirFS("."), "testdata/backcompat/*.zoekt") - if err != nil { - t.Fatalf("fs.Glob: %v", err) - } - - for _, fname := range compatibleFiles { - t.Run(path.Base(fname), - func(t *testing.T) { - f, err := os.Open(fname) - if err != nil { - t.Fatal("os.Open", err) - } - idx, err := NewIndexFile(f) - if err != nil { - t.Fatal("NewIndexFile", err) - } - r := reader{r: idx} - - var toc indexTOC - err = r.readTOC(&toc) - - if err != nil { - t.Errorf("got read error %v", err) - } - if toc.fileContents.data.sz != 5 { - t.Errorf("got contents size %d, want 5", toc.fileContents.data.sz) - } - - data, err := r.readIndexData(&toc) - if err != nil { - t.Fatalf("readIndexData: %v", err) - } 
- if got := data.fileName(0); string(got) != "filename" { - t.Errorf("got filename %q, want %q", got, "filename") - } - - if len(data.ngrams) != 3 { - t.Fatalf("got ngrams %v, want 3 ngrams", data.ngrams) - } - - if _, ok := data.ngrams[stringToNGram("bcq")]; ok { - t.Errorf("found ngram bcd in %v", data.ngrams) - } - }, - ) - } -}
diff --git a/section.go b/section.go deleted file mode 100644 index 90e4e94..0000000 --- a/section.go +++ /dev/null
@@ -1,183 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "encoding/binary" - "io" - "log" -) - -var _ = log.Println - -// writer is an io.Writer that keeps track of errors and offsets -type writer struct { - err error - w io.Writer - off uint32 -} - -func (w *writer) Write(b []byte) error { - if w.err != nil { - return w.err - } - - var n int - n, w.err = w.w.Write(b) - w.off += uint32(n) - return w.err -} - -func (w *writer) Off() uint32 { return w.off } - -func (w *writer) B(b byte) { - s := []byte{b} - w.Write(s) -} - -func (w *writer) U32(n uint32) { - var enc [4]byte - binary.BigEndian.PutUint32(enc[:], n) - w.Write(enc[:]) -} - -func (w *writer) U64(n uint64) { - var enc [8]byte - binary.BigEndian.PutUint64(enc[:], n) - w.Write(enc[:]) -} - -func (w *writer) Varint(n uint32) { - var enc [8]byte - m := binary.PutUvarint(enc[:], uint64(n)) - w.Write(enc[:m]) -} - -func (w *writer) String(s string) { - b := []byte(s) - w.Varint(uint32(len(b))) - w.Write(b) -} - -func (s *simpleSection) start(w *writer) { - s.off = w.Off() -} - -func (s *simpleSection) end(w *writer) { - s.sz = w.Off() - s.off -} - -// section is a range of bytes in the index file. 
-type section interface { - read(*reader) error - write(*writer) - kind() sectionKind // simple or complex, used in serialization -} - -type sectionKind int - -const ( - sectionKindSimple sectionKind = 0 - sectionKindComplex sectionKind = 1 -) - -// simpleSection is a simple range of bytes. -type simpleSection struct { - off uint32 - sz uint32 -} - -func (s *simpleSection) kind() sectionKind { - return sectionKindSimple -} - -func (s *simpleSection) read(r *reader) error { - var err error - s.off, err = r.U32() - if err != nil { - return err - } - s.sz, err = r.U32() - if err != nil { - return err - } - return nil -} - -func (s *simpleSection) write(w *writer) { - w.U32(s.off) - w.U32(s.sz) -} - -// compoundSection is a range of bytes containg a list of variable -// sized items. -type compoundSection struct { - data simpleSection - - offsets []uint32 - index simpleSection -} - -func (s *compoundSection) kind() sectionKind { - return sectionKindComplex -} - -func (s *compoundSection) start(w *writer) { - s.data.start(w) -} - -func (s *compoundSection) end(w *writer) { - s.data.end(w) - s.index.start(w) - for _, o := range s.offsets { - w.U32(o) - } - s.index.end(w) -} - -func (s *compoundSection) addItem(w *writer, item []byte) { - s.offsets = append(s.offsets, w.Off()) - w.Write(item) -} - -func (s *compoundSection) write(w *writer) { - s.data.write(w) - s.index.write(w) -} - -func (s *compoundSection) read(r *reader) error { - if err := s.data.read(r); err != nil { - return err - } - if err := s.index.read(r); err != nil { - return err - } - var err error - s.offsets, err = readSectionU32(r.r, s.index) - return err -} - -// relativeIndex returns the relative offsets of the items (first -// element is 0), plus a final marking the end of the last item. 
-func (s *compoundSection) relativeIndex() []uint32 { - ri := make([]uint32, 0, len(s.offsets)+1) - for _, o := range s.offsets { - ri = append(ri, o-s.offsets[0]) - } - if len(s.offsets) > 0 { - ri = append(ri, s.data.sz) - } - return ri -}
diff --git a/section_test.go b/section_test.go deleted file mode 100644 index 19ed111..0000000 --- a/section_test.go +++ /dev/null
@@ -1,29 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "reflect" - "testing" -) - -func TestDeltas(t *testing.T) { - in := []uint32{1, 72, 0xfff} - out := toSizedDeltas(in) - round := fromSizedDeltas(out, nil) - if !reflect.DeepEqual(in, round) { - t.Errorf("got %v, want %v", round, in) - } -}
diff --git a/shards/shards.go b/shards/shards.go deleted file mode 100644 index 41afe40..0000000 --- a/shards/shards.go +++ /dev/null
@@ -1,548 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package shards - -import ( - "context" - "fmt" - "log" - "os" - "runtime" - "runtime/debug" - "sort" - "time" - - "golang.org/x/net/trace" - "golang.org/x/sync/semaphore" - - "github.com/google/zoekt" - "github.com/google/zoekt/query" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" -) - -var ( - metricShardsLoaded = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "zoekt_shards_loaded", - Help: "The number of shards currently loaded", - }) - metricShardsLoadedTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_shards_loaded_total", - Help: "The total number of shards loaded", - }) - metricShardsLoadFailedTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_shards_load_failed_total", - Help: "The total number of shard loads that failed", - }) - - metricSearchRunning = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "zoekt_search_running", - Help: "The number of concurrent search requests running", - }) - metricSearchShardRunning = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "zoekt_search_shard_running", - Help: "The number of concurrent search requests in a shard running", - }) - metricSearchFailedTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_failed_total", - Help: "The total number of search requests that failed", - 
}) - metricSearchDuration = promauto.NewHistogram(prometheus.HistogramOpts{ - Name: "zoekt_search_duration_seconds", - Help: "The duration a search request took in seconds", - Buckets: prometheus.DefBuckets, // DefBuckets good for service timings - }) - - // A Counter per Stat. Name should match field in zoekt.Stats. - metricSearchContentBytesLoadedTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_content_loaded_bytes_total", - Help: "Total amount of I/O for reading contents", - }) - metricSearchIndexBytesLoadedTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_index_loaded_bytes_total", - Help: "Total amount of I/O for reading from index", - }) - metricSearchCrashesTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_crashes_total", - Help: "Total number of search shards that had a crash", - }) - metricSearchFileCountTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_file_count_total", - Help: "Total number of files containing a match", - }) - metricSearchShardFilesConsideredTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_shard_files_considered_total", - Help: "Total number of files in shards that we considered", - }) - metricSearchFilesConsideredTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_files_considered_total", - Help: "Total files that we evaluated. 
Equivalent to files for which all atom matches (including negations) evaluated to true", - }) - metricSearchFilesLoadedTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_files_loaded_total", - Help: "Total files for which we loaded file content to verify substring matches", - }) - metricSearchFilesSkippedTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_files_skipped_total", - Help: "Total candidate files whose contents weren't examined because we gathered enough matches", - }) - metricSearchShardsSkippedTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_shards_skipped_total", - Help: "Total shards that we did not process because a query was canceled", - }) - metricSearchMatchCountTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_match_count_total", - Help: "Total number of non-overlapping matches", - }) - metricSearchNgramMatchesTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "zoekt_search_ngram_matches_total", - Help: "Total number of candidate matches as a result of searching ngrams", - }) - - metricListRunning = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "zoekt_list_running", - Help: "The number of concurrent list requests running", - }) - metricListShardRunning = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "zoekt_list_shard_running", - Help: "The number of concurrent list requests in a shard running", - }) -) - -type rankedShard struct { - zoekt.Searcher - rank uint16 -} - -type shardedSearcher struct { - // Limit the number of parallel queries. Since searching is - // CPU bound, we can't do better than #CPU queries in - // parallel. If we do so, we just create more memory - // pressure. 
- throttle *semaphore.Weighted - capacity int64 - - shards map[string]rankedShard - - rankedVersion uint64 - ranked []rankedShard -} - -func newShardedSearcher(n int64) *shardedSearcher { - ss := &shardedSearcher{ - shards: make(map[string]rankedShard), - throttle: semaphore.NewWeighted(n), - capacity: n, - } - return ss -} - -// NewDirectorySearcher returns a searcher instance that loads all -// shards corresponding to a glob into memory. -func NewDirectorySearcher(dir string) (zoekt.Searcher, error) { - ss := newShardedSearcher(int64(runtime.GOMAXPROCS(0))) - tl := &loader{ - ss: ss, - } - dw, err := NewDirectoryWatcher(dir, tl) - if err != nil { - return nil, err - } - - return &directorySearcher{ - Searcher: ss, - directoryWatcher: dw, - }, nil -} - -type directorySearcher struct { - zoekt.Searcher - - directoryWatcher *DirectoryWatcher -} - -func (s *directorySearcher) Close() { - // We need to Stop directoryWatcher first since it calls load/unload on - // Searcher. - s.directoryWatcher.Stop() - s.Searcher.Close() -} - -type loader struct { - ss *shardedSearcher -} - -func (tl *loader) load(key string) { - shard, err := loadShard(key) - if err != nil { - metricShardsLoadFailedTotal.Inc() - log.Printf("reloading: %s, err %v ", key, err) - return - } - - metricShardsLoadedTotal.Inc() - tl.ss.replace(key, shard) -} - -func (tl *loader) drop(key string) { - tl.ss.replace(key, nil) -} - -func (ss *shardedSearcher) String() string { - return "shardedSearcher" -} - -// Close closes references to open files. It may be called only once. 
-func (ss *shardedSearcher) Close() { - ss.lock() - defer ss.unlock() - for _, s := range ss.shards { - s.Close() - } - ss.shards = make(map[string]rankedShard) -} - -func (ss *shardedSearcher) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (sr *zoekt.SearchResult, err error) { - tr := trace.New("shardedSearcher.Search", "") - tr.LazyLog(q, true) - tr.LazyPrintf("opts: %+v", opts) - overallStart := time.Now() - metricSearchRunning.Inc() - defer func() { - metricSearchRunning.Dec() - metricSearchDuration.Observe(time.Since(overallStart).Seconds()) - if sr != nil { - metricSearchContentBytesLoadedTotal.Add(float64(sr.Stats.ContentBytesLoaded)) - metricSearchIndexBytesLoadedTotal.Add(float64(sr.Stats.IndexBytesLoaded)) - metricSearchCrashesTotal.Add(float64(sr.Stats.Crashes)) - metricSearchFileCountTotal.Add(float64(sr.Stats.FileCount)) - metricSearchShardFilesConsideredTotal.Add(float64(sr.Stats.ShardFilesConsidered)) - metricSearchFilesConsideredTotal.Add(float64(sr.Stats.FilesConsidered)) - metricSearchFilesLoadedTotal.Add(float64(sr.Stats.FilesLoaded)) - metricSearchFilesSkippedTotal.Add(float64(sr.Stats.FilesSkipped)) - metricSearchShardsSkippedTotal.Add(float64(sr.Stats.ShardsSkipped)) - metricSearchMatchCountTotal.Add(float64(sr.Stats.MatchCount)) - metricSearchNgramMatchesTotal.Add(float64(sr.Stats.NgramMatches)) - - tr.LazyPrintf("num files: %d", len(sr.Files)) - tr.LazyPrintf("stats: %+v", sr.Stats) - } - if err != nil { - metricSearchFailedTotal.Inc() - - tr.LazyPrintf("error: %v", err) - tr.SetError() - } - tr.Finish() - }() - - start := time.Now() - - aggregate := &zoekt.SearchResult{ - RepoURLs: map[string]string{}, - LineFragments: map[string]string{}, - } - - // This critical section is large, but we don't want to deal with - // searches on shards that have just been closed. 
- if err := ss.rlock(ctx); err != nil { - return aggregate, err - } - defer ss.runlock() - tr.LazyPrintf("acquired lock") - aggregate.Wait = time.Since(start) - start = time.Now() - - shards := ss.getShards() - all := make(chan shardResult, len(shards)) - - var childCtx context.Context - var cancel context.CancelFunc - if opts.MaxWallTime == 0 { - childCtx, cancel = context.WithCancel(ctx) - } else { - childCtx, cancel = context.WithTimeout(ctx, opts.MaxWallTime) - } - - defer cancel() - - // For each query, throttle the number of parallel - // actions. Since searching is mostly CPU bound, we limit the - // number of parallel searches. This reduces the peak working - // set, which hopefully stops https://cs.bazel.build from crashing - // when looking for the string "com". - feeder := make(chan zoekt.Searcher, len(shards)) - for _, s := range shards { - feeder <- s - } - close(feeder) - for i := 0; i < runtime.GOMAXPROCS(0); i++ { - go func() { - for s := range feeder { - searchOneShard(childCtx, s, q, opts, all) - } - }() - } - - for range shards { - r := <-all - if r.err != nil { - return nil, r.err - } - aggregate.Files = append(aggregate.Files, r.sr.Files...) 
- aggregate.Stats.Add(r.sr.Stats) - - if len(r.sr.Files) > 0 { - for k, v := range r.sr.RepoURLs { - aggregate.RepoURLs[k] = v - } - for k, v := range r.sr.LineFragments { - aggregate.LineFragments[k] = v - } - } - - if cancel != nil && opts.TotalMaxMatchCount > 0 && aggregate.Stats.MatchCount > opts.TotalMaxMatchCount { - cancel() - cancel = nil - } - } - - zoekt.SortFilesByScore(aggregate.Files) - if max := opts.MaxDocDisplayCount; max > 0 && len(aggregate.Files) > max { - aggregate.Files = aggregate.Files[:max] - } - for i := range aggregate.Files { - copySlice(&aggregate.Files[i].Content) - copySlice(&aggregate.Files[i].Checksum) - for l := range aggregate.Files[i].LineMatches { - copySlice(&aggregate.Files[i].LineMatches[l].Line) - } - } - - aggregate.Duration = time.Since(start) - return aggregate, nil -} - -func copySlice(src *[]byte) { - dst := make([]byte, len(*src)) - copy(dst, *src) - *src = dst -} - -type shardResult struct { - sr *zoekt.SearchResult - err error -} - -func searchOneShard(ctx context.Context, s zoekt.Searcher, q query.Q, opts *zoekt.SearchOptions, sink chan shardResult) { - metricSearchShardRunning.Inc() - defer func() { - metricSearchShardRunning.Dec() - if r := recover(); r != nil { - log.Printf("crashed shard: %s: %s, %s", s.String(), r, debug.Stack()) - - var r zoekt.SearchResult - r.Stats.Crashes = 1 - sink <- shardResult{&r, nil} - } - }() - - ms, err := s.Search(ctx, q, opts) - sink <- shardResult{ms, err} -} - -func (ss *shardedSearcher) List(ctx context.Context, r query.Q) (rl *zoekt.RepoList, err error) { - tr := trace.New("shardedSearcher.List", "") - tr.LazyLog(r, true) - metricListRunning.Inc() - defer func() { - metricListRunning.Dec() - if rl != nil { - tr.LazyPrintf("repos size: %d", len(rl.Repos)) - tr.LazyPrintf("crashes: %d", rl.Crashes) - } - if err != nil { - tr.LazyPrintf("error: %v", err) - tr.SetError() - } - tr.Finish() - }() - - type res struct { - rl *zoekt.RepoList - err error - } - - if err := ss.rlock(ctx); 
err != nil { - return nil, err - } - defer ss.runlock() - tr.LazyPrintf("acquired lock") - - shards := ss.getShards() - shardCount := len(shards) - all := make(chan res, shardCount) - tr.LazyPrintf("shardCount: %d", len(shards)) - - for _, s := range shards { - go func(s zoekt.Searcher) { - metricListShardRunning.Inc() - defer func() { - metricListShardRunning.Dec() - if r := recover(); r != nil { - all <- res{ - &zoekt.RepoList{Crashes: 1}, nil, - } - } - }() - ms, err := s.List(ctx, r) - all <- res{ms, err} - }(s.Searcher) - } - - crashes := 0 - uniq := map[string]*zoekt.RepoListEntry{} - - for i := 0; i < shardCount; i++ { - r := <-all - if r.err != nil { - return nil, r.err - } - crashes += r.rl.Crashes - for _, r := range r.rl.Repos { - prev, ok := uniq[r.Repository.Name] - if !ok { - cp := *r - uniq[r.Repository.Name] = &cp - } else { - prev.Stats.Add(&r.Stats) - } - } - } - - aggregate := make([]*zoekt.RepoListEntry, 0, len(uniq)) - for _, v := range uniq { - aggregate = append(aggregate, v) - } - return &zoekt.RepoList{ - Repos: aggregate, - Crashes: crashes, - }, nil -} - -func (s *shardedSearcher) rlock(ctx context.Context) error { - return s.throttle.Acquire(ctx, 1) -} - -// getShards returns the currently loaded shards. The shards must be accessed -// under a rlock call. The shards are sorted by decreasing rank and should not -// be mutated. -func (s *shardedSearcher) getShards() []rankedShard { - if len(s.ranked) > 0 { - return s.ranked - } - - var res []rankedShard - for _, sh := range s.shards { - res = append(res, sh) - } - sort.Slice(res, func(i, j int) bool { - return res[i].rank > res[j].rank - }) - - // Cache ranked. We currently hold a read lock, so start a goroutine which - // acquires a write lock to update. Use requiredVersion to ensure our - // cached slice is still current after acquiring the write lock. 
- go func(ranked []rankedShard, requiredVersion uint64) { - s.lock() - if s.rankedVersion == requiredVersion { - s.ranked = ranked - } - s.unlock() - }(res, s.rankedVersion) - - return res -} - -func (s *shardedSearcher) runlock() { - s.throttle.Release(1) -} - -func (s *shardedSearcher) lock() { - // won't error since context.Background won't expire - _ = s.throttle.Acquire(context.Background(), s.capacity) -} - -func (s *shardedSearcher) unlock() { - s.throttle.Release(s.capacity) -} - -func shardRank(s zoekt.Searcher) uint16 { - q := query.Repo{} - result, err := s.List(context.Background(), &q) - if err != nil { - return 0 - } - if len(result.Repos) == 0 { - return 0 - } - return result.Repos[0].Repository.Rank -} - -func (s *shardedSearcher) replace(key string, shard zoekt.Searcher) { - var rank uint16 - if shard != nil { - rank = shardRank(shard) - } - - s.lock() - defer s.unlock() - old := s.shards[key] - if old.Searcher != nil { - old.Close() - } - - if shard == nil { - delete(s.shards, key) - } else { - s.shards[key] = rankedShard{ - rank: rank, - Searcher: shard, - } - } - s.rankedVersion++ - s.ranked = nil - - metricShardsLoaded.Set(float64(len(s.shards))) -} - -func loadShard(fn string) (zoekt.Searcher, error) { - f, err := os.Open(fn) - if err != nil { - return nil, err - } - - iFile, err := zoekt.NewIndexFile(f) - if err != nil { - return nil, err - } - s, err := zoekt.NewSearcher(iFile) - if err != nil { - iFile.Close() - return nil, fmt.Errorf("NewSearcher(%s): %v", fn, err) - } - - return s, nil -}
diff --git a/shards/shards_test.go b/shards/shards_test.go deleted file mode 100644 index 3df2025..0000000 --- a/shards/shards_test.go +++ /dev/null
@@ -1,228 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package shards - -import ( - "bytes" - "context" - "fmt" - "log" - "os" - "runtime" - "testing" - "time" - - "github.com/google/zoekt" - "github.com/google/zoekt/query" -) - -type crashSearcher struct{} - -func (s *crashSearcher) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (*zoekt.SearchResult, error) { - panic("search") -} - -func (s *crashSearcher) List(ctx context.Context, q query.Q) (*zoekt.RepoList, error) { - panic("list") -} - -func (s *crashSearcher) Stats() (*zoekt.RepoStats, error) { - return &zoekt.RepoStats{}, nil -} - -func (s *crashSearcher) Close() {} - -func (s *crashSearcher) String() string { return "crashSearcher" } - -func TestCrashResilience(t *testing.T) { - out := &bytes.Buffer{} - log.SetOutput(out) - defer log.SetOutput(os.Stderr) - ss := newShardedSearcher(2) - ss.shards = map[string]rankedShard{ - "x": {Searcher: &crashSearcher{}}, - } - - q := &query.Substring{Pattern: "hoi"} - opts := &zoekt.SearchOptions{} - if res, err := ss.Search(context.Background(), q, opts); err != nil { - t.Fatalf("Search: %v", err) - } else if res.Stats.Crashes != 1 { - t.Errorf("got stats %#v, want crashes = 1", res.Stats) - } - - if res, err := ss.List(context.Background(), q); err != nil { - t.Fatalf("List: %v", err) - } else if res.Crashes != 1 { - t.Errorf("got result %#v, want crashes = 1", res) - } -} - 
-type rankSearcher struct { - rank uint16 -} - -func (s *rankSearcher) Close() { -} - -func (s *rankSearcher) String() string { - return "" -} - -func (s *rankSearcher) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (*zoekt.SearchResult, error) { - select { - case <-ctx.Done(): - return &zoekt.SearchResult{}, nil - default: - } - - // Ugly, but without sleep it's too fast, and we can't - // simulate the cutoff. - time.Sleep(time.Millisecond) - return &zoekt.SearchResult{ - Files: []zoekt.FileMatch{ - { - FileName: fmt.Sprintf("f%d", s.rank), - Score: float64(s.rank), - }, - }, - Stats: zoekt.Stats{ - MatchCount: 1, - }, - }, nil -} - -func (s *rankSearcher) List(ctx context.Context, q query.Q) (*zoekt.RepoList, error) { - return &zoekt.RepoList{ - Repos: []*zoekt.RepoListEntry{ - {Repository: zoekt.Repository{Rank: s.rank}}, - }, - }, nil -} - -func TestOrderByShard(t *testing.T) { - ss := newShardedSearcher(1) - - n := 10 * runtime.GOMAXPROCS(0) - for i := 0; i < n; i++ { - ss.replace(fmt.Sprintf("shard%d", i), - &rankSearcher{ - rank: uint16(i), - }) - } - - if res, err := ss.Search(context.Background(), &query.Substring{Pattern: "bla"}, &zoekt.SearchOptions{}); err != nil { - t.Errorf("Search: %v", err) - } else if len(res.Files) != n { - t.Fatalf("empty options: got %d results, want %d", len(res.Files), n) - } - - opts := zoekt.SearchOptions{ - TotalMaxMatchCount: 3, - } - res, err := ss.Search(context.Background(), &query.Substring{Pattern: "bla"}, &opts) - if err != nil { - t.Errorf("Search: %v", err) - } - - if len(res.Files) < opts.TotalMaxMatchCount { - t.Errorf("got %d results, want %d", len(res.Files), opts.TotalMaxMatchCount) - } - if len(res.Files) == n { - t.Errorf("got %d results, want < %d", len(res.Files), n) - } - for i, f := range res.Files { - rev := n - 1 - i - want := fmt.Sprintf("f%d", rev) - got := f.FileName - - if got != want { - t.Logf("%d: got %q, want %q", i, got, want) - } - } -} - -type memSeeker struct { - data 
[]byte -} - -func (s *memSeeker) Name() string { - return "memseeker" -} - -func (s *memSeeker) Close() {} -func (s *memSeeker) Read(off, sz uint32) ([]byte, error) { - return s.data[off : off+sz], nil -} - -func (s *memSeeker) Size() (uint32, error) { - return uint32(len(s.data)), nil -} - -func TestUnloadIndex(t *testing.T) { - b, err := zoekt.NewIndexBuilder(nil) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - for i, d := range []zoekt.Document{{ - Name: "filename", - Content: []byte("needle needle needle"), - }} { - if err := b.Add(d); err != nil { - t.Fatalf("Add %d: %v", i, err) - } - } - - var buf bytes.Buffer - b.Write(&buf) - indexBytes := buf.Bytes() - indexFile := &memSeeker{indexBytes} - searcher, err := zoekt.NewSearcher(indexFile) - if err != nil { - t.Fatalf("NewSearcher: %v", err) - } - - ss := newShardedSearcher(2) - ss.replace("key", searcher) - - var opts zoekt.SearchOptions - q := &query.Substring{Pattern: "needle"} - res, err := ss.Search(context.Background(), q, &opts) - if err != nil { - t.Fatalf("Search(%s): %v", q, err) - } - - forbidden := byte(29) - for i := range indexBytes { - // non-ASCII - indexBytes[i] = forbidden - } - - for _, f := range res.Files { - if bytes.Contains(f.Content, []byte{forbidden}) { - t.Errorf("found %d in content %q", forbidden, f.Content) - } - if bytes.Contains(f.Checksum, []byte{forbidden}) { - t.Errorf("found %d in checksum %q", forbidden, f.Checksum) - } - - for _, l := range f.LineMatches { - if bytes.Contains(l.Line, []byte{forbidden}) { - t.Errorf("found %d in line %q", forbidden, l.Line) - } - } - } -}
diff --git a/shards/watcher.go b/shards/watcher.go deleted file mode 100644 index 62f2067..0000000 --- a/shards/watcher.go +++ /dev/null
@@ -1,212 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package shards - -import ( - "fmt" - "log" - "os" - "path/filepath" - "runtime" - "sort" - "strings" - "sync" - "time" - - "github.com/fsnotify/fsnotify" -) - -type shardLoader interface { - // Load a new file. Should be safe for concurrent calls. - load(filename string) - drop(filename string) -} - -type DirectoryWatcher struct { - dir string - timestamps map[string]time.Time - loader shardLoader - - closeOnce sync.Once - // quit is closed by Close to signal the directory watcher to stop. - quit chan struct{} - // stopped is closed once the directory watcher has stopped. 
- stopped chan struct{} -} - -func (sw *DirectoryWatcher) Stop() { - sw.closeOnce.Do(func() { - close(sw.quit) - <-sw.stopped - }) -} - -func NewDirectoryWatcher(dir string, loader shardLoader) (*DirectoryWatcher, error) { - sw := &DirectoryWatcher{ - dir: dir, - timestamps: map[string]time.Time{}, - loader: loader, - quit: make(chan struct{}), - stopped: make(chan struct{}), - } - if err := sw.scan(); err != nil { - return nil, err - } - - if err := sw.watch(); err != nil { - return nil, err - } - - return sw, nil -} - -func (s *DirectoryWatcher) String() string { - return fmt.Sprintf("shardWatcher(%s)", s.dir) -} - -func (s *DirectoryWatcher) scan() error { - fs, err := filepath.Glob(filepath.Join(s.dir, "*.zoekt")) - if err != nil { - return err - } - - if len(s.timestamps) == 0 && len(fs) == 0 { - return fmt.Errorf("directory %s is empty", s.dir) - } - - ts := map[string]time.Time{} - for _, fn := range fs { - fi, err := os.Lstat(fn) - if err != nil { - continue - } - - ts[fn] = fi.ModTime() - } - - var toLoad []string - for k, mtime := range ts { - if t, ok := s.timestamps[k]; !ok || t != mtime { - toLoad = append(toLoad, k) - s.timestamps[k] = mtime - } - } - - var toDrop []string - // Unload deleted shards. - for k := range s.timestamps { - if _, ok := ts[k]; !ok { - toDrop = append(toDrop, k) - delete(s.timestamps, k) - } - } - - if len(toDrop) > 0 { - log.Printf("unloading %d shard(s)", len(toDrop)) - } - for _, t := range toDrop { - log.Printf("unloading: %s", filepath.Base(t)) - s.loader.drop(t) - } - - if len(toLoad) == 0 { - return nil - } - - log.Printf("loading %d shard(s): %s", len(toLoad), humanTruncateList(toLoad, 5)) - - // Limit amount of concurrent shard loads. 
- throttle := make(chan struct{}, runtime.GOMAXPROCS(0)) - lastProgress := time.Now() - for i, t := range toLoad { - // If taking a while to start-up occasionally give a progress message - if time.Since(lastProgress) > 10*time.Second { - log.Printf("still need to load %d shards...", len(toLoad)-i) - lastProgress = time.Now() - } - - throttle <- struct{}{} - go func(k string) { - s.loader.load(k) - <-throttle - }(t) - } - for i := 0; i < cap(throttle); i++ { - throttle <- struct{}{} - } - - return nil -} - -func humanTruncateList(paths []string, max int) string { - sort.Strings(paths) - var b strings.Builder - for i, p := range paths { - if i >= max { - fmt.Fprintf(&b, "... %d more", len(paths)-i) - break - } - if i > 0 { - b.WriteString(", ") - } - b.WriteString(filepath.Base(p)) - } - return b.String() -} - -func (s *DirectoryWatcher) watch() error { - watcher, err := fsnotify.NewWatcher() - if err != nil { - return err - } - if err := watcher.Add(s.dir); err != nil { - return err - } - - // intermediate signal channel so if there are multiple watcher.Events we - // only call scan once. - signal := make(chan struct{}, 1) - - go func() { - for { - select { - case <-watcher.Events: - select { - case signal <- struct{}{}: - default: - } - case err := <-watcher.Errors: - // Ignore ErrEventOverflow since we rely on the presence of events so - // safe to ignore. - if err != nil && err != fsnotify.ErrEventOverflow { - log.Println("watcher error:", err) - } - case <-s.quit: - watcher.Close() - close(signal) - return - } - } - }() - - go func() { - defer close(s.stopped) - for range signal { - s.scan() - } - }() - - return nil -}
diff --git a/shards/watcher_test.go b/shards/watcher_test.go deleted file mode 100644 index 5aad02f..0000000 --- a/shards/watcher_test.go +++ /dev/null
@@ -1,129 +0,0 @@ -// Copyright 2018 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package shards - -import ( - "io/ioutil" - "os" - "path/filepath" - "strings" - "testing" - "time" -) - -type loggingLoader struct { - loads chan string - drops chan string -} - -func (l *loggingLoader) load(k string) { - l.loads <- k -} - -func (l *loggingLoader) drop(k string) { - l.drops <- k -} - -func advanceFS() { - time.Sleep(10 * time.Millisecond) -} - -func TestDirWatcherUnloadOnce(t *testing.T) { - dir, err := ioutil.TempDir("", "") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(dir) - - logger := &loggingLoader{ - loads: make(chan string, 10), - drops: make(chan string, 10), - } - _, err = NewDirectoryWatcher(dir, logger) - if err == nil || !strings.Contains(err.Error(), "empty") { - t.Fatalf("got %v, want 'empty'", err) - } - - shard := filepath.Join(dir, "foo.zoekt") - if err := ioutil.WriteFile(shard, []byte("hello"), 0o644); err != nil { - t.Fatalf("WriteFile: %v", err) - } - - dw, err := NewDirectoryWatcher(dir, logger) - if err != nil { - t.Fatalf("NewDirectoryWatcher: %v", err) - } - defer dw.Stop() - - if got := <-logger.loads; got != shard { - t.Fatalf("got load event %v, want %v", got, shard) - } - - // Must sleep because of FS timestamp resolution. 
- advanceFS() - if err := ioutil.WriteFile(shard, []byte("changed"), 0o644); err != nil { - t.Fatalf("WriteFile: %v", err) - } - - if got := <-logger.loads; got != shard { - t.Fatalf("got load event %v, want %v", got, shard) - } - - advanceFS() - if err := os.Remove(shard); err != nil { - t.Fatalf("Remove: %v", err) - } - - if got := <-logger.drops; got != shard { - t.Fatalf("got drops event %v, want %v", got, shard) - } - - advanceFS() - if err := ioutil.WriteFile(shard+".bla", []byte("changed"), 0o644); err != nil { - t.Fatalf("WriteFile: %v", err) - } - - dw.Stop() - - select { - case k := <-logger.loads: - t.Errorf("spurious load of %q", k) - case k := <-logger.drops: - t.Errorf("spurious drops of %q", k) - default: - } -} - -func TestHumanTruncateList(t *testing.T) { - paths := []string{ - "dir/1", - "dir/2", - "dir/3", - "dir/4", - } - - assert := func(max int, want string) { - got := humanTruncateList(paths, max) - if got != want { - t.Errorf("unexpected humanTruncateList max=%d.\ngot: %s\nwant: %s", max, got, want) - } - } - - assert(1, "1... 3 more") - assert(2, "1, 2... 2 more") - assert(3, "1, 2, 3... 1 more") - assert(4, "1, 2, 3, 4") - assert(5, "1, 2, 3, 4") -}
diff --git a/testdata/backcompat/static_toc_v15.00000.zoekt b/testdata/backcompat/static_toc_v15.00000.zoekt deleted file mode 100644 index a070892..0000000 --- a/testdata/backcompat/static_toc_v15.00000.zoekt +++ /dev/null Binary files differ
diff --git a/toc.go b/toc.go deleted file mode 100644 index 9eab283..0000000 --- a/toc.go +++ /dev/null
@@ -1,163 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -// FormatVersion is a version number. It is increased every time the -// on-disk index format is changed. -// 5: subrepositories. -// 6: remove size prefix for posting varint list. -// 7: move subrepos into Repository struct. -// 8: move repoMetaData out of indexMetadata -// 9: use bigendian uint64 for trigrams. -// 10: sections for rune offsets. -// 11: file ends in rune offsets. -// 12: 64-bit branchmasks. -// 13: content checksums -// 14: languages -// 15: rune based symbol sections -// 16 (TBA): TODO: remove fallback parsing in readTOC -const IndexFormatVersion = 15 - -// FeatureVersion is increased if a feature is added that requires reindexing data -// without changing the format version -// 2: Rank field for shards. -// 3: Rank documents within shards -// 4: Dedup file bugfix -// 5: Remove max line size limit -// 6: Include '#' into the LineFragment template -// 7: Record skip reasons in the index. -// 8: Record source path in the index. -// 9: Bump default max file size. -// 10: Switch to a more flexible TOC format. -const FeatureVersion = 10 - -// WriteMinFeatureVersion and ReadMinFeatureVersion constrain forwards and backwards -// compatibility. 
For example, if a new way to encode filenameNgrams on disk is -// added using a new section but the old one is retained, this would only bump -// FeatureVersion, since the previous version can read the file and ignore the -// new section, but the index files should be regenerated. -// When the new encoding is fully rolled out and stable, the section with the old -// encoding and the associated reader can be removed, and WriteMinFeatureVersion and -// ReadMinFeatureVersion can be set to the current FeatureVersion, indicating -// that the reader must handle the new version and that older versions are no -// longer valid. -// In this way, compatibility with arbitrary version offsets can be indicated. - -// WriteMinFeatureVersion constrains forwards compatibility by emitting files -// that won't load in zoekt with a FeatureVersion below it. -const WriteMinFeatureVersion = 10 - -// ReadMinFeatureVersion constrains backwards compatibility by refusing to -// load a file with a FeatureVersion below it. -const ReadMinFeatureVersion = 8 - -type indexTOC struct { - fileContents compoundSection - fileNames compoundSection - fileSections compoundSection - postings compoundSection - newlines compoundSection - ngramText simpleSection - runeOffsets simpleSection - fileEndRunes simpleSection - languages simpleSection - - branchMasks simpleSection - subRepos simpleSection - - nameNgramText simpleSection - namePostings compoundSection - nameRuneOffsets simpleSection - metaData simpleSection - repoMetaData simpleSection - nameEndRunes simpleSection - contentChecksums simpleSection - runeDocSections simpleSection -} - -func (t *indexTOC) sections() []section { - // This old sections list is only needed to maintain backwards compatibility, - // and can be removed when a migration to tagged sections is complete. - return []section{ - // This must be first, so it can be reliably read across - // file format versions. 
- &t.metaData, - &t.repoMetaData, - &t.fileContents, - &t.fileNames, - &t.fileSections, - &t.newlines, - &t.ngramText, - &t.postings, - &t.nameNgramText, - &t.namePostings, - &t.branchMasks, - &t.subRepos, - &t.runeOffsets, - &t.nameRuneOffsets, - &t.fileEndRunes, - &t.nameEndRunes, - &t.contentChecksums, - &t.languages, - &t.runeDocSections, - } -} - -type taggedSection struct { - tag string - sec section -} - -func (t *indexTOC) sectionsTagged() map[string]section { - out := map[string]section{} - for _, ent := range t.sectionsTaggedList() { - out[ent.tag] = ent.sec - } - for _, ent := range t.sectionsTaggedCompatibilityList() { - out[ent.tag] = ent.sec - } - return out -} - -func (t *indexTOC) sectionsTaggedList() []taggedSection { - return []taggedSection{ - {"metadata", &t.metaData}, - {"repoMetaData", &t.repoMetaData}, - {"fileContents", &t.fileContents}, - {"fileNames", &t.fileNames}, - {"fileSections", &t.fileSections}, - {"newlines", &t.newlines}, - {"ngramText", &t.ngramText}, - {"postings", &t.postings}, - {"nameNgramText", &t.nameNgramText}, - {"namePostings", &t.namePostings}, - {"branchMasks", &t.branchMasks}, - {"subRepos", &t.subRepos}, - {"runeOffsets", &t.runeOffsets}, - {"nameRuneOffsets", &t.nameRuneOffsets}, - {"fileEndRunes", &t.fileEndRunes}, - {"nameEndRunes", &t.nameEndRunes}, - {"contentChecksums", &t.contentChecksums}, - {"languages", &t.languages}, - {"runeDocSections", &t.runeDocSections}, - } -} - -// sectionsTaggedCompatibilityList returns a list of sections that will be -// handled or converted for backwards compatiblity, but aren't written by -// the current iteration of the indexer. -func (t *indexTOC) sectionsTaggedCompatibilityList() []taggedSection { - return []taggedSection{} -}
diff --git a/web/api.go b/web/api.go deleted file mode 100644 index 8eca345..0000000 --- a/web/api.go +++ /dev/null
@@ -1,113 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package web - -import ( - "time" - - "github.com/google/zoekt" -) - -type LastInput struct { - Query string - Num int - - // If set, focus on the search box. - AutoFocus bool -} - -// Result holds the data provided to the search results template. -type ResultInput struct { - Last LastInput - QueryStr string - Query string - Stats zoekt.Stats - Duration time.Duration - FileMatches []*FileMatch - SearchOptions string -} - -// FileMatch holds the per file data provided to search results template -type FileMatch struct { - FileName string - Repo string - ResultID string - Language string - // If this was a duplicate result, this will contain the file - // of the first match. - DuplicateID string - - Branches []string - Matches []Match - URL string -} - -// Match holds the per line data provided to the search results template -type Match struct { - URL string - FileName string - LineNum int - - Fragments []Fragment -} - -// Fragment holds data of a single contiguous match within in a line -// for the results template. -type Fragment struct { - Pre string - Match string - Post string -} - -// SearchBoxInput is provided to the SearchBox template. -type SearchBoxInput struct { - Last LastInput - Stats *zoekt.RepoStats - Version string - Uptime time.Duration -} - -// RepoListInput is provided to the RepoList template. 
-type RepoListInput struct { - Last LastInput - Stats zoekt.RepoStats - Repos []Repository -} - -// Branch holds the metadata for a indexed branch. -type Branch struct { - Name string - Version string - URL string -} - -// Repository holds the metadata for an indexed repository. -type Repository struct { - Name string - URL string - IndexTime time.Time - Branches []Branch - Files int64 - - // Total amount of content bytes. - Size int64 -} - -// PrintInput is provided to the server.Print template. -type PrintInput struct { - Repo, Name string - Lines []string - Last LastInput -}
diff --git a/web/e2e_test.go b/web/e2e_test.go deleted file mode 100644 index 479441b..0000000 --- a/web/e2e_test.go +++ /dev/null
@@ -1,442 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package web - -import ( - "bytes" - "context" - "fmt" - "io/ioutil" - "log" - "net/http" - "net/http/httptest" - "strings" - "testing" - "time" - - "github.com/google/zoekt" - "github.com/google/zoekt/query" -) - -// TODO(hanwen): cut & paste from ../ . Should create internal test -// util package. -type memSeeker struct { - data []byte -} - -func (s *memSeeker) Close() {} -func (s *memSeeker) Read(off, sz uint32) ([]byte, error) { - return s.data[off : off+sz], nil -} - -func (s *memSeeker) Size() (uint32, error) { - return uint32(len(s.data)), nil -} - -func (s *memSeeker) Name() string { - return "memSeeker" -} - -func searcherForTest(t *testing.T, b *zoekt.IndexBuilder) zoekt.Searcher { - var buf bytes.Buffer - b.Write(&buf) - f := &memSeeker{buf.Bytes()} - - searcher, err := zoekt.NewSearcher(f) - if err != nil { - t.Fatalf("NewSearcher: %v", err) - } - - return searcher -} - -func TestBasic(t *testing.T) { - b, err := zoekt.NewIndexBuilder(&zoekt.Repository{ - Name: "name", - URL: "repo-url", - CommitURLTemplate: "{{.Version}}", - FileURLTemplate: "file-url", - LineFragmentTemplate: "#line", - Branches: []zoekt.RepositoryBranch{{Name: "master", Version: "1234"}}, - }) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - if err := b.Add(zoekt.Document{ - Name: "f2", - Content: []byte("to carry water in the no later bla"), - 
// ------------- 0123456789012345678901234567890123 - // ------------- 0 1 2 3 - Branches: []string{"master"}, - }); err != nil { - t.Fatalf("Add: %v", err) - } - - s := searcherForTest(t, b) - srv := Server{ - Searcher: s, - Top: Top, - HTML: true, - } - - mux, err := NewMux(&srv) - if err != nil { - t.Fatalf("NewMux: %v", err) - } - - ts := httptest.NewServer(mux) - defer ts.Close() - - nowStr := time.Now().Format("Jan 02, 2006 15:04") - for req, needles := range map[string][]string{ - "/": {"from 1 repositories"}, - "/search?q=water": { - "href=\"file-url#line", - "carry <b>water</b>", - }, - "/search?q=r:": { - "1234\">master", - "Found 1 repositories", - nowStr, - "repo-url\">name", - "1 files (36)", - }, - "/search?q=magic": { - `value=magic`, - }, - "/robots.txt": { - "disallow: /search", - }, - } { - checkNeedles(t, ts, req, needles) - } -} - -func TestPrint(t *testing.T) { - b, err := zoekt.NewIndexBuilder(&zoekt.Repository{ - Name: "name", - URL: "repo-url", - CommitURLTemplate: "{{.Version}}", - FileURLTemplate: "file-url", - LineFragmentTemplate: "line", - Branches: []zoekt.RepositoryBranch{{Name: "master", Version: "1234"}}, - }) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - if err := b.Add(zoekt.Document{ - Name: "f2", - Content: []byte("to carry water in the no later bla"), - Branches: []string{"master"}, - }); err != nil { - t.Fatalf("Add: %v", err) - } - - if err := b.Add(zoekt.Document{ - Name: "dir/f2", - Content: []byte("blabla"), - Branches: []string{"master"}, - }); err != nil { - t.Fatalf("Add: %v", err) - } - - s := searcherForTest(t, b) - srv := Server{ - Searcher: s, - Top: Top, - HTML: true, - Print: true, - } - - mux, err := NewMux(&srv) - if err != nil { - t.Fatalf("NewMux: %v", err) - } - - ts := httptest.NewServer(mux) - defer ts.Close() - - for req, needles := range map[string][]string{ - "/print?q=bla&r=name&f=f2": { - `pre id="l1" class="inline-pre"><span class="noselect"><a href="#l1">`, - }, - } { - 
checkNeedles(t, ts, req, needles) - } -} - -func TestPrintDefault(t *testing.T) { - b, err := zoekt.NewIndexBuilder(&zoekt.Repository{ - Name: "name", - URL: "repo-url", - Branches: []zoekt.RepositoryBranch{{Name: "master", Version: "1234"}}, - }) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - if err := b.Add(zoekt.Document{ - Name: "f2", - Content: []byte("to carry water in the no later bla"), - Branches: []string{"master"}, - }); err != nil { - t.Fatalf("Add: %v", err) - } - s := searcherForTest(t, b) - srv := Server{ - Searcher: s, - Top: Top, - HTML: true, - } - - mux, err := NewMux(&srv) - if err != nil { - t.Fatalf("NewMux: %v", err) - } - - ts := httptest.NewServer(mux) - defer ts.Close() - - for req, needles := range map[string][]string{ - "/search?q=water": { - `href="print?`, - }, - } { - checkNeedles(t, ts, req, needles) - } -} - -func checkNeedles(t *testing.T, ts *httptest.Server, req string, needles []string) { - res, err := http.Get(ts.URL + req) - if err != nil { - t.Fatal(err) - } - resultBytes, err := ioutil.ReadAll(res.Body) - res.Body.Close() - if err != nil { - log.Fatal(err) - } - - result := string(resultBytes) - for _, want := range needles { - if !strings.Contains(result, want) { - t.Errorf("query %q: result did not have %q: %s", req, want, result) - } - } - if notWant := "crashed"; strings.Contains(result, notWant) { - t.Errorf("result has %q: %s", notWant, result) - } - if notWant := "bytes skipped)..."; strings.Contains(result, notWant) { - t.Errorf("result has %q: %s", notWant, result) - } -} - -type crashSearcher struct { - zoekt.Searcher -} - -func (s *crashSearcher) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (*zoekt.SearchResult, error) { - res := zoekt.SearchResult{} - res.Stats.Crashes = 1 - return &res, nil -} - -func TestCrash(t *testing.T) { - srv := Server{ - Searcher: &crashSearcher{}, - Top: Top, - HTML: true, - } - - mux, err := NewMux(&srv) - if err != nil { - t.Fatalf("NewMux: %v", 
err) - } - - ts := httptest.NewServer(mux) - defer ts.Close() - - res, err := http.Get(ts.URL + "/search?q=water") - if err != nil { - t.Fatal(err) - } - resultBytes, err := ioutil.ReadAll(res.Body) - res.Body.Close() - if err != nil { - t.Fatal(err) - } - - result := string(resultBytes) - if want := "1 shards crashed"; !strings.Contains(result, want) { - t.Errorf("result did not have %q: %s", want, result) - } -} - -func TestHostCustomization(t *testing.T) { - b, err := zoekt.NewIndexBuilder(&zoekt.Repository{ - Name: "name", - }) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - if err := b.Add(zoekt.Document{ - Name: "file", - Content: []byte("bla"), - }); err != nil { - t.Fatalf("Add: %v", err) - } - - s := searcherForTest(t, b) - srv := Server{ - Searcher: s, - Top: Top, - HTML: true, - HostCustomQueries: map[string]string{ - "myproject.io": "r:myproject", - }, - } - - mux, err := NewMux(&srv) - if err != nil { - t.Fatalf("NewMux: %v", err) - } - - ts := httptest.NewServer(mux) - defer ts.Close() - - req, err := http.NewRequest("GET", ts.URL, &bytes.Buffer{}) - if err != nil { - t.Fatalf("NewRequest: %v", err) - } - req.Host = "myproject.io" - res, err := (&http.Client{}).Do(req) - if err != nil { - t.Fatalf("Do(%v): %v", req, err) - } - resultBytes, err := ioutil.ReadAll(res.Body) - res.Body.Close() - if err != nil { - t.Fatalf("ReadAll: %v", err) - } - - if got, want := string(resultBytes), "r:myproject"; !strings.Contains(got, want) { - t.Fatalf("got %s, want substring %q", got, want) - } -} - -func TestDupResult(t *testing.T) { - b, err := zoekt.NewIndexBuilder(&zoekt.Repository{ - Name: "name", - }) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - for i := 0; i < 2; i++ { - if err := b.Add(zoekt.Document{ - Name: fmt.Sprintf("file%d", i), - Content: []byte("bla"), - }); err != nil { - t.Fatalf("Add: %v", err) - } - } - s := searcherForTest(t, b) - srv := Server{ - Searcher: s, - Top: Top, - HTML: true, - } - - mux, err := 
NewMux(&srv) - if err != nil { - t.Fatalf("NewMux: %v", err) - } - - ts := httptest.NewServer(mux) - defer ts.Close() - - req, err := http.NewRequest("GET", ts.URL+"/search?q=bla", &bytes.Buffer{}) - if err != nil { - t.Fatalf("NewRequest: %v", err) - } - res, err := (&http.Client{}).Do(req) - if err != nil { - t.Fatalf("Do(%v): %v", req, err) - } - resultBytes, err := ioutil.ReadAll(res.Body) - res.Body.Close() - if err != nil { - t.Fatalf("ReadAll: %v", err) - } - - if got, want := string(resultBytes), "Duplicate result"; !strings.Contains(got, want) { - t.Fatalf("got %s, want substring %q", got, want) - } -} - -func TestTruncateLine(t *testing.T) { - b, err := zoekt.NewIndexBuilder(&zoekt.Repository{ - Name: "name", - }) - if err != nil { - t.Fatalf("NewIndexBuilder: %v", err) - } - - largePadding := bytes.Repeat([]byte{'a'}, 100*1000) // 100kb - if err := b.Add(zoekt.Document{ - Name: "file", - Content: append(append(largePadding, []byte("helloworld")...), largePadding...), - }); err != nil { - t.Fatalf("Add: %v", err) - } - s := searcherForTest(t, b) - srv := Server{ - Searcher: s, - Top: Top, - HTML: true, - } - - mux, err := NewMux(&srv) - if err != nil { - t.Fatalf("NewMux: %v", err) - } - - ts := httptest.NewServer(mux) - defer ts.Close() - - req, err := http.NewRequest("GET", ts.URL+"/search?q=helloworld", &bytes.Buffer{}) - if err != nil { - t.Fatalf("NewRequest: %v", err) - } - res, err := (&http.Client{}).Do(req) - if err != nil { - t.Fatalf("Do(%v): %v", req, err) - } - resultBytes, err := ioutil.ReadAll(res.Body) - res.Body.Close() - if err != nil { - t.Fatalf("ReadAll: %v", err) - } - - if got, want := len(resultBytes)/1000, 10; got > want { - t.Fatalf("got %dkb response, want <= %dkb", got, want) - } - result := string(resultBytes) - if want := "aa<b>helloworld</b>aa"; !strings.Contains(result, want) { - t.Fatalf("got %s, want substring %q", result, want) - } - if want := "bytes skipped)..."; !strings.Contains(result, want) { - t.Fatalf("got %s, 
want substring %q", result, want) - } -}
diff --git a/web/server.go b/web/server.go deleted file mode 100644 index 94a16ac..0000000 --- a/web/server.go +++ /dev/null
@@ -1,582 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package web - -import ( - "bytes" - "fmt" - "html/template" - "log" - "net" - "net/http" - "regexp" - "regexp/syntax" - "sort" - "strconv" - "strings" - "sync" - "time" - - "golang.org/x/net/context" - - "github.com/google/zoekt" - "github.com/google/zoekt/query" -) - -var Funcmap = template.FuncMap{ - "Inc": func(orig int) int { - return orig + 1 - }, - "More": func(orig int) int { - return orig * 3 - }, - "HumanUnit": func(orig int64) string { - b := orig - suffix := "" - if orig > 10*(1<<30) { - suffix = "G" - b = orig / (1 << 30) - } else if orig > 10*(1<<20) { - suffix = "M" - b = orig / (1 << 20) - } else if orig > 10*(1<<10) { - suffix = "K" - b = orig / (1 << 10) - } - - return fmt.Sprintf("%d%s", b, suffix) - }, - "LimitPre": func(limit int, pre string) string { - if len(pre) < limit { - return pre - } - return fmt.Sprintf("...(%d bytes skipped)...%s", len(pre)-limit, pre[len(pre)-limit:]) - }, - "LimitPost": func(limit int, post string) string { - if len(post) < limit { - return post - } - return fmt.Sprintf("%s...(%d bytes skipped)...", post[:limit], len(post)-limit) - }, -} - -const defaultNumResults = 50 - -type Server struct { - Searcher zoekt.Searcher - - // Serve HTML interface - HTML bool - - // If set, show files from the index. - Print bool - - // Version string for this server. 
- Version string - - // Depending on the Host header, add a query to the entry - // page. For example, when serving on "search.myproject.org" - // we could add "r:myproject" automatically. This allows a - // single instance to serve as search engine for multiple - // domains. - HostCustomQueries map[string]string - - // This should contain the following templates: "didyoumean" - // (for suggestions), "repolist" (for the repo search result - // page), "result" for the search results, "search" (for the - // opening page), "box" for the search query input element and - // "print" for the show file functionality. - Top *template.Template - - didYouMean *template.Template - repolist *template.Template - search *template.Template - result *template.Template - print *template.Template - about *template.Template - robots *template.Template - - startTime time.Time - - templateMu sync.Mutex - templateCache map[string]*template.Template - - lastStatsMu sync.Mutex - lastStats *zoekt.RepoStats - lastStatsTS time.Time -} - -func (s *Server) getTemplate(str string) *template.Template { - s.templateMu.Lock() - defer s.templateMu.Unlock() - t := s.templateCache[str] - if t != nil { - return t - } - - t, err := template.New("cache").Parse(str) - if err != nil { - log.Printf("template parse error: %v", err) - t = template.Must(template.New("empty").Parse("")) - } - s.templateCache[str] = t - return t -} - -func NewMux(s *Server) (*http.ServeMux, error) { - s.print = s.Top.Lookup("print") - if s.print == nil { - return nil, fmt.Errorf("missing template 'print'") - } - - for k, v := range map[string]**template.Template{ - "didyoumean": &s.didYouMean, - "results": &s.result, - "print": &s.print, - "search": &s.search, - "repolist": &s.repolist, - "about": &s.about, - "robots": &s.robots, - } { - *v = s.Top.Lookup(k) - if *v == nil { - return nil, fmt.Errorf("missing template %q", k) - } - } - - s.templateCache = map[string]*template.Template{} - s.startTime = time.Now() - - mux := 
http.NewServeMux() - - if s.HTML { - mux.HandleFunc("/robots.txt", s.serveRobots) - mux.HandleFunc("/search", s.serveSearch) - mux.HandleFunc("/", s.serveSearchBox) - mux.HandleFunc("/about", s.serveAbout) - mux.HandleFunc("/print", s.servePrint) - } - - return mux, nil -} - -func (s *Server) serveSearch(w http.ResponseWriter, r *http.Request) { - err := s.serveSearchErr(w, r) - - if suggest, ok := err.(*query.SuggestQueryError); ok { - var buf bytes.Buffer - if err := s.didYouMean.Execute(&buf, suggest); err != nil { - http.Error(w, err.Error(), http.StatusTeapot) - } - - w.Write(buf.Bytes()) - return - } - - if err != nil { - http.Error(w, err.Error(), http.StatusTeapot) - } -} - -func (s *Server) serveSearchErr(w http.ResponseWriter, r *http.Request) error { - qvals := r.URL.Query() - queryStr := qvals.Get("q") - if queryStr == "" { - return fmt.Errorf("no query found") - } - - q, err := query.Parse(queryStr) - if err != nil { - return err - } - - repoOnly := true - query.VisitAtoms(q, func(q query.Q) { - _, ok := q.(*query.Repo) - repoOnly = repoOnly && ok - }) - if repoOnly { - return s.serveListReposErr(q, queryStr, w, r) - } - - numStr := qvals.Get("num") - - num, err := strconv.Atoi(numStr) - if err != nil || num <= 0 { - num = defaultNumResults - } - - sOpts := zoekt.SearchOptions{ - MaxWallTime: 10 * time.Second, - } - - sOpts.SetDefaults() - - ctx := r.Context() - if result, err := s.Searcher.Search(ctx, q, &zoekt.SearchOptions{EstimateDocCount: true}); err != nil { - return err - } else if numdocs := result.ShardFilesConsidered; numdocs > 10000 { - // If the search touches many shards and many files, we - // have to limit the number of matches. This setting - // is based on the number of documents eligible after - // considering reponames, so large repos (both - // android, chromium are about 500k files) aren't - // covered fairly. 
- - // 10k docs, 50 num -> max match = (250 + 250 / 10) - sOpts.ShardMaxMatchCount = num*5 + (5*num)/(numdocs/1000) - - // 10k docs, 50 num -> max important match = 4 - sOpts.ShardMaxImportantMatch = num/20 + num/(numdocs/500) - } else { - // Virtually no limits for a small corpus; important - // matches are just as expensive as normal matches. - n := numdocs + num*100 - sOpts.ShardMaxImportantMatch = n - sOpts.ShardMaxMatchCount = n - sOpts.TotalMaxMatchCount = n - sOpts.TotalMaxImportantMatch = n - } - sOpts.MaxDocDisplayCount = num - - result, err := s.Searcher.Search(ctx, q, &sOpts) - if err != nil { - return err - } - - fileMatches, err := s.formatResults(result, queryStr, s.Print) - if err != nil { - return err - } - - res := ResultInput{ - Last: LastInput{ - Query: queryStr, - Num: num, - AutoFocus: true, - }, - Stats: result.Stats, - Query: q.String(), - QueryStr: queryStr, - SearchOptions: sOpts.String(), - FileMatches: fileMatches, - } - if res.Stats.Wait < res.Stats.Duration/10 { - // Suppress queueing stats if they are neglible. 
- res.Stats.Wait = 0 - } - - var buf bytes.Buffer - if err := s.result.Execute(&buf, &res); err != nil { - return err - } - - w.Write(buf.Bytes()) - return nil -} - -func (s *Server) servePrint(w http.ResponseWriter, r *http.Request) { - err := s.servePrintErr(w, r) - if err != nil { - http.Error(w, err.Error(), http.StatusTeapot) - } -} - -const statsStaleNess = 30 * time.Second - -func (s *Server) fetchStats(ctx context.Context) (*zoekt.RepoStats, error) { - s.lastStatsMu.Lock() - stats := s.lastStats - if time.Since(s.lastStatsTS) > statsStaleNess { - stats = nil - } - s.lastStatsMu.Unlock() - - if stats != nil { - return stats, nil - } - - repos, err := s.Searcher.List(ctx, &query.Const{Value: true}) - if err != nil { - return nil, err - } - - stats = &zoekt.RepoStats{} - names := map[string]struct{}{} - for _, r := range repos.Repos { - stats.Add(&r.Stats) - names[r.Repository.Name] = struct{}{} - } - stats.Repos = len(names) - - s.lastStatsMu.Lock() - s.lastStatsTS = time.Now() - s.lastStats = stats - s.lastStatsMu.Unlock() - - return stats, nil -} - -func (s *Server) serveSearchBoxErr(w http.ResponseWriter, r *http.Request) error { - stats, err := s.fetchStats(r.Context()) - if err != nil { - return err - } - d := SearchBoxInput{ - Last: LastInput{ - Num: defaultNumResults, - AutoFocus: true, - }, - Stats: stats, - Version: s.Version, - Uptime: time.Since(s.startTime), - } - - d.Last.Query = r.URL.Query().Get("q") - if d.Last.Query == "" { - custom := s.HostCustomQueries[r.Host] - if custom == "" { - host, _, _ := net.SplitHostPort(r.Host) - custom = s.HostCustomQueries[host] - } - - if custom != "" { - d.Last.Query = custom + " " - } - } - - var buf bytes.Buffer - if err := s.search.Execute(&buf, &d); err != nil { - return err - } - w.Write(buf.Bytes()) - return nil -} - -func (s *Server) serveSearchBox(w http.ResponseWriter, r *http.Request) { - if err := s.serveSearchBoxErr(w, r); err != nil { - http.Error(w, err.Error(), http.StatusTeapot) - } -} - -func 
(s *Server) serveAboutErr(w http.ResponseWriter, r *http.Request) error { - stats, err := s.fetchStats(r.Context()) - if err != nil { - return err - } - - d := SearchBoxInput{ - Stats: stats, - Version: s.Version, - Uptime: time.Since(s.startTime), - } - - var buf bytes.Buffer - if err := s.about.Execute(&buf, &d); err != nil { - return err - } - w.Write(buf.Bytes()) - return nil -} - -func (s *Server) serveAbout(w http.ResponseWriter, r *http.Request) { - if err := s.serveAboutErr(w, r); err != nil { - http.Error(w, err.Error(), http.StatusTeapot) - } -} - -func (s *Server) serveRobotsErr(w http.ResponseWriter, r *http.Request) error { - data := struct{}{} - var buf bytes.Buffer - if err := s.robots.Execute(&buf, &data); err != nil { - return err - } - w.Write(buf.Bytes()) - return nil -} - -func (s *Server) serveRobots(w http.ResponseWriter, r *http.Request) { - if err := s.serveRobotsErr(w, r); err != nil { - http.Error(w, err.Error(), http.StatusTeapot) - } -} - -func (s *Server) serveListReposErr(q query.Q, qStr string, w http.ResponseWriter, r *http.Request) error { - ctx := r.Context() - repos, err := s.Searcher.List(ctx, q) - if err != nil { - return err - } - - qvals := r.URL.Query() - order := qvals.Get("order") - switch order { - case "", "name", "revname": - sort.Slice(repos.Repos, func(i, j int) bool { - return strings.Compare(repos.Repos[i].Repository.Name, repos.Repos[j].Repository.Name) < 0 - }) - case "size", "revsize": - sort.Slice(repos.Repos, func(i, j int) bool { - return repos.Repos[i].Stats.ContentBytes < repos.Repos[j].Stats.ContentBytes - }) - case "time", "revtime": - sort.Slice(repos.Repos, func(i, j int) bool { - return repos.Repos[i].IndexMetadata.IndexTime.Before( - repos.Repos[j].IndexMetadata.IndexTime) - }) - default: - return fmt.Errorf("got unknown sort key %q, allowed [rev]name, [rev]time, [rev]size", order) - } - if strings.HasPrefix(order, "rev") { - for i, j := 0, len(repos.Repos)-1; i < j; { - repos.Repos[i], repos.Repos[j] = 
repos.Repos[j], repos.Repos[i] - i++ - j-- - - } - } - - aggregate := zoekt.RepoStats{ - Repos: len(repos.Repos), - } - for _, s := range repos.Repos { - aggregate.Add(&s.Stats) - } - res := RepoListInput{ - Last: LastInput{ - Query: qStr, - AutoFocus: true, - }, - Stats: aggregate, - } - - numStr := qvals.Get("num") - num, err := strconv.Atoi(numStr) - if err != nil || num <= 0 { - num = 0 - } - if num > 0 { - if num > len(repos.Repos) { - num = len(repos.Repos) - } - - repos.Repos = repos.Repos[:num] - } - - for _, r := range repos.Repos { - t := s.getTemplate(r.Repository.CommitURLTemplate) - - repo := Repository{ - Name: r.Repository.Name, - URL: r.Repository.URL, - IndexTime: r.IndexMetadata.IndexTime, - Size: r.Stats.ContentBytes, - Files: int64(r.Stats.Documents), - } - for _, b := range r.Repository.Branches { - var buf bytes.Buffer - if err := t.Execute(&buf, b); err != nil { - return err - } - repo.Branches = append(repo.Branches, - Branch{ - Name: b.Name, - Version: b.Version, - URL: buf.String(), - }) - } - res.Repos = append(res.Repos, repo) - } - - var buf bytes.Buffer - if err := s.repolist.Execute(&buf, &res); err != nil { - return err - } - - w.Write(buf.Bytes()) - return nil -} - -func (s *Server) servePrintErr(w http.ResponseWriter, r *http.Request) error { - qvals := r.URL.Query() - fileStr := qvals.Get("f") - repoStr := qvals.Get("r") - queryStr := qvals.Get("q") - numStr := qvals.Get("num") - num, err := strconv.Atoi(numStr) - if err != nil || num <= 0 { - num = defaultNumResults - } - - re, err := syntax.Parse("^"+regexp.QuoteMeta(fileStr)+"$", 0) - if err != nil { - return err - } - qs := []query.Q{ - &query.Regexp{Regexp: re, FileName: true, CaseSensitive: true}, - &query.Repo{Pattern: repoStr}, - } - - if branchStr := qvals.Get("b"); branchStr != "" { - qs = append(qs, &query.Branch{Pattern: branchStr}) - } - - q := &query.And{Children: qs} - - sOpts := zoekt.SearchOptions{ - Whole: true, - } - - ctx := r.Context() - result, err := 
s.Searcher.Search(ctx, q, &sOpts) - if err != nil { - return err - } - - if len(result.Files) != 1 { - var ss []string - for _, n := range result.Files { - ss = append(ss, n.FileName) - } - return fmt.Errorf("ambiguous result: %v", ss) - } - - f := result.Files[0] - - byteLines := bytes.Split(f.Content, []byte{'\n'}) - strLines := make([]string, 0, len(byteLines)) - for _, l := range byteLines { - strLines = append(strLines, string(l)) - } - - d := PrintInput{ - Name: f.FileName, - Repo: f.Repository, - Lines: strLines, - Last: LastInput{ - Query: queryStr, - Num: num, - AutoFocus: false, - }, - } - - var buf bytes.Buffer - if err := s.print.Execute(&buf, &d); err != nil { - return err - } - - w.Write(buf.Bytes()) - return nil -}
diff --git a/web/snippets.go b/web/snippets.go deleted file mode 100644 index 1b6112e..0000000 --- a/web/snippets.go +++ /dev/null
@@ -1,149 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package web - -import ( - "bytes" - "html/template" - "log" - "net/url" - "strconv" - "strings" - - "github.com/google/zoekt" -) - -func (s *Server) formatResults(result *zoekt.SearchResult, query string, localPrint bool) ([]*FileMatch, error) { - var fmatches []*FileMatch - - templateMap := map[string]*template.Template{} - fragmentMap := map[string]*template.Template{} - if !localPrint { - for repo, str := range result.RepoURLs { - if str != "" { - templateMap[repo] = s.getTemplate(str) - } - } - for repo, str := range result.LineFragments { - if str != "" { - fragmentMap[repo] = s.getTemplate(str) - } - } - } - getFragment := func(repo string, linenum int) string { - tpl := fragmentMap[repo] - - if tpl == nil || localPrint { - return "#l" + strconv.Itoa(linenum) - } - - var buf bytes.Buffer - if err := tpl.Execute(&buf, map[string]string{ - "LineNumber": strconv.Itoa(linenum), - }); err != nil { - log.Printf("fragment template: %v", err) - return "" - } - return buf.String() - } - getURL := func(repo, filename string, branches []string, version string) string { - tpl := templateMap[repo] - if localPrint || tpl == nil { - v := make(url.Values) - v.Add("r", repo) - v.Add("f", filename) - v.Add("q", query) - if len(branches) > 0 { - v.Add("b", branches[0]) - } - return "print?" 
+ v.Encode() - } - - var buf bytes.Buffer - b := "" - if len(branches) > 0 { - b = branches[0] - } - err := tpl.Execute(&buf, map[string]string{ - "Branch": b, - "Version": version, - "Path": filename, - }) - if err != nil { - log.Printf("url template: %v", err) - return "" - } - return buf.String() - } - - // hash => result-id - seenFiles := map[string]string{} - for _, f := range result.Files { - fMatch := FileMatch{ - FileName: f.FileName, - Repo: f.Repository, - ResultID: f.Repository + ":" + f.FileName, - Branches: f.Branches, - Language: f.Language, - } - - if dup, ok := seenFiles[string(f.Checksum)]; ok { - fMatch.DuplicateID = dup - } else { - seenFiles[string(f.Checksum)] = fMatch.ResultID - } - - if f.SubRepositoryName != "" { - fn := strings.TrimPrefix(fMatch.FileName[len(f.SubRepositoryPath):], "/") - fMatch.URL = getURL(f.SubRepositoryName, fn, f.Branches, f.Version) - } else { - fMatch.URL = getURL(f.Repository, f.FileName, f.Branches, f.Version) - } - - for _, m := range f.LineMatches { - fragment := getFragment(f.Repository, m.LineNumber) - if !strings.HasPrefix(fragment, "#") && !strings.HasPrefix(fragment, ";") { - // TODO - remove this is backward compatibility glue. - fragment = "#" + fragment - } - md := Match{ - FileName: f.FileName, - LineNum: m.LineNumber, - URL: fMatch.URL + fragment, - } - - lastEnd := 0 - line := m.Line - for i, f := range m.LineFragments { - l := f.LineOffset - e := l + f.MatchLength - - frag := Fragment{ - Pre: string(line[lastEnd:l]), - Match: string(line[l:e]), - } - if i == len(m.LineFragments)-1 { - frag.Post = string(m.Line[e:]) - } - - md.Fragments = append(md.Fragments, frag) - lastEnd = e - } - fMatch.Matches = append(fMatch.Matches, md) - } - fmatches = append(fmatches, &fMatch) - } - return fmatches, nil -}
diff --git a/web/templates.go b/web/templates.go deleted file mode 100644 index eea9bb7..0000000 --- a/web/templates.go +++ /dev/null
@@ -1,409 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package web - -import ( - "html/template" - "log" -) - -// Top provides the standard templates in parsed form -var Top = template.New("top").Funcs(Funcmap) - -// TemplateText contains the text of the standard templates. -var TemplateText = map[string]string{ - - "didyoumean": ` -<html> -<head> - <title>Error</title> -</head> -<body> - <p>{{.Message}}. Did you mean <a href="/search?q={{.Suggestion}}">{{.Suggestion}}</a> ? 
-</body> -</html> -`, - - "head": ` -<head> -<meta charset="utf-8"> -<meta http-equiv="X-UA-Compatible" content="IE=edge"> -<meta name="viewport" content="width=device-width, initial-scale=1"> -<!-- Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) --> -<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous"> -<style> - #navsearchbox { width: 350px !important; } - #maxhits { width: 100px !important; } - .label-dup { - border-width: 1px !important; - border-style: solid !important; - border-color: #aaa !important; - color: black; - } - .noselect { - user-select: none; - } - a.label-dup:hover { - color: black; - background: #ddd; - } - .result { - display: block; - content: " "; - visibility: hidden; - } - .container-results { - overflow: auto; - max-height: calc(100% - 72px); - } - .inline-pre { - border: unset; - background-color: unset; - margin: unset; - padding: unset; - overflow: unset; - } - :target { background-color: #ccf; } - table tbody tr td { border: none !important; padding: 2px !important; } -</style> -</head> - `, - - "jsdep": ` -<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script> -<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script> -`, - - // the template for the search box. - "searchbox": ` -<form action="search"> - <div class="form-group form-group-lg"> - <div class="input-group input-group-lg"> - <input class="form-control" placeholder="Search for some code..." 
autofocus - {{if .Query}} - value={{.Query}} - {{end}} - id="searchbox" type="text" name="q"> - <div class="input-group-btn"> - <button class="btn btn-primary">Search</button> - </div> - </div> - </div> -</form> -`, - - "navbar": ` -<nav class="navbar navbar-default"> - <div class="container-fluid"> - <div class="navbar-header"> - <a class="navbar-brand" href="/">Zoekt</a> - <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar-collapse" aria-expanded="false"> - <span class="sr-only">Toggle navigation</span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - </button> - </div> - <div class="navbar-collapse collapse" id="navbar-collapse" aria-expanded="false" style="height: 1px;"> - <form class="navbar-form navbar-left" action="search"> - <div class="form-group"> - <input class="form-control" - placeholder="Search for some code..." role="search" - id="navsearchbox" type="text" name="q" autofocus - {{if .Query}} - value={{.Query}} - {{end}}> - <div class="input-group"> - <div class="input-group-addon">Max Results</div> - <input class="form-control" type="number" id="maxhits" name="num" value="{{.Num}}"> - </div> - <button class="btn btn-primary">Search</button> - </div> - </form> - </div> - </div> -</nav> -<script> -document.onkeydown=function(e){ - var e = e || window.event; - if (e.key == "/") { - var navbox = document.getElementById("navsearchbox"); - if (document.activeElement !== navbox) { - navbox.focus(); - return false; - } - } -}; -</script> -`, - // search box for the entry page. 
- "search": ` -<html> -{{template "head"}} -<title>Zoekt, en gij zult spinazie eten</title> -<body> - <div class="jumbotron"> - <div class="container"> - {{template "searchbox" .Last}} - </div> - </div> - - <div class="container"> - <div class="row"> - <div class="col-md-8"> - <h3>Search examples:</h3> - <dl class="dl-horizontal"> - <dt><a href="search?q=needle">needle</a></dt><dd>search for "needle"</dd> - <dt><a href="search?q=thread+or+needle">thread or needle</a></dt><dd>search for either "thread" or "needle"</dd> - <dt><a href="search?q=class+needle">class needle</a></span></dt><dd>search for files containing both "class" and "needle"</dd> - <dt><a href="search?q=class+Needle">class Needle</a></dt><dd>search for files containing both "class" (case insensitive) and "Needle" (case sensitive)</dd> - <dt><a href="search?q=class+Needle+case:yes">class Needle case:yes</a></dt><dd>search for files containing "class" and "Needle", both case sensitively</dd> - <dt><a href="search?q=%22class Needle%22">"class Needle"</a></dt><dd>search for files with the phrase "class Needle"</dd> - <dt><a href="search?q=needle+-hay">needle -hay</a></dt><dd>search for files with the word "needle" but not the word "hay"</dd> - <dt><a href="search?q=path+file:java">path file:java</a></dt><dd>search for the word "path" in files whose name contains "java"</dd> - <dt><a href="search?q=needle+lang%3Apython&num=50">needle lang:python</a></dt><dd>search for "needle" in Python source code</dd> - <dt><a href="search?q=f:%5C.c%24">f:\.c$</a></dt><dd>search for files whose name ends with ".c"</dd> - <dt><a href="search?q=path+-file:java">path -file:java</a></dt><dd>search for the word "path" excluding files whose name contains "java"</dd> - <dt><a href="search?q=foo.*bar">foo.*bar</a></dt><dd>search for the regular expression "foo.*bar"</dd> - <dt><a href="search?q=-%28Path File%29 Stream">-(Path File) Stream</a></dt><dd>search "Stream", but exclude files containing both "Path" and "File"</dd> - 
<dt><a href="search?q=-Path%5c+file+Stream">-Path\ file Stream</a></dt><dd>search "Stream", but exclude files containing "Path File"</dd> - <dt><a href="search?q=sym:data">sym:data</a></span></dt><dd>search for symbol definitions containing "data"</dd> - <dt><a href="search?q=phone+r:droid">phone r:droid</a></dt><dd>search for "phone" in repositories whose name contains "droid"</dd> - <dt><a href="search?q=phone+b:master">phone b:master</a></dt><dd>for Git repos, find "phone" in files in branches whose name contains "master".</dd> - <dt><a href="search?q=phone+b:HEAD">phone b:HEAD</a></dt><dd>for Git repos, find "phone" in the default ('HEAD') branch.</dd> - </dl> - </div> - <div class="col-md-4"> - <h3>To list repositories, try:</h3> - <dl class="dl-horizontal"> - <dt><a href="search?q=r:droid">r:droid</a></dt><dd>list repositories whose name contains "droid".</dd> - <dt><a href="search?q=r:go+-r:google">r:go -r:google</a></dt><dd>list repositories whose name contains "go" but not "google".</dd> - </dl> - </div> - </div> - </div> - <nav class="navbar navbar-default navbar-bottom"> - <div class="container"> - {{template "footerBoilerplate"}} - <p class="navbar-text navbar-right"> - Used {{HumanUnit .Stats.IndexBytes}} mem for - {{.Stats.Documents}} documents ({{HumanUnit .Stats.ContentBytes}}) - from {{.Stats.Repos}} repositories. 
- </p> - </div> - </nav> -</body> -</html> -`, - "footerBoilerplate": `<a class="navbar-text" href="about">About</a>`, - "results": ` -<html> -{{template "head"}} -<title>Results for {{.QueryStr}}</title> -<script> - function zoektAddQ(atom) { - window.location.href = "/search?q=" + escape("{{.QueryStr}}" + " " + atom) + - "&" + "num=" + {{.Last.Num}}; - } -</script> -<body id="results"> - {{template "navbar" .Last}} - <div class="container-fluid container-results"> - <h5> - {{if .Stats.Crashes}}<br><b>{{.Stats.Crashes}} shards crashed</b><br>{{end}} - {{ $fileCount := len .FileMatches }} - Found {{.Stats.MatchCount}} results in {{.Stats.FileCount}} files{{if or (lt $fileCount .Stats.FileCount) (or (gt .Stats.ShardsSkipped 0) (gt .Stats.FilesSkipped 0)) }}, - showing top {{ $fileCount }} files (<a rel="nofollow" - href="search?q={{.Last.Query}}&num={{More .Last.Num}}">show more</a>). - {{else}}.{{end}} - </h5> - {{range .FileMatches}} - <table class="table table-hover table-condensed"> - <thead> - <tr> - <th> - {{if .URL}}<a name="{{.ResultID}}" class="result"></a><a href="{{.URL}}" >{{else}}<a name="{{.ResultID}}">{{end}} - <small> - {{.Repo}}:{{.FileName}}</a>: - <span style="font-weight: normal">[ {{if .Branches}}{{range .Branches}}<span class="label label-default">{{.}}</span>,{{end}}{{end}} ]</span> - {{if .Language}}<button - title="restrict search to files written in {{.Language}}" - onclick="zoektAddQ('lang:{{.Language}}')" class="label label-primary">language {{.Language}}</button></span>{{end}} - {{if .DuplicateID}}<a class="label label-dup" href="#{{.DuplicateID}}">Duplicate result</a>{{end}} - </small> - </th> - </tr> - </thead> - {{if not .DuplicateID}} - <tbody> - {{range .Matches}} - <tr> - <td style="background-color: rgba(238, 238, 255, 0.6);"> - <pre class="inline-pre"><span class="noselect">{{if .URL}}<a href="{{.URL}}">{{end}}<u>{{.LineNum}}</u>{{if .URL}}</a>{{end}}: </span>{{range .Fragments}}{{LimitPre 100 .Pre}}<b>{{.Match}}</b>{{LimitPost 
100 .Post}}{{end}}</pre> - </td> - </tr> - {{end}} - </tbody> - {{end}} - </table> - {{end}} - - <nav class="navbar navbar-default navbar-bottom"> - <div class="container"> - {{template "footerBoilerplate"}} - <p class="navbar-text navbar-right"> - Took {{.Stats.Duration}}{{if .Stats.Wait}}(queued: {{.Stats.Wait}}){{end}} for - {{HumanUnit .Stats.IndexBytesLoaded}}B index data, - {{.Stats.NgramMatches}} ngram matches, - {{.Stats.FilesConsidered}} docs considered, - {{.Stats.FilesLoaded}} docs ({{HumanUnit .Stats.ContentBytesLoaded}}B) - loaded{{if or .Stats.FilesSkipped .Stats.ShardsSkipped}}, - {{.Stats.FilesSkipped}} docs and {{.Stats.ShardsSkipped}} shards skipped{{else}}.{{end}} - </p> - </div> - </nav> - </div> - {{ template "jsdep"}} -</body> -</html> -`, - - "repolist": ` -<html> -{{template "head"}} -<body id="results"> - <div class="container"> - {{template "navbar" .Last}} - <div><b> - Found {{.Stats.Repos}} repositories ({{.Stats.Documents}} files, {{HumanUnit .Stats.ContentBytes}}b content) - </b></div> - <table class="table table-hover table-condensed"> - <thead> - <tr> - <th>Name <a href="/search?q={{.Last.Query}}&order=name">▼</a><a href="/search?q={{.Last.Query}}&order=revname">▲</a></th> - <th>Last updated <a href="/search?q={{.Last.Query}}&order=revtime">▼</a><a href="/search?q={{.Last.Query}}&order=time">▲</a></th> - <th>Branches</th> - <th>Size <a href="/search?q={{.Last.Query}}&order=revsize">▼</a><a href="/search?q={{.Last.Query}}&order=size">▲</a></th> - </tr> - </thead> - <tbody> - {{range .Repos}} - <tr> - <td>{{if .URL}}<a href="{{.URL}}">{{end}}{{.Name}}{{if .URL}}</a>{{end}}</td> - <td><small>{{.IndexTime.Format "Jan 02, 2006 15:04"}}</small></td> - <td style="vertical-align: middle;"> - {{range .Branches}} - {{if .URL}}<tt><a class="label label-default small" href="{{.URL}}">{{end}}{{.Name}}{{if .URL}}</a> </tt>{{end}} - {{end}} - </td> - <td><small>{{HumanUnit .Files}} files ({{HumanUnit .Size}})</small></td> - </tr> - {{end}} - 
</tbody> - </table> - </div> - - <nav class="navbar navbar-default navbar-bottom"> - <div class="container"> - {{template "footerBoilerplate"}} - <p class="navbar-text navbar-right"> - </p> - </div> - </nav> - - {{ template "jsdep"}} -</body> -</html> -`, - - "print": ` -<html> - {{template "head"}} - <title>{{.Repo}}:{{.Name}}</title> -<body id="results"> - {{template "navbar" .Last}} - <div class="container-fluid container-results" > - <div><b>{{.Name}}</b></div> - <div class="table table-hover table-condensed" style="overflow:auto; background: #eef;"> - {{ range $index, $ln := .Lines}} - <pre id="l{{Inc $index}}" class="inline-pre"><span class="noselect"><a href="#l{{Inc $index}}">{{Inc $index}}</a>: </span>{{$ln}}</pre> - {{end}} - </div> - <nav class="navbar navbar-default navbar-bottom"> - <div class="container"> - {{template "footerBoilerplate"}} - <p class="navbar-text navbar-right"> - </p> - </div> - </nav> - </div> - {{ template "jsdep"}} -</body> -</html> -`, - - "about": ` - -<html> - {{template "head"}} - <title>About <em>zoekt</em></title> -<body> - - - <div class="jumbotron"> - <div class="container"> - {{template "searchbox" .Last}} - </div> - </div> - - <div class="container"> - <p> - This is <a href="http://github.com/google/zoekt"><em>zoekt</em> (IPA: /zukt/)</a>, - an open-source full text search engine. It's pronounced roughly as you would - pronounce "zooked" in English. - </p> - <p> - {{if .Version}}<em>Zoekt</em> version {{.Version}}, uptime{{else}}Uptime{{end}} {{.Uptime}} - </p> - - <p> - Used {{HumanUnit .Stats.IndexBytes}} memory for - {{.Stats.Documents}} documents ({{HumanUnit .Stats.ContentBytes}}) - from {{.Stats.Repos}} repositories. 
- </p> - </div> - - <nav class="navbar navbar-default navbar-bottom"> - <div class="container"> - {{template "footerBoilerplate"}} - <p class="navbar-text navbar-right"> - </p> - </div> - </nav> -`, - "robots": ` -user-agent: * -disallow: /search -`, -} - -func init() { - for k, v := range TemplateText { - _, err := Top.New(k).Parse(v) - if err != nil { - log.Panicf("parse(%s): %v:", k, err) - } - } -}
diff --git a/write.go b/write.go deleted file mode 100644 index 7a89167..0000000 --- a/write.go +++ /dev/null
@@ -1,178 +0,0 @@ -// Copyright 2016 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zoekt - -import ( - "bufio" - "bytes" - "encoding/binary" - "encoding/json" - "io" - "sort" - "time" -) - -func (w *writer) writeTOC(toc *indexTOC) { - // Tagged sections are indicated with a 0 section count. - // Tagged sections allow easier forwards and backwards - // compatibility when evolving zoekt index files with new - // sections. - // - // A tagged section is: - // Varint TagLen, Tag String, Varint SecType, Section - // - // Section type is indicated because simpleSections and - // compoundSections have different lengths. 
- w.U32(0) - secs := toc.sectionsTaggedList() - for _, s := range secs { - w.String(s.tag) - w.Varint(uint32(s.sec.kind())) - s.sec.write(w) - } -} - -func (s *compoundSection) writeStrings(w *writer, strs []*searchableString) { - s.start(w) - for _, f := range strs { - s.addItem(w, f.data) - } - s.end(w) -} - -func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection, - charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection) { - keys := make(ngramSlice, 0, len(s.postings)) - for k := range s.postings { - keys = append(keys, k) - } - sort.Sort(keys) - - ngramText.start(w) - for _, k := range keys { - var buf [8]byte - binary.BigEndian.PutUint64(buf[:], uint64(k)) - w.Write(buf[:]) - } - ngramText.end(w) - - postings.start(w) - for _, k := range keys { - postings.addItem(w, s.postings[k]) - } - postings.end(w) - - charOffsets.start(w) - w.Write(toSizedDeltas(s.runeOffsets)) - charOffsets.end(w) - - endRunes.start(w) - w.Write(toSizedDeltas(s.endRunes)) - endRunes.end(w) -} - -func (b *IndexBuilder) Write(out io.Writer) error { - buffered := bufio.NewWriterSize(out, 1<<20) - defer buffered.Flush() - - w := &writer{w: buffered} - toc := indexTOC{} - - toc.fileContents.writeStrings(w, b.contentStrings) - toc.newlines.start(w) - for _, f := range b.contentStrings { - toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data))) - } - toc.newlines.end(w) - - toc.branchMasks.start(w) - for _, m := range b.branchMasks { - w.U64(m) - } - toc.branchMasks.end(w) - - toc.fileSections.start(w) - for _, s := range b.docSections { - toc.fileSections.addItem(w, marshalDocSections(s)) - } - toc.fileSections.end(w) - - writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes) - - // names. 
- toc.fileNames.writeStrings(w, b.nameStrings) - - writePostings(w, b.namePostings, &toc.nameNgramText, &toc.nameRuneOffsets, &toc.namePostings, &toc.nameEndRunes) - - toc.subRepos.start(w) - w.Write(toSizedDeltas(b.subRepos)) - toc.subRepos.end(w) - - toc.contentChecksums.start(w) - w.Write(b.checksums) - toc.contentChecksums.end(w) - - toc.languages.start(w) - w.Write(b.languages) - toc.languages.end(w) - - toc.runeDocSections.start(w) - w.Write(marshalDocSections(b.runeDocSections)) - toc.runeDocSections.end(w) - - if err := b.writeJSON(&IndexMetadata{ - IndexFormatVersion: IndexFormatVersion, - IndexTime: time.Now(), - IndexFeatureVersion: FeatureVersion, - IndexMinReaderVersion: WriteMinFeatureVersion, - PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII, - LanguageMap: b.languageMap, - ZoektVersion: Version, - }, &toc.metaData, w); err != nil { - return err - } - if err := b.writeJSON(b.repo, &toc.repoMetaData, w); err != nil { - return err - } - - var tocSection simpleSection - - tocSection.start(w) - w.writeTOC(&toc) - tocSection.end(w) - tocSection.write(w) - return w.err -} - -func (b *IndexBuilder) writeJSON(data interface{}, sec *simpleSection, w *writer) error { - blob, err := json.Marshal(data) - if err != nil { - return err - } - sec.start(w) - w.Write(blob) - sec.end(w) - return nil -} - -func newLinesIndices(in []byte) []uint32 { - out := make([]uint32, 0, bytes.Count(in, []byte{'\n'})) - for i, c := range in { - if c == '\n' { - out = append(out, uint32(i)) - } - } - return out -}