// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zoekt

import (
"fmt"
"hash/crc64"
"regexp"
"unicode/utf8"
"github.com/google/zoekt/query"
)

// indexData holds the pattern-independent data that must be kept in
// memory to search a shard. Most of the memory is taken up by the
// ngram => offset index.
type indexData struct {
file IndexFile
ngrams map[ngram]simpleSection
newlinesStart uint32
newlinesIndex []uint32
docSectionsStart uint32
docSectionsIndex []uint32
// rune offset=>byte offset mapping, relative to the start of the content corpus
runeOffsets []uint32
// offsets of file contents; includes end of last file
boundariesStart uint32
boundaries []uint32
// rune offsets for the file content boundaries
fileEndRunes []uint32
fileNameContent []byte
fileNameIndex []uint32
fileNameNgrams map[ngram][]uint32
// rune offset=>byte offset mapping, relative to the start of the filename corpus
fileNameRuneOffsets []uint32
// rune offsets for the file name boundaries
fileNameEndRunes []uint32
fileBranchMasks []uint64
// mask (power of 2) => name
branchNames map[uint]string
// name => mask (power of 2)
branchIDs map[string]uint
metaData IndexMetadata
repoMetaData Repository
subRepos []uint32
subRepoPaths []string
	// Checksums for all the files, packed at crc64.Size (8 byte) intervals.
checksums []byte
repoListEntry RepoListEntry
}
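
// Illustrative sketch, not part of the original file: boundaries is meant to
// bracket each document's bytes the same way fileNameIndex brackets file
// names below. contentData is a hypothetical stand-in for the corpus bytes,
// which the real code reads from d.file section by section.
func exampleFileContent(contentData []byte, boundaries []uint32, i int) []byte {
	// boundaries[i+1] is always valid: the slice includes the end of the last file.
	return contentData[boundaries[i]:boundaries[i+1]]
}
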
// getChecksum returns the crc64.Size-byte checksum recorded for document idx.
func (d *indexData) getChecksum(idx uint32) []byte {
start := crc64.Size * idx
return d.checksums[start : start+crc64.Size]
}
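
// Usage sketch (hypothetical shard d): checksums are packed back to back, so
// document 3's checksum occupies bytes [24, 32) of the checksums slice.
func exampleChecksumLookup(d *indexData) {
	sum := d.getChecksum(3) // 8 bytes; crc64.Size == 8
	fmt.Printf("checksum of document 3: %x\n", sum)
}
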
// calculateStats computes the repository statistics and caches this shard's
// RepoListEntry.
func (d *indexData) calculateStats() {
var last uint32
if len(d.boundaries) > 0 {
last += d.boundaries[len(d.boundaries)-1]
}
lastFN := last
if len(d.fileNameIndex) > 0 {
lastFN = d.fileNameIndex[len(d.fileNameIndex)-1]
}
stats := RepoStats{
IndexBytes: int64(d.memoryUse()),
ContentBytes: int64(int(last) + int(lastFN)),
Documents: len(d.newlinesIndex) - 1,
}
d.repoListEntry = RepoListEntry{
Repository: d.repoMetaData,
IndexMetadata: d.metaData,
Stats: stats,
}
}

// String returns a short human-readable description of the shard.
func (d *indexData) String() string {
	return fmt.Sprintf("shard(%s)", d.file.Name())
}

// memoryUse estimates the number of bytes of memory this index holds,
// ignoring Go runtime overhead.
func (d *indexData) memoryUse() int {
sz := 0
for _, a := range [][]uint32{
d.newlinesIndex, d.docSectionsIndex,
d.boundaries, d.fileNameIndex,
d.runeOffsets, d.fileNameRuneOffsets,
d.fileEndRunes, d.fileNameEndRunes,
} {
sz += 4 * len(a)
}
sz += 8 * len(d.fileBranchMasks)
sz += 12 * len(d.ngrams)
for _, v := range d.fileNameNgrams {
sz += 4*len(v) + 4
}
return sz
}
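
// Back-of-the-envelope sketch (hypothetical sizes, mirroring memoryUse's
// accounting): one million content ngrams and ten million filename posting
// entries come to roughly 52 MB.
func exampleMemoryEstimate() int {
	const numNgrams, numNamePostings = 1000000, 10000000
	return 12*numNgrams + 4*numNamePostings // 52,000,000 bytes
}
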
// bruteForceMatchFilenames finds all occurrences of the substring query in
// the concatenated file names by regexp scanning, bypassing the ngram index.
func (data *indexData) bruteForceMatchFilenames(query *query.Substring) []*candidateMatch {
quoted := regexp.QuoteMeta(query.Pattern)
if !query.CaseSensitive {
quoted = "(?i)" + quoted
}
lowerPat := toLower([]byte(query.Pattern))
re := regexp.MustCompile(quoted)
fileID := 0
startName := data.fileNameIndex[fileID]
endName := data.fileNameIndex[fileID+1]
matches := re.FindAllIndex(data.fileNameContent, -1)
substrBytes := []byte(query.Pattern)
cands := make([]*candidateMatch, 0, len(matches))
for _, match := range matches {
start := uint32(match[0])
end := uint32(match[1])
for endName < end {
fileID++
startName = data.fileNameIndex[fileID]
endName = data.fileNameIndex[fileID+1]
}
		if start < startName {
			// The match straddles a filename boundary; skip it.
			continue
}
cands = append(cands, &candidateMatch{
caseSensitive: query.CaseSensitive,
fileName: true,
substrBytes: substrBytes,
substrLowered: lowerPat,
file: uint32(fileID),
// TODO - should also populate runeOffset?
byteOffset: uint32(start - startName),
byteMatchSz: uint32(end - start),
})
}
return cands
}
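
// Usage sketch (hand-built shard fragment, hypothetical data): only the
// fields consulted by bruteForceMatchFilenames are populated. The names
// "a.go" and "b.go" are stored concatenated, bracketed by fileNameIndex at
// bytes {0, 4, 8}.
func exampleBruteForceFilenames() {
	d := &indexData{
		fileNameContent: []byte("a.gob.go"),
		fileNameIndex:   []uint32{0, 4, 8},
	}
	for _, c := range d.bruteForceMatchFilenames(&query.Substring{Pattern: "b.go"}) {
		fmt.Printf("file %d, offset %d\n", c.file, c.byteOffset) // file 1, offset 0
	}
}
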
const maxUInt32 = 0xffffffff

// minarg returns the index of the last minimal element of xs.
func minarg(xs []uint32) uint32 {
m := uint32(maxUInt32)
j := len(xs)
for i, x := range xs {
if x <= m {
m = x
j = i
}
}
return uint32(j)
}
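
// Behavioral sketch: the <= comparison above makes minarg return the last
// index holding the minimum value.
func exampleMinarg() {
	_ = minarg([]uint32{5, 2, 9, 2}) // == 3, not 1: ties resolve to the last occurrence
}
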
// ngramFrequency returns the posting count for ng, consulting the filename
// index when filename is set.
func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
if filename {
return uint32(len(data.fileNameNgrams[ng]))
}
return data.ngrams[ng].sz
}
type ngramIterationResults struct {
cands []*candidateMatch
// The ngram matches cover the pattern, so no need to check
// contents.
coversContent bool
	// The number of posting bytes read from the index.
	bytesRead uint32
}

// iterateNgrams finds candidate matches for the substring query by reading
// and intersecting the posting lists of the two least frequent ngrams in the
// pattern.
func (data *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
iter := &ngramDocIterator{
query: query,
}
if query.FileName {
iter.ends = data.fileNameEndRunes
} else {
iter.ends = data.fileEndRunes
}
str := query.Pattern
	// Find the two least frequent ngrams in the pattern.
ngramOffs := splitNGrams([]byte(query.Pattern))
frequencies := make([]uint32, 0, len(ngramOffs))
for _, o := range ngramOffs {
var freq uint32
if query.CaseSensitive {
freq = data.ngramFrequency(o.ngram, query.FileName)
} else {
for _, v := range generateCaseNgrams(o.ngram) {
freq += data.ngramFrequency(v, query.FileName)
}
}
if freq == 0 {
return &ngramIterationResults{}, nil
}
frequencies = append(frequencies, freq)
}
firstI := minarg(frequencies)
frequencies[firstI] = maxUInt32
lastI := minarg(frequencies)
if firstI > lastI {
lastI, firstI = firstI, lastI
}
firstNG := ngramOffs[firstI].ngram
lastNG := ngramOffs[lastI].ngram
iter.distance = lastI - firstI
iter.leftPad = firstI
iter.rightPad = uint32(utf8.RuneCountInString(str)-ngramSize) - lastI
postings, bytesRead, err := data.readPostings(firstNG, query.CaseSensitive, query.FileName)
if err != nil {
return nil, err
}
iter.first = postings
if firstI != lastI {
postings, sz, err := data.readPostings(lastNG, query.CaseSensitive, query.FileName)
bytesRead += sz
if err != nil {
return nil, err
}
iter.last = postings
} else {
// TODO - we could be a little faster and skip the
// list intersection
iter.last = iter.first
}
result := ngramIterationResults{
bytesRead: bytesRead,
cands: iter.candidates(),
coversContent: lastI-firstI <= ngramSize && iter.leftPad == 0 && iter.rightPad == 0,
}
return &result, nil
}
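
// Worked sketch (hypothetical frequencies): the pattern "needle" yields the
// trigrams "nee", "eed", "edl", "dle" at rune offsets 0 through 3. Assuming
// the two rarest sit at offsets 2 and 3, the padding arithmetic above works
// out as below; coversContent would be false because leftPad is nonzero, so
// the candidates still need content verification.
func examplePadArithmetic() (leftPad, rightPad uint32) {
	const runeCount, firstI, lastI = 6, 2, 3 // "needle"; rarest trigrams at 2 and 3
	return firstI, uint32(runeCount-ngramSize) - lastI // leftPad 2, rightPad 0
}
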
// readPostings returns the sorted posting list for ng, merging case variants
// when caseSensitive is unset, along with the number of posting bytes read.
func (d *indexData) readPostings(ng ngram, caseSensitive, fileName bool) ([]uint32, uint32, error) {
variants := []ngram{ng}
if !caseSensitive {
variants = generateCaseNgrams(ng)
}
// TODO - generate less garbage.
var sz uint32
postings := make([][]uint32, 0, len(variants))
for _, v := range variants {
if fileName {
postings = append(postings, d.fileNameNgrams[v])
continue
}
sec := d.ngrams[v]
blob, err := d.readSectionBlob(sec)
if err != nil {
return nil, 0, err
}
sz += sec.sz
ps := fromDeltas(blob, nil)
if len(ps) > 0 {
postings = append(postings, ps)
}
}
result := mergeUint32(postings)
return result, sz, nil
}
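
// Minimal sketch of the delta decoding (a hypothetical helper standing in
// for the real fromDeltas used above): posting offsets are stored as gaps
// from their predecessor, so the deltas {2, 3, 5} decode to offsets
// {2, 5, 10}.
func exampleDecodeDeltas(deltas []uint32) []uint32 {
	out := make([]uint32, 0, len(deltas))
	var last uint32
	for _, d := range deltas {
		last += d // each stored value is the gap to the previous offset
		out = append(out, last)
	}
	return out
}
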
// mergeUint32 merges several sorted uint32 slices into one sorted slice,
// keeping duplicates that occur across inputs.
func mergeUint32(in [][]uint32) []uint32 {
sz := 0
for _, i := range in {
sz += len(i)
}
out := make([]uint32, 0, sz)
for len(in) > 0 {
minVal := uint32(maxUInt32)
for _, n := range in {
if len(n) > 0 && n[0] < minVal {
minVal = n[0]
}
}
next := in[:0]
for _, n := range in {
if len(n) == 0 {
continue
}
if n[0] == minVal {
out = append(out, minVal)
n = n[1:]
}
if len(n) > 0 {
next = append(next, n)
}
}
in = next
}
return out
}
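
// Usage sketch: mergeUint32 merges already-sorted posting lists and keeps
// duplicates that appear in different inputs.
func exampleMergeUint32() {
	fmt.Println(mergeUint32([][]uint32{{1, 4, 7}, {2, 4}})) // [1 2 4 4 7]
}
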
// fileName returns the name of document i.
func (d *indexData) fileName(i uint32) []byte {
return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
}

// Close releases the shard's underlying index file.
func (d *indexData) Close() {
	d.file.Close()
}