blob: 524b6b5551003da64cf04aeab90d54245d20ed8d [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zoekt
import (
"fmt"
"hash/crc64"
"unicode/utf8"
"github.com/google/zoekt/query"
)
// indexData holds the pattern-independent data that we have to have
// in memory to search. Most of the memory is taken up by the ngram =>
// offset index.
type indexData struct {
	// The shard this data was loaded from.
	file IndexFile

	// content ngram => section (offset, size) of its posting list
	// within the shard; postings are read lazily in readPostings.
	ngrams map[ngram]simpleSection

	// Newline index section start and entries. NOTE(review): exact
	// layout is defined by the shard reader, not visible here;
	// len(newlinesIndex)-1 is used as the document count in
	// calculateStats.
	newlinesStart uint32
	newlinesIndex []uint32

	// Document-section index; presumably mirrors the newline index
	// layout — TODO confirm against the shard reader.
	docSectionsStart uint32
	docSectionsIndex []uint32

	// rune offset=>byte offset mapping, relative to the start of the content corpus
	runeOffsets []uint32

	// offsets of file contents; includes end of last file
	boundariesStart uint32
	boundaries      []uint32

	// rune offsets for the file content boundaries
	fileEndRunes []uint32

	// Concatenated file names; fileNameIndex[i] is the start of name
	// i, and the final entry is the total size of the name corpus.
	fileNameContent []byte
	fileNameIndex   []uint32

	// filename ngram => in-memory posting list (no lazy section read,
	// unlike content ngrams).
	fileNameNgrams map[ngram][]uint32

	// rune offset=>byte offset mapping, relative to the start of the filename corpus
	fileNameRuneOffsets []uint32

	// rune offsets for the file name boundaries
	fileNameEndRunes []uint32

	// Per-document branch bitmask — presumably ANDed against
	// branchIDs masks; confirm against the matcher.
	fileBranchMasks []uint64

	// mask (power of 2) => name
	branchNames map[uint]string

	// name => mask (power of 2)
	branchIDs map[string]uint

	metaData     IndexMetadata
	repoMetaData Repository

	// Sub-repository data. NOTE(review): inferred from names —
	// subRepos looks like a per-document index into subRepoPaths;
	// verify in the reader.
	subRepos     []uint32
	subRepoPaths []string

	// Checksums for all the files, at 8-byte intervals
	checksums []byte

	// languages for all the files.
	languages []byte

	// inverse of LanguageMap in metaData
	languageMap map[byte]string

	// Cached list entry, populated by calculateStats.
	repoListEntry RepoListEntry
}
// getChecksum returns the crc64 checksum bytes for the document at
// index idx. Checksums are packed back to back in d.checksums at
// crc64.Size-byte intervals.
func (d *indexData) getChecksum(idx uint32) []byte {
	off := idx * crc64.Size
	return d.checksums[off : off+crc64.Size]
}
// calculateStats populates d.repoListEntry with the repository
// metadata and size statistics derived from the loaded index data.
func (d *indexData) calculateStats() {
	// Size of the content corpus: the final boundary entry covers the
	// end of the last file.
	var contentBytes uint32
	if n := len(d.boundaries); n > 0 {
		contentBytes = d.boundaries[n-1]
	}

	// Size of the filename corpus; falls back to the content size
	// when the name index is empty (preserves original behavior).
	nameBytes := contentBytes
	if n := len(d.fileNameIndex); n > 0 {
		nameBytes = d.fileNameIndex[n-1]
	}

	d.repoListEntry = RepoListEntry{
		Repository:    d.repoMetaData,
		IndexMetadata: d.metaData,
		Stats: RepoStats{
			IndexBytes:   int64(d.memoryUse()),
			ContentBytes: int64(int(contentBytes) + int(nameBytes)),
			// assumes newlinesIndex carries one trailing entry past
			// the last document — TODO confirm in the reader.
			Documents: len(d.newlinesIndex) - 1,
		},
	}
}
// String identifies the shard by the name of its backing file, for
// logging and debugging.
func (d *indexData) String() string {
	shardName := d.file.Name()
	return fmt.Sprintf("shard(%s)", shardName)
}
// memoryUse estimates the number of bytes of heap held by the
// in-memory index structures (slices counted at element width, maps
// at a rough per-entry cost).
func (d *indexData) memoryUse() int {
	total := 0

	uint32Slices := [][]uint32{
		d.newlinesIndex, d.docSectionsIndex,
		d.boundaries, d.fileNameIndex,
		d.runeOffsets, d.fileNameRuneOffsets,
		d.fileEndRunes, d.fileNameEndRunes,
	}
	for _, s := range uint32Slices {
		total += 4 * len(s)
	}

	total += 8 * len(d.fileBranchMasks)

	// Rough per-entry cost of the ngram => section map.
	total += 12 * len(d.ngrams)

	// Filename postings: 4 bytes per entry plus overhead per ngram.
	for _, postings := range d.fileNameNgrams {
		total += 4 + 4*len(postings)
	}
	return total
}
const maxUInt32 = 0xffffffff

// minarg returns the index of the smallest element of xs. Ties are
// broken in favor of the last occurrence; an empty slice yields 0.
func minarg(xs []uint32) uint32 {
	arg := uint32(len(xs))
	min := uint32(maxUInt32)
	for i, v := range xs {
		// "<=" (not "<") so a later equal minimum wins.
		if v <= min {
			arg = uint32(i)
			min = v
		}
	}
	return arg
}
// ngramFrequency reports how many postings exist for ng: the length
// of the in-memory filename posting list, or the stored section size
// for content ngrams.
func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
	if !filename {
		return data.ngrams[ng].sz
	}
	return uint32(len(data.fileNameNgrams[ng]))
}
// ngramIterationResults is the outcome of scanning posting lists for
// a substring query in iterateNgrams.
type ngramIterationResults struct {
	// Candidate matches from the posting-list intersection; unless
	// coversContent is set they still need verification against the
	// actual corpus.
	cands []*candidateMatch

	// The ngram matches cover the pattern, so no need to check
	// contents.
	coversContent bool

	// The number of posting bytes
	bytesRead uint32
}
// iterateNgrams finds candidate positions for a substring query by
// reading and intersecting the posting lists of the two rarest ngrams
// in the pattern. It returns an empty result when any pattern ngram
// has zero occurrences. NOTE(review): assumes the pattern is at least
// ngramSize runes (an empty ngramOffs would panic below) — presumably
// guaranteed by the caller; confirm.
func (data *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
	iter := &ngramDocIterator{
		query: query,
	}
	// Document boundaries come from the name or content corpus
	// depending on what is searched.
	if query.FileName {
		iter.ends = data.fileNameEndRunes
	} else {
		iter.ends = data.fileEndRunes
	}

	str := query.Pattern

	// Find the 2 least common ngrams from the string.
	ngramOffs := splitNGrams([]byte(query.Pattern))
	frequencies := make([]uint32, 0, len(ngramOffs))
	for _, o := range ngramOffs {
		var freq uint32
		if query.CaseSensitive {
			freq = data.ngramFrequency(o.ngram, query.FileName)
		} else {
			// Case-insensitive: sum the frequencies of every case
			// variant of this ngram.
			for _, v := range generateCaseNgrams(o.ngram) {
				freq += data.ngramFrequency(v, query.FileName)
			}
		}

		// If any ngram never occurs, the whole pattern cannot match
		// anywhere in this shard.
		if freq == 0 {
			return &ngramIterationResults{}, nil
		}

		frequencies = append(frequencies, freq)
	}
	firstI := minarg(frequencies)
	// Knock out the first pick so the second minarg finds the next
	// rarest ngram.
	frequencies[firstI] = maxUInt32
	lastI := minarg(frequencies)
	// Keep the pair ordered by position in the pattern.
	if firstI > lastI {
		lastI, firstI = firstI, lastI
	}

	firstNG := ngramOffs[firstI].ngram
	lastNG := ngramOffs[lastI].ngram
	// Rune distance between the two chosen ngrams, plus the pattern
	// runes before the first ngram and after the last one.
	iter.distance = lastI - firstI
	iter.leftPad = firstI
	iter.rightPad = uint32(utf8.RuneCountInString(str)-ngramSize) - lastI

	postings, bytesRead, err := data.readPostings(firstNG, query.CaseSensitive, query.FileName)
	if err != nil {
		return nil, err
	}
	iter.first = postings

	if firstI != lastI {
		postings, sz, err := data.readPostings(lastNG, query.CaseSensitive, query.FileName)
		bytesRead += sz
		if err != nil {
			return nil, err
		}
		iter.last = postings
	} else {
		// TODO - we could be a little faster and skip the
		// list intersection
		iter.last = iter.first
	}

	result := ngramIterationResults{
		bytesRead: bytesRead,
		cands:     iter.candidates(),
		// The two ngrams close enough together with no pad on either
		// side means the ngram hits span the whole pattern.
		coversContent: lastI-firstI <= ngramSize && iter.leftPad == 0 && iter.rightPad == 0,
	}
	return &result, nil
}
// readPostings loads the merged, ascending posting list for ng. When
// caseSensitive is false, the lists of all case variants are merged.
// The second return value is the number of posting bytes read from
// the shard (always 0 for filename postings, which live in memory).
func (d *indexData) readPostings(ng ngram, caseSensitive, fileName bool) ([]uint32, uint32, error) {
	variants := []ngram{ng}
	if !caseSensitive {
		variants = generateCaseNgrams(ng)
	}

	// TODO - generate less garbage.
	var bytesRead uint32
	lists := make([][]uint32, 0, len(variants))
	for _, variant := range variants {
		if fileName {
			// Filename postings are kept decoded in memory.
			lists = append(lists, d.fileNameNgrams[variant])
			continue
		}

		// Content postings are delta-encoded in a shard section.
		sec := d.ngrams[variant]
		blob, err := d.readSectionBlob(sec)
		if err != nil {
			return nil, 0, err
		}
		bytesRead += sec.sz
		if decoded := fromDeltas(blob, nil); len(decoded) > 0 {
			lists = append(lists, decoded)
		}
	}
	return mergeUint32(lists), bytesRead, nil
}
// mergeUint32 merges ascending posting lists into a single ascending
// list. A value at the head of several lists is emitted once per list
// (cross-list duplicates are preserved, matching the original
// round-based merge).
func mergeUint32(in [][]uint32) []uint32 {
	total := 0
	for _, l := range in {
		total += len(l)
	}
	out := make([]uint32, 0, total)

	for len(in) > 0 {
		// Find the smallest head among the remaining lists.
		smallest := ^uint32(0)
		for _, l := range in {
			if len(l) > 0 && l[0] < smallest {
				smallest = l[0]
			}
		}

		// Pop (and emit) the smallest head from every list carrying
		// it, dropping lists that become empty. Reuses in's backing
		// array for the surviving lists.
		remaining := in[:0]
		for _, l := range in {
			if len(l) == 0 {
				continue
			}
			if l[0] == smallest {
				out = append(out, smallest)
				l = l[1:]
			}
			if len(l) > 0 {
				remaining = append(remaining, l)
			}
		}
		in = remaining
	}
	return out
}
// fileName returns the name of document i, sliced out of the
// concatenated name corpus.
func (d *indexData) fileName(i uint32) []byte {
	start, end := d.fileNameIndex[i], d.fileNameIndex[i+1]
	return d.fileNameContent[start:end]
}
// Close releases the backing IndexFile. The receiver is named d for
// consistency with the other indexData methods (it was previously s).
func (d *indexData) Close() {
	d.file.Close()
}