blob: a2d16556ea7964477d57332c7aee7b8b17dff101 [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zoekt
import (
"fmt"
"log"
"regexp"
"unicode/utf8"
"github.com/google/zoekt/query"
)
// Keep-alive for the log import. NOTE(review): log.Panicf is used in
// getDocIterator below, so this declaration is redundant.
var _ = log.Println
// indexData holds the pattern-independent data that we have to have
// in memory to search. Most of the memory is taken up by the ngram =>
// offset index.
type indexData struct {
	// file is the open shard backing this data.
	file IndexFile

	// ngrams maps an ngram to the index section holding its
	// delta-encoded posting list (decoded in readPostings).
	ngrams map[ngram]simpleSection

	newlinesIndex    []uint32
	docSectionsIndex []uint32

	// runeOffsets -- NOTE(review): presumably a rune => byte offset
	// mapping for the content; confirm against the index writer.
	runeOffsets []uint32

	// offsets of file contents. Includes end of last file.
	boundaries []uint32

	fileEnds []uint32

	// fileEndRunes is used as the per-document end markers for content
	// searches (see getNgramDocIterator).
	fileEndRunes []uint32

	// fileNameContent is the concatenation of all file names; the name
	// of file i spans fileNameIndex[i]:fileNameIndex[i+1] (see fileName).
	fileNameContent []byte
	fileNameIndex   []uint32

	// fileNameNgrams maps a file-name ngram directly to its in-memory
	// posting list.
	fileNameNgrams map[ngram][]uint32

	fileNameRuneOffsets []uint32

	// fileNameEndRunes is used as the per-document end markers for
	// file-name searches (see getNgramDocIterator).
	fileNameEndRunes []uint32

	// fileBranchMasks -- NOTE(review): presumably one branch bitmask
	// per file (cf. branchIDs/branchNames); confirm.
	fileBranchMasks []uint32

	// mask (power of 2) => name
	branchNames map[uint]string

	// name => mask (power of 2)
	branchIDs map[string]uint

	metaData     IndexMetadata
	repoMetaData Repository

	subRepos     []uint32
	subRepoPaths []string

	repoListEntry RepoListEntry
}
// String returns a short debugging description of the shard.
func (d *indexData) String() string {
	return "shard(" + d.file.Name() + ")"
}
// memoryUse estimates the number of bytes of memory held by the
// in-memory tables of this shard.
func (d *indexData) memoryUse() int {
	total := 0

	// All uint32 slices cost 4 bytes per element.
	uint32Slices := [][]uint32{
		d.newlinesIndex, d.docSectionsIndex,
		d.fileEnds, d.fileNameIndex, d.fileBranchMasks,
		d.runeOffsets, d.fileNameRuneOffsets,
		d.fileEndRunes, d.fileNameEndRunes,
	}
	for _, s := range uint32Slices {
		total += 4 * len(s)
	}

	// 12 bytes per content-ngram map entry.
	total += 12 * len(d.ngrams)

	// Per file-name ngram: 4 bytes of overhead plus its posting list.
	for _, postings := range d.fileNameNgrams {
		total += 4 + 4*len(postings)
	}
	return total
}
// getDocIterator constructs a docIterator for the given atomic query.
// Only Substring and Const queries are supported; any other type is a
// programming error and panics.
func (data *indexData) getDocIterator(q query.Q) (docIterator, error) {
	switch typed := q.(type) {
	case *query.Substring:
		if utf8.RuneCountInString(typed.Pattern) >= ngramSize {
			return data.getNgramDocIterator(typed)
		}
		// The pattern is too short for the ngram index; only file
		// names can be scanned brute-force.
		if typed.FileName {
			return data.getBruteForceFileNameDocIterator(typed), nil
		}
		return nil, fmt.Errorf("pattern %q less than %d characters", typed.Pattern, ngramSize)
	case *query.Const:
		if !typed.Value {
			// FALSE matches nothing.
			return &bruteForceIter{}, nil
		}
		return data.matchAllDocIterator(), nil
	default:
		log.Panicf("type %T", q)
		return nil, nil
	}
}
// bruteForceIter is a docIterator that hands out a precomputed set of
// candidate matches in a single batch.
type bruteForceIter struct {
	cands []*candidateMatch
}

// next returns all candidates at once.
func (i *bruteForceIter) next() []*candidateMatch {
	return i.cands
}

// coversContent reports that these candidates need no further content
// verification. NOTE(review): the exact contract of
// docIterator.coversContent is defined elsewhere; confirm.
func (i *bruteForceIter) coversContent() bool {
	return true
}
// matchAllDocIterator returns an iterator that yields one candidate
// match per file name, i.e. it matches every document.
func (data *indexData) matchAllDocIterator() docIterator {
	var cands []*candidateMatch
	if len(data.fileNameIndex) == 0 {
		return &bruteForceIter{cands}
	}

	prevEnd := uint32(0)
	for fileID, end := range data.fileNameIndex[1:] {
		name := data.fileNameContent[prevEnd:end]
		prevEnd = end
		cand := &candidateMatch{
			caseSensitive: false,
			fileName:      true,
			substrBytes:   name,
			substrLowered: name,
			file:          uint32(fileID),
			runeOffset:    0,
			byteOffset:    0,
			byteMatchSz:   uint32(len(name)),
		}
		cands = append(cands, cand)
	}
	return &bruteForceIter{cands}
}
// getBruteForceFileNameDocIterator matches a short (< ngramSize rune)
// substring query against all file names by scanning the concatenated
// fileNameContent with a regexp built from the quoted pattern.
func (data *indexData) getBruteForceFileNameDocIterator(query *query.Substring) docIterator {
	// Escape the literal pattern; prefix (?i) for case-insensitive search.
	quoted := regexp.QuoteMeta(query.Pattern)
	if !query.CaseSensitive {
		quoted = "(?i)" + quoted
	}

	lowerPat := toLower([]byte(query.Pattern))

	re := regexp.MustCompile(quoted)

	fileID := 0
	startName := data.fileNameIndex[fileID]
	endName := data.fileNameIndex[fileID+1]
	// NOTE(review): assumes at least one file name exists
	// (len(fileNameIndex) >= 2); confirm callers guarantee this.

	var cands []*candidateMatch
	// FindAllIndex yields matches in increasing byte order, so fileID
	// can be advanced monotonically to locate the file that contains
	// each match.
	for _, match := range re.FindAllIndex(data.fileNameContent, -1) {
		start := uint32(match[0])
		end := uint32(match[1])
		for endName < end {
			fileID++
			startName = data.fileNameIndex[fileID]
			endName = data.fileNameIndex[fileID+1]
		}
		if start < startName {
			// straddles a filename boundary.
			continue
		}

		cands = append(cands, &candidateMatch{
			caseSensitive: query.CaseSensitive,
			fileName:      true,
			substrBytes:   []byte(query.Pattern),
			substrLowered: lowerPat,
			file:          uint32(fileID),
			// TODO - should also populate runeOffset?
			byteOffset:  uint32(start - startName),
			byteMatchSz: uint32(end - start),
		})
	}
	return &bruteForceIter{cands}
}
const maxUInt32 = 0xffffffff

// minarg returns the index of the smallest element of xs. Ties are
// resolved in favor of the last occurrence; an empty slice yields 0.
func minarg(xs []uint32) uint32 {
	best := ^uint32(0)
	idx := len(xs)
	for i, v := range xs {
		if v <= best {
			best, idx = v, i
		}
	}
	return uint32(idx)
}
// ngramFrequency returns the posting count of ng, from the file-name
// index when filename is set, or from the content index otherwise.
func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
	if !filename {
		return data.ngrams[ng].sz
	}
	return uint32(len(data.fileNameNgrams[ng]))
}
// getNgramDocIterator builds an ngramDocIterator for a substring query
// of at least ngramSize runes, driven by the posting lists of the two
// least frequent ngrams found in the pattern.
func (data *indexData) getNgramDocIterator(query *query.Substring) (docIterator, error) {
	input := &ngramDocIterator{
		query: query,
	}
	// Document boundaries come from the file-name or content tables,
	// depending on what the query targets.
	if query.FileName {
		input.ends = data.fileNameEndRunes
	} else {
		input.ends = data.fileEndRunes
	}

	str := query.Pattern

	// Find the 2 least common ngrams from the string.
	ngramOffs := splitNGrams([]byte(query.Pattern))
	var frequencies []uint32
	for _, o := range ngramOffs {
		var freq uint32
		if query.CaseSensitive {
			freq = data.ngramFrequency(o.ngram, query.FileName)
		} else {
			// Case-insensitive: sum frequencies over all case
			// variants of this ngram.
			for _, v := range generateCaseNgrams(o.ngram) {
				freq += data.ngramFrequency(v, query.FileName)
			}
		}

		if freq == 0 {
			// Some ngram of the pattern never occurs, so the pattern
			// cannot match; return an iterator with no postings.
			return input, nil
		}
		frequencies = append(frequencies, freq)
	}

	firstI := minarg(frequencies)
	// Blank out the minimum so the second minarg call finds the
	// runner-up.
	frequencies[firstI] = maxUInt32
	lastI := minarg(frequencies)
	if firstI > lastI {
		lastI, firstI = firstI, lastI
	}

	firstNG := ngramOffs[firstI].ngram
	lastNG := ngramOffs[lastI].ngram
	input.distance = lastI - firstI
	input.leftPad = firstI
	// Runes of the pattern trailing the last chosen ngram.
	input.rightPad = uint32(utf8.RuneCountInString(str)-ngramSize) - lastI

	postings, err := data.readPostings(firstNG, query.CaseSensitive, query.FileName)
	if err != nil {
		return nil, err
	}
	input.first = postings

	if firstI != lastI {
		postings, err = data.readPostings(lastNG, query.CaseSensitive, query.FileName)
		if err != nil {
			return nil, err
		}
		input.last = postings
	} else {
		// Only one distinct ngram; reuse its postings for both ends.
		input.last = input.first
	}

	input.ng1 = firstNG
	input.ng2 = lastNG

	// When the two chosen ngrams cover the entire pattern, matching
	// postings already prove the substring occurs, so no content
	// verification is needed. NOTE(review): the <= ngramSize bound is
	// presumably the overlap condition; confirm against
	// ngramDocIterator's matching logic.
	if lastI-firstI <= ngramSize && input.leftPad == 0 && input.rightPad == 0 {
		input._coversContent = true
	}
	return input, nil
}
// readPostings returns the merged, sorted posting list for ng. For
// case-insensitive queries, postings of all case variants of ng are
// read and merged together.
func (d *indexData) readPostings(ng ngram, caseSensitive, fileName bool) ([]uint32, error) {
	variants := []ngram{ng}
	if !caseSensitive {
		variants = generateCaseNgrams(ng)
	}

	// TODO - generate less garbage.
	lists := make([][]uint32, 0, len(variants))
	for _, v := range variants {
		if fileName {
			// File-name postings are already decoded in memory.
			lists = append(lists, d.fileNameNgrams[v])
			continue
		}

		// Content postings are stored delta-encoded on disk.
		blob, err := d.readSectionBlob(d.ngrams[v])
		if err != nil {
			return nil, err
		}
		if ps := fromDeltas(blob, nil); len(ps) > 0 {
			lists = append(lists, ps)
		}
	}
	return mergeUint32(lists), nil
}
// mergeUint32 merges the sorted slices in into one sorted slice.
// Elements occurring in several inputs (or repeatedly in one input)
// are all kept.
func mergeUint32(in [][]uint32) []uint32 {
	total := 0
	for _, s := range in {
		total += len(s)
	}
	merged := make([]uint32, 0, total)

	for len(in) > 0 {
		// Find the smallest head element across all inputs.
		smallest := ^uint32(0)
		for _, s := range in {
			if len(s) > 0 && s[0] < smallest {
				smallest = s[0]
			}
		}

		// Consume one head per input that carries the smallest value,
		// compacting exhausted inputs out of the list in place.
		remaining := in[:0]
		for _, s := range in {
			if len(s) == 0 {
				continue
			}
			if s[0] == smallest {
				merged = append(merged, smallest)
				s = s[1:]
			}
			if len(s) > 0 {
				remaining = append(remaining, s)
			}
		}
		in = remaining
	}
	return merged
}
// fileName returns the name of file i as a subslice of the
// concatenated file-name content.
func (d *indexData) fileName(i uint32) []byte {
	start := d.fileNameIndex[i]
	end := d.fileNameIndex[i+1]
	return d.fileNameContent[start:end]
}
// Close closes the underlying index file.
//
// The receiver is named d for consistency with the other *indexData
// methods (the original used s).
func (d *indexData) Close() {
	d.file.Close()
}