// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zoekt

import (
"fmt"
"hash/crc64"
"unicode/utf8"
"github.com/google/zoekt/query"
)

// indexData holds the pattern-independent data that we have to have
// in memory to search. Most of the memory is taken up by the ngram =>
// offset index.
type indexData struct {
	file IndexFile

	ngrams map[ngram]simpleSection

	newlinesStart uint32
	newlinesIndex []uint32

	docSectionsStart uint32
	docSectionsIndex []uint32

	runeDocSections []DocumentSection

	// rune offset=>byte offset mapping, relative to the start of the content corpus
	runeOffsets []uint32

	// offsets of file contents; includes end of last file
	boundariesStart uint32
	boundaries      []uint32

	// rune offsets for the file content boundaries
	fileEndRunes []uint32

	fileNameContent []byte
	fileNameIndex   []uint32
	fileNameNgrams  map[ngram][]uint32

	// rune offset=>byte offset mapping, relative to the start of the filename corpus
	fileNameRuneOffsets []uint32

	// rune offsets for the file name boundaries
	fileNameEndRunes []uint32

	fileBranchMasks []uint64

	// mask (power of 2) => name
	branchNames map[uint]string

	// name => mask (power of 2)
	branchIDs map[string]uint

	metaData     IndexMetadata
	repoMetaData Repository

	subRepos     []uint32
	subRepoPaths []string

	// Checksums for all the files, at 8-byte intervals
	checksums []byte

	// languages for all the files.
	languages []byte

	// inverse of LanguageMap in metaData
	languageMap map[byte]string

	repoListEntry RepoListEntry
}
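
// getChecksum returns the stored CRC64 checksum of the file with index idx.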
func (d *indexData) getChecksum(idx uint32) []byte {
start := crc64.Size * idx
return d.checksums[start : start+crc64.Size]
}
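
// calculateStats computes index and content statistics for this shard and
// caches them in repoListEntry.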
func (d *indexData) calculateStats() {
var last uint32
if len(d.boundaries) > 0 {
last += d.boundaries[len(d.boundaries)-1]
}
lastFN := last
if len(d.fileNameIndex) > 0 {
lastFN = d.fileNameIndex[len(d.fileNameIndex)-1]
}
stats := RepoStats{
IndexBytes: int64(d.memoryUse()),
ContentBytes: int64(int(last) + int(lastFN)),
Documents: len(d.newlinesIndex) - 1,
Shards: 1,
}
d.repoListEntry = RepoListEntry{
Repository: d.repoMetaData,
IndexMetadata: d.metaData,
Stats: stats,
}
}
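
// String returns a short description of the shard for debugging.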
func (d *indexData) String() string {
return fmt.Sprintf("shard(%s)", d.file.Name())
}
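
// memoryUse estimates, in bytes, the memory held by the in-memory index
// structures of this shard.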
func (d *indexData) memoryUse() int {
sz := 0
for _, a := range [][]uint32{
d.newlinesIndex, d.docSectionsIndex,
d.boundaries, d.fileNameIndex,
d.runeOffsets, d.fileNameRuneOffsets,
d.fileEndRunes, d.fileNameEndRunes,
} {
sz += 4 * len(a)
}
sz += 8 * len(d.runeDocSections)
sz += 8 * len(d.fileBranchMasks)
sz += 12 * len(d.ngrams)
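	// Each filename posting is a uint32; the loop counts 4 bytes per
	// posting plus 4 bytes per map entry as rough overhead.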
for _, v := range d.fileNameNgrams {
sz += 4*len(v) + 4
}
return sz
}

const maxUInt32 = 0xffffffff
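
// firstMinarg returns the index of the leftmost minimum of xs, or len(xs)
// if xs is empty.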
func firstMinarg(xs []uint32) uint32 {
m := uint32(maxUInt32)
j := len(xs)
for i, x := range xs {
if x < m {
m = x
j = i
}
}
return uint32(j)
}
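
// lastMinarg returns the index of the rightmost minimum of xs, or len(xs)
// if xs is empty.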
func lastMinarg(xs []uint32) uint32 {
m := uint32(maxUInt32)
j := len(xs)
for i, x := range xs {
if x <= m {
m = x
j = i
}
}
return uint32(j)
}
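
// ngramFrequency returns a frequency measure for ng: the number of filename
// postings if filename is set, otherwise the size of the content posting
// section, which serves as a proxy for the occurrence count.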
func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
if filename {
return uint32(len(data.fileNameNgrams[ng]))
}
return data.ngrams[ng].sz
}
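
// ngramIterationResults wraps a matchIterator so its candidate matches can
// be annotated with information about the originating substring query.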
type ngramIterationResults struct {
matchIterator
caseSensitive bool
fileName bool
substrBytes []byte
substrLowered []byte
}

func (r *ngramIterationResults) String() string {
return fmt.Sprintf("wrapper(%v)", r.matchIterator)
}
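
// candidates returns the wrapped iterator's candidate matches, annotated
// with the query's case sensitivity and substring.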
func (r *ngramIterationResults) candidates() []*candidateMatch {
cs := r.matchIterator.candidates()
for _, c := range cs {
c.caseSensitive = r.caseSensitive
c.fileName = r.fileName
c.substrBytes = r.substrBytes
c.substrLowered = r.substrLowered
}
return cs
}
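
// iterateNgrams constructs an iterator over the documents that may contain
// the query substring, built from the query's two least frequent ngrams.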
func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
	str := query.Pattern

	// Find the 2 least common ngrams from the string.
ngramOffs := splitNGrams([]byte(query.Pattern))
frequencies := make([]uint32, 0, len(ngramOffs))
for _, o := range ngramOffs {
var freq uint32
if query.CaseSensitive {
freq = d.ngramFrequency(o.ngram, query.FileName)
} else {
for _, v := range generateCaseNgrams(o.ngram) {
freq += d.ngramFrequency(v, query.FileName)
}
}
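
		// A zero frequency means this ngram occurs nowhere, so the
		// substring as a whole cannot match.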
if freq == 0 {
return &ngramIterationResults{
matchIterator: &noMatchTree{
Why: "freq=0",
},
}, nil
		}

		frequencies = append(frequencies, freq)
}
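
	// Take the leftmost minimum first, then the rightmost minimum of the
	// remaining frequencies, so the two chosen ngrams lie as far apart in
	// the pattern as possible.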
firstI := firstMinarg(frequencies)
frequencies[firstI] = maxUInt32
lastI := lastMinarg(frequencies)
if firstI > lastI {
lastI, firstI = firstI, lastI
	}

	firstNG := ngramOffs[firstI].ngram
lastNG := ngramOffs[lastI].ngram
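
	// leftPad is the number of runes preceding the first chosen ngram;
	// rightPad spans from that ngram's start to the end of the pattern.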
iter := &ngramDocIterator{
leftPad: firstI,
rightPad: uint32(utf8.RuneCountInString(str)) - firstI,
}
if query.FileName {
iter.ends = d.fileNameEndRunes
} else {
iter.ends = d.fileEndRunes
}
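
	// With two distinct ngrams, iterate over documents in which they occur
	// at the right relative distance; with only one, a plain hit iterator
	// suffices.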
if firstI != lastI {
i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName)
if err != nil {
return nil, err
}
iter.iter = i
} else {
hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName)
if err != nil {
return nil, err
}
iter.iter = hitIter
	}

	patBytes := []byte(query.Pattern)
	lowerPatBytes := toLower(patBytes)

	return &ngramIterationResults{
matchIterator: iter,
caseSensitive: query.CaseSensitive,
fileName: query.FileName,
substrBytes: patBytes,
substrLowered: lowerPatBytes,
}, nil
}
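
// fileName returns the name of the file with index i.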
func (d *indexData) fileName(i uint32) []byte {
return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
}
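
// Close closes the underlying index file.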
func (s *indexData) Close() {
s.file.Close()
}