indexdata.go - zoekt - Git at Google

 // Copyright 2016 Google Inc. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package zoekt

 import (
 	"fmt"
 	"hash/crc64"
 	"unicode/utf8"

 	"github.com/google/zoekt/query"
 )

 // indexData holds the pattern-independent data that we have to have
 // in memory to search. Most of the memory is taken up by the ngram =>
 // offset index.
 type indexData struct {
 	file IndexFile

 	ngrams map[ngram]simpleSection

 	newlinesStart uint32
 	newlinesIndex []uint32

 	docSectionsStart uint32
 	docSectionsIndex []uint32

 	runeDocSections []DocumentSection

 	// rune offset=>byte offset mapping, relative to the start of the content corpus
 	runeOffsets []uint32

 	// offsets of file contents; includes end of last file
 	boundariesStart uint32
 	boundaries      []uint32

 	// rune offsets for the file content boundaries
 	fileEndRunes []uint32

 	fileNameContent []byte
 	fileNameIndex   []uint32
 	fileNameNgrams  map[ngram][]uint32

 	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 	fileNameRuneOffsets []uint32

 	// rune offsets for the file name boundaries
 	fileNameEndRunes []uint32

 	fileBranchMasks []uint64

 	// mask (power of 2) => name
 	branchNames map[uint]string

 	// name => mask (power of 2)
 	branchIDs map[string]uint

 	metaData     IndexMetadata
 	repoMetaData Repository

 	subRepos     []uint32
 	subRepoPaths []string

 	// Checksums for all the files, at 8-byte intervals
 	checksums []byte

 	// languages for all the files.
 	languages []byte

 	// inverse of LanguageMap in metaData
 	languageMap map[byte]string

 	repoListEntry RepoListEntry
 }

 func (d *indexData) getChecksum(idx uint32) []byte {
 	start := crc64.Size * idx
 	return d.checksums[start : start+crc64.Size]
 }

 func (d *indexData) calculateStats() {
 	var last uint32
 	if len(d.boundaries) > 0 {
 		last += d.boundaries[len(d.boundaries)-1]
 	}

 	lastFN := last
 	if len(d.fileNameIndex) > 0 {
 		lastFN = d.fileNameIndex[len(d.fileNameIndex)-1]
 	}

 	stats := RepoStats{
 		IndexBytes:   int64(d.memoryUse()),
 		ContentBytes: int64(int(last) + int(lastFN)),
 		Documents:    len(d.newlinesIndex) - 1,
 	}
 	d.repoListEntry = RepoListEntry{
 		Repository:    d.repoMetaData,
 		IndexMetadata: d.metaData,
 		Stats:         stats,
 	}
 }

 func (d *indexData) String() string {
 	return fmt.Sprintf("shard(%s)", d.file.Name())
 }

 func (d *indexData) memoryUse() int {
 	sz := 0
 	for _, a := range [][]uint32{
 		d.newlinesIndex, d.docSectionsIndex,
 		d.boundaries, d.fileNameIndex,
 		d.runeOffsets, d.fileNameRuneOffsets,
 		d.fileEndRunes, d.fileNameEndRunes,
 	} {
 		sz += 4 * len(a)
 	}
 	sz += 8 * len(d.runeDocSections)
 	sz += 8 * len(d.fileBranchMasks)
 	sz += 12 * len(d.ngrams)
 	for _, v := range d.fileNameNgrams {
 		sz += 4*len(v) + 4
 	}
 	return sz
 }

 const maxUInt32 = 0xffffffff

 func firstMinarg(xs []uint32) uint32 {
 	m := uint32(maxUInt32)
 	j := len(xs)
 	for i, x := range xs {
 		if x < m {
 			m = x
 			j = i
 		}
 	}
 	return uint32(j)
 }

 func lastMinarg(xs []uint32) uint32 {
 	m := uint32(maxUInt32)
 	j := len(xs)
 	for i, x := range xs {
 		if x <= m {
 			m = x
 			j = i
 		}
 	}
 	return uint32(j)
 }

 func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
 	if filename {
 		return uint32(len(data.fileNameNgrams[ng]))
 	}

 	return data.ngrams[ng].sz
 }

 type ngramIterationResults struct {
 	matchIterator

 	// The ngram matches cover the pattern, so no need to check
 	// contents.
 	coversContent bool

 	caseSensitive bool
 	fileName      bool
 	substrBytes   []byte
 	substrLowered []byte
 	byteMatchSz   uint32
 }

 func (r *ngramIterationResults) String() string {
 	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
 }

 func (r *ngramIterationResults) candidates() []*candidateMatch {
 	cs := r.matchIterator.candidates()
 	for _, c := range cs {
 		c.caseSensitive = r.caseSensitive
 		c.fileName = r.fileName
 		c.substrBytes = r.substrBytes
 		c.substrLowered = r.substrLowered
 		c.byteMatchSz = r.byteMatchSz
 	}
 	return cs
 }

 func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
 	str := query.Pattern

 	// Find the 2 least common ngrams from the string.
 	ngramOffs := splitNGrams([]byte(query.Pattern))
 	frequencies := make([]uint32, 0, len(ngramOffs))
 	for _, o := range ngramOffs {
 		var freq uint32
 		if query.CaseSensitive {
 			freq = d.ngramFrequency(o.ngram, query.FileName)
 		} else {
 			for _, v := range generateCaseNgrams(o.ngram) {
 				freq += d.ngramFrequency(v, query.FileName)
 			}
 		}

 		if freq == 0 {
 			return &ngramIterationResults{
 				matchIterator: &noMatchTree{
 					Why: "freq=0",
 				},
 			}, nil
 		}

 		frequencies = append(frequencies, freq)
 	}
 	firstI := firstMinarg(frequencies)
 	frequencies[firstI] = maxUInt32
 	lastI := lastMinarg(frequencies)
 	if firstI > lastI {
 		lastI, firstI = firstI, lastI
 	}

 	firstNG := ngramOffs[firstI].ngram
 	lastNG := ngramOffs[lastI].ngram
 	iter := &ngramDocIterator{
 		leftPad:  firstI,
 		rightPad: uint32(utf8.RuneCountInString(str)) - firstI,
 	}
 	if query.FileName {
 		iter.ends = d.fileNameEndRunes
 	} else {
 		iter.ends = d.fileEndRunes
 	}

 	var coversContent bool
 	if firstI != lastI {
 		i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName)

 		if err != nil {
 			return nil, err
 		}

 		iter.iter = i

 		coversContent = (lastI-firstI <= ngramSize && iter.leftPad == 0 && iter.rightPad == (lastI+ngramSize))
 	} else {
 		hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName)
 		if err != nil {
 			return nil, err
 		}
 		iter.iter = hitIter
 		coversContent = (iter.leftPad == 0 && iter.rightPad == 3)
 	}

 	patBytes := []byte(query.Pattern)
 	lowerPatBytes := toLower(patBytes)

 	return &ngramIterationResults{
 		matchIterator: iter,
 		coversContent: coversContent,
 		caseSensitive: query.CaseSensitive,
 		fileName:      query.FileName,
 		substrBytes:   patBytes,
 		substrLowered: lowerPatBytes,
 		// TODO - this is wrong for casefolding searches.
 		byteMatchSz: uint32(len(lowerPatBytes)),
 	}, nil
 }

 func (d *indexData) fileName(i uint32) []byte {
 	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
 }

 func (s *indexData) Close() {
 	s.file.Close()
 }
	// Copyright 2016 Google Inc. All rights reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package zoekt

	import (
	"fmt"
	"hash/crc64"
	"unicode/utf8"

	"github.com/google/zoekt/query"
	)

	// indexData holds the pattern-independent data that we have to have
	// in memory to search. Most of the memory is taken up by the ngram =>
	// offset index.
	type indexData struct {
	file IndexFile

	ngrams map[ngram]simpleSection

	newlinesStart uint32
	newlinesIndex []uint32

	docSectionsStart uint32
	docSectionsIndex []uint32

	runeDocSections []DocumentSection

	// rune offset=>byte offset mapping, relative to the start of the content corpus
	runeOffsets []uint32

	// offsets of file contents; includes end of last file
	boundariesStart uint32
	boundaries []uint32

	// rune offsets for the file content boundaries
	fileEndRunes []uint32

	fileNameContent []byte
	fileNameIndex []uint32
	fileNameNgrams map[ngram][]uint32

	// rune offset=>byte offset mapping, relative to the start of the filename corpus
	fileNameRuneOffsets []uint32

	// rune offsets for the file name boundaries
	fileNameEndRunes []uint32

	fileBranchMasks []uint64

	// mask (power of 2) => name
	branchNames map[uint]string

	// name => mask (power of 2)
	branchIDs map[string]uint

	metaData IndexMetadata
	repoMetaData Repository

	subRepos []uint32
	subRepoPaths []string

	// Checksums for all the files, at 8-byte intervals
	checksums []byte

	// languages for all the files.
	languages []byte

	// inverse of LanguageMap in metaData
	languageMap map[byte]string

	repoListEntry RepoListEntry
	}

	func (d *indexData) getChecksum(idx uint32) []byte {
	start := crc64.Size * idx
	return d.checksums[start : start+crc64.Size]
	}

	func (d *indexData) calculateStats() {
	var last uint32
	if len(d.boundaries) > 0 {
	last += d.boundaries[len(d.boundaries)-1]
	}

	lastFN := last
	if len(d.fileNameIndex) > 0 {
	lastFN = d.fileNameIndex[len(d.fileNameIndex)-1]
	}

	stats := RepoStats{
	IndexBytes: int64(d.memoryUse()),
	ContentBytes: int64(int(last) + int(lastFN)),
	Documents: len(d.newlinesIndex) - 1,
	}
	d.repoListEntry = RepoListEntry{
	Repository: d.repoMetaData,
	IndexMetadata: d.metaData,
	Stats: stats,
	}
	}

	func (d *indexData) String() string {
	return fmt.Sprintf("shard(%s)", d.file.Name())
	}

	func (d *indexData) memoryUse() int {
	sz := 0
	for _, a := range [][]uint32{
	d.newlinesIndex, d.docSectionsIndex,
	d.boundaries, d.fileNameIndex,
	d.runeOffsets, d.fileNameRuneOffsets,
	d.fileEndRunes, d.fileNameEndRunes,
	} {
	sz += 4 * len(a)
	}
	sz += 8 * len(d.runeDocSections)
	sz += 8 * len(d.fileBranchMasks)
	sz += 12 * len(d.ngrams)
	for _, v := range d.fileNameNgrams {
	sz += 4*len(v) + 4
	}
	return sz
	}

	const maxUInt32 = 0xffffffff

	func firstMinarg(xs []uint32) uint32 {
	m := uint32(maxUInt32)
	j := len(xs)
	for i, x := range xs {
	if x < m {
	m = x
	j = i
	}
	}
	return uint32(j)
	}

	func lastMinarg(xs []uint32) uint32 {
	m := uint32(maxUInt32)
	j := len(xs)
	for i, x := range xs {
	if x <= m {
	m = x
	j = i
	}
	}
	return uint32(j)
	}

	func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
	if filename {
	return uint32(len(data.fileNameNgrams[ng]))
	}

	return data.ngrams[ng].sz
	}

	type ngramIterationResults struct {
	matchIterator

	// The ngram matches cover the pattern, so no need to check
	// contents.
	coversContent bool

	caseSensitive bool
	fileName bool
	substrBytes []byte
	substrLowered []byte
	byteMatchSz uint32
	}

	func (r *ngramIterationResults) String() string {
	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
	}

	func (r ngramIterationResults) candidates() []candidateMatch {
	cs := r.matchIterator.candidates()
	for _, c := range cs {
	c.caseSensitive = r.caseSensitive
	c.fileName = r.fileName
	c.substrBytes = r.substrBytes
	c.substrLowered = r.substrLowered
	c.byteMatchSz = r.byteMatchSz
	}
	return cs
	}

	func (d indexData) iterateNgrams(query query.Substring) (*ngramIterationResults, error) {
	str := query.Pattern

	// Find the 2 least common ngrams from the string.
	ngramOffs := splitNGrams([]byte(query.Pattern))
	frequencies := make([]uint32, 0, len(ngramOffs))
	for _, o := range ngramOffs {
	var freq uint32
	if query.CaseSensitive {
	freq = d.ngramFrequency(o.ngram, query.FileName)
	} else {
	for _, v := range generateCaseNgrams(o.ngram) {
	freq += d.ngramFrequency(v, query.FileName)
	}
	}

	if freq == 0 {
	return &ngramIterationResults{
	matchIterator: &noMatchTree{
	Why: "freq=0",
	},
	}, nil
	}

	frequencies = append(frequencies, freq)
	}
	firstI := firstMinarg(frequencies)
	frequencies[firstI] = maxUInt32
	lastI := lastMinarg(frequencies)
	if firstI > lastI {
	lastI, firstI = firstI, lastI
	}

	firstNG := ngramOffs[firstI].ngram
	lastNG := ngramOffs[lastI].ngram
	iter := &ngramDocIterator{
	leftPad: firstI,
	rightPad: uint32(utf8.RuneCountInString(str)) - firstI,
	}
	if query.FileName {
	iter.ends = d.fileNameEndRunes
	} else {
	iter.ends = d.fileEndRunes
	}

	var coversContent bool
	if firstI != lastI {
	i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName)

	if err != nil {
	return nil, err
	}

	iter.iter = i

	coversContent = (lastI-firstI <= ngramSize && iter.leftPad == 0 && iter.rightPad == (lastI+ngramSize))
	} else {
	hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName)
	if err != nil {
	return nil, err
	}
	iter.iter = hitIter
	coversContent = (iter.leftPad == 0 && iter.rightPad == 3)
	}

	patBytes := []byte(query.Pattern)
	lowerPatBytes := toLower(patBytes)

	return &ngramIterationResults{
	matchIterator: iter,
	coversContent: coversContent,
	caseSensitive: query.CaseSensitive,
	fileName: query.FileName,
	substrBytes: patBytes,
	substrLowered: lowerPatBytes,
	// TODO - this is wrong for casefolding searches.
	byteMatchSz: uint32(len(lowerPatBytes)),
	}, nil
	}

	func (d *indexData) fileName(i uint32) []byte {
	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
	}

	func (s *indexData) Close() {
	s.file.Close()
	}