// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zoekt
import (
"encoding/binary"
"encoding/json"
"fmt"
"sort"
)
// IndexFile is a file suitable for concurrent read access. For
// performance reasons, it permits an mmap'd implementation.
type IndexFile interface {
Read(off uint32, sz uint32) ([]byte, error)
Size() (uint32, error)
Close()
Name() string
}
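
// memIndexFile is an illustrative sketch of the interface above: a
// trivial in-memory IndexFile, e.g. for tests. Real shards are
// typically served through an mmap-backed implementation.
type memIndexFile struct {
	name string
	data []byte
}

func (m *memIndexFile) Read(off uint32, sz uint32) ([]byte, error) {
	if uint64(off)+uint64(sz) > uint64(len(m.data)) {
		return nil, fmt.Errorf("read [%d, %d) out of bounds, size %d", off, off+sz, len(m.data))
	}
	return m.data[off : off+sz], nil
}

func (m *memIndexFile) Size() (uint32, error) { return uint32(len(m.data)), nil }
func (m *memIndexFile) Close()                {}
func (m *memIndexFile) Name() string          { return m.name }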
// reader is a stateful wrapper around an IndexFile that tracks the
// current read offset.
type reader struct {
r IndexFile
off uint32
}
func (r *reader) seek(off uint32) {
r.off = off
}
func (r *reader) U32() (uint32, error) {
b, err := r.r.Read(r.off, 4)
r.off += 4
if err != nil {
return 0, err
}
return binary.BigEndian.Uint32(b), nil
}
func (r *reader) U64() (uint64, error) {
b, err := r.r.Read(r.off, 8)
r.off += 8
if err != nil {
return 0, err
}
return binary.BigEndian.Uint64(b), nil
}
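
// A reader is used by seeking to a section offset and then consuming
// fixed-width big-endian integers, e.g.:
//
//	rd := &reader{r: f}
//	rd.seek(sec.off)
//	count, err := rd.U32()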
func (r *reader) readTOC(toc *indexTOC) error {
sz, err := r.r.Size()
if err != nil {
return err
}
	if sz < 8 {
		return fmt.Errorf("index file too small: %d bytes", sz)
	}
	r.off = sz - 8
var tocSection simpleSection
if err := tocSection.read(r); err != nil {
return err
}
r.seek(tocSection.off)
sectionCount, err := r.U32()
if err != nil {
return err
}
secs := toc.sections()
if len(secs) != int(sectionCount) {
return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
}
	for _, s := range secs {
if err := s.read(r); err != nil {
return err
}
}
return nil
}
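
// For orientation, the shard layout that readTOC assumes is roughly:
//
//	[section data ...][TOC][off uint32][sz uint32]
//	                       \__ final 8 bytes __/
//
// The trailing 8 bytes form a simpleSection pointing at the TOC itself,
// which is why reading starts at Size() - 8.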
func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
return r.file.Read(sec.off, sec.sz)
}
func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
if sec.sz%4 != 0 {
return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
}
blob, err := f.Read(sec.off, sec.sz)
if err != nil {
return nil, err
}
arr := make([]uint32, 0, len(blob)/4)
for len(blob) > 0 {
arr = append(arr, binary.BigEndian.Uint32(blob))
blob = blob[4:]
}
return arr, nil
}
func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
if sec.sz%8 != 0 {
return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
}
blob, err := f.Read(sec.off, sec.sz)
if err != nil {
return nil, err
}
arr := make([]uint64, 0, len(blob)/8)
for len(blob) > 0 {
arr = append(arr, binary.BigEndian.Uint64(blob))
blob = blob[8:]
}
return arr, nil
}
func (r *reader) readJSON(data interface{}, sec *simpleSection) error {
blob, err := r.r.Read(sec.off, sec.sz)
if err != nil {
return err
}
return json.Unmarshal(blob, data)
}
func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
d := indexData{
file: r.r,
ngrams: map[ngram]simpleSection{},
fileNameNgrams: map[ngram][]uint32{},
branchIDs: map[string]uint{},
branchNames: map[uint]string{},
}
blob, err := d.readSectionBlob(toc.metaData)
if err != nil {
return nil, err
}
if err := json.Unmarshal(blob, &d.metaData); err != nil {
return nil, err
}
if d.metaData.IndexFormatVersion != IndexFormatVersion {
return nil, fmt.Errorf("file is v%d, want v%d", d.metaData.IndexFormatVersion, IndexFormatVersion)
}
blob, err = d.readSectionBlob(toc.repoMetaData)
if err != nil {
return nil, err
}
if err := json.Unmarshal(blob, &d.repoMetaData); err != nil {
return nil, err
}
d.boundariesStart = toc.fileContents.data.off
d.boundaries = toc.fileContents.relativeIndex()
d.newlinesStart = toc.newlines.data.off
d.newlinesIndex = toc.newlines.relativeIndex()
d.docSectionsStart = toc.fileSections.data.off
d.docSectionsIndex = toc.fileSections.relativeIndex()
d.checksums, err = d.readSectionBlob(toc.contentChecksums)
if err != nil {
return nil, err
}
d.languages, err = d.readSectionBlob(toc.languages)
if err != nil {
return nil, err
}
d.ngrams, err = d.readNgrams(toc)
if err != nil {
return nil, err
}
d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
if err != nil {
return nil, err
}
d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
if err != nil {
return nil, err
}
d.fileNameIndex = toc.fileNames.relativeIndex()
d.fileNameNgrams, err = d.readFileNameNgrams(toc)
if err != nil {
return nil, err
}
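
	// Each branch gets a single-bit ID so that branch membership can be
	// stored as a bitmask: e.g. with branches [master, stable], master
	// gets ID 1 and stable ID 2, and a fileBranchMasks entry of 3 marks
	// a file that exists on both branches.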
for j, br := range d.repoMetaData.Branches {
id := uint(1) << uint(j)
d.branchIDs[br.Name] = id
d.branchNames[id] = br.Name
}
blob, err = d.readSectionBlob(toc.runeDocSections)
if err != nil {
return nil, err
}
d.runeDocSections = unmarshalDocSections(blob, nil)
for sect, dest := range map[simpleSection]*[]uint32{
toc.subRepos: &d.subRepos,
toc.runeOffsets: &d.runeOffsets,
toc.nameRuneOffsets: &d.fileNameRuneOffsets,
toc.nameEndRunes: &d.fileNameEndRunes,
toc.fileEndRunes: &d.fileEndRunes,
} {
if blob, err := d.readSectionBlob(sect); err != nil {
return nil, err
} else {
*dest = fromSizedDeltas(blob, nil)
}
}
keys := []string{""}
for k := range d.repoMetaData.SubRepoMap {
if k != "" { // we used to marshal "" in SubRepoMap. Prevent adding twice.
keys = append(keys, k)
}
}
sort.Strings(keys)
d.subRepoPaths = keys
d.languageMap = map[byte]string{}
for k, v := range d.metaData.LanguageMap {
d.languageMap[v] = k
}
if err := d.verify(); err != nil {
return nil, err
}
d.calculateStats()
return &d, nil
}
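
// ngramEncoding is the width of one on-disk ngram: a big-endian uint64.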
const ngramEncoding = 8
func (d *indexData) readNgrams(toc *indexTOC) (map[ngram]simpleSection, error) {
textContent, err := d.readSectionBlob(toc.ngramText)
if err != nil {
return nil, err
}
postingsIndex := toc.postings.relativeIndex()
ngrams := make(map[ngram]simpleSection, len(textContent)/ngramEncoding)
for i := 0; i < len(textContent); i += ngramEncoding {
j := i / ngramEncoding
ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
ngrams[ng] = simpleSection{
toc.postings.data.off + postingsIndex[j],
postingsIndex[j+1] - postingsIndex[j],
}
}
return ngrams, nil
}
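
// The layout decoded above: toc.ngramText holds consecutive 8-byte
// big-endian ngram values, and the j-th ngram's posting list occupies
// [postingsIndex[j], postingsIndex[j+1]) within the postings data,
// relative to toc.postings.data.off.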
func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]uint32, error) {
nameNgramText, err := d.readSectionBlob(toc.nameNgramText)
if err != nil {
return nil, err
}
fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data)
if err != nil {
return nil, err
}
fileNamePostingsIndex := toc.namePostings.relativeIndex()
fileNameNgrams := make(map[ngram][]uint32, len(nameNgramText)/ngramEncoding)
for i := 0; i < len(nameNgramText); i += ngramEncoding {
j := i / ngramEncoding
off := fileNamePostingsIndex[j]
end := fileNamePostingsIndex[j+1]
ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding]))
fileNameNgrams[ng] = fromDeltas(fileNamePostingsData[off:end], nil)
}
return fileNameNgrams, nil
}
func (d *indexData) verify() error {
	// This is not an exhaustive check: the posting lists can easily
	// generate out-of-bounds accesses and are expensive to verify, but
	// checking these index lengths rules out other sources of OOB access.
n := len(d.fileNameIndex)
if n == 0 {
return nil
}
n--
for what, got := range map[string]int{
"boundaries": len(d.boundaries) - 1,
"branch masks": len(d.fileBranchMasks),
"doc section index": len(d.docSectionsIndex) - 1,
"newlines index": len(d.newlinesIndex) - 1,
} {
if got != n {
return fmt.Errorf("got %s %d, want %d", what, got, n)
}
}
return nil
}
func (d *indexData) readContents(i uint32) ([]byte, error) {
return d.readSectionBlob(simpleSection{
off: d.boundariesStart + d.boundaries[i],
sz: d.boundaries[i+1] - d.boundaries[i],
})
}
func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
// TODO(hanwen): cap result if it is at the end of the content
// section.
return d.readSectionBlob(simpleSection{
off: d.boundariesStart + off,
sz: sz,
})
}
func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
sec := simpleSection{
off: d.newlinesStart + d.newlinesIndex[i],
sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
}
blob, err := d.readSectionBlob(sec)
if err != nil {
return nil, 0, err
}
return fromSizedDeltas(blob, buf), sec.sz, nil
}
func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
sec := simpleSection{
off: d.docSectionsStart + d.docSectionsIndex[i],
sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
}
blob, err := d.readSectionBlob(sec)
if err != nil {
return nil, 0, err
}
return unmarshalDocSections(blob, buf), sec.sz, nil
}
// NewSearcher creates a Searcher for a single index file. Search
// results coming from this searcher are valid only for the lifetime
// of the Searcher itself, i.e. []byte members should be copied into
// fresh buffers if the result is to survive closing the shard.
func NewSearcher(r IndexFile) (Searcher, error) {
rd := &reader{r: r}
var toc indexTOC
if err := rd.readTOC(&toc); err != nil {
return nil, err
}
indexData, err := rd.readIndexData(&toc)
if err != nil {
return nil, err
}
indexData.file = r
return indexData, nil
}
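
// A usage sketch for NewSearcher; it assumes the mmap-backed
// NewIndexFile helper defined elsewhere in this package, and elides
// error handling:
//
//	f, _ := os.Open("repo.zoekt")
//	iFile, _ := NewIndexFile(f)
//	searcher, _ := NewSearcher(iFile)
//	defer searcher.Close()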
// ReadMetadata returns the metadata of an index shard without reading
// the index data. The IndexFile is not closed.
func ReadMetadata(inf IndexFile) (*Repository, *IndexMetadata, error) {
rd := &reader{r: inf}
var toc indexTOC
if err := rd.readTOC(&toc); err != nil {
return nil, nil, err
}
var md IndexMetadata
if err := rd.readJSON(&md, &toc.metaData); err != nil {
return nil, nil, err
}
var repo Repository
if err := rd.readJSON(&repo, &toc.repoMetaData); err != nil {
return nil, nil, err
}
return &repo, &md, nil
}
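
// A usage sketch for ReadMetadata, e.g. to check shard compatibility
// before paying the cost of decoding posting lists (iFile as above):
//
//	repo, md, err := ReadMetadata(iFile)
//	if err != nil {
//		// handle error
//	}
//	fmt.Println(repo.Name, md.IndexFormatVersion)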