indexbuilder.go - zoekt - Git at Google

 // Copyright 2016 Google Inc. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package zoekt

 import (
 	"bytes"
 	"encoding/binary"
 	"fmt"
 	"hash/crc64"
 	"html/template"
 	"log"
 	"path/filepath"
 	"sort"
 	"unicode/utf8"
 )

 var _ = log.Println

 const ngramSize = 3

 type searchableString struct {
 	data []byte
 }

 // Filled by the linker (see build-deploy.sh)
 var Version string

 // Store character (unicode codepoint) offset (in bytes) this often.
 const runeOffsetFrequency = 100

 type postingsBuilder struct {
 	postings    map[ngram][]byte
 	lastOffsets map[ngram]uint32

 	// To support UTF-8 searching, we must map back runes to byte
 	// offsets. As a first attempt, we sample regularly. The
 	// precise offset can be found by walking from the recorded
 	// offset to the desired rune.
 	runeOffsets []uint32
 	runeCount   uint32

 	isPlainASCII bool

 	endRunes []uint32
 	endByte  uint32
 }

 func newPostingsBuilder() *postingsBuilder {
 	return &postingsBuilder{
 		postings:     map[ngram][]byte{},
 		lastOffsets:  map[ngram]uint32{},
 		isPlainASCII: true,
 	}
 }

 // Store trigram offsets for the given UTF-8 data. The
 // DocumentSections must correspond to rune boundaries in the UTF-8
 // data.
 func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
 	dest := searchableString{
 		data: data,
 	}
 	var buf [8]byte
 	var runeGram [3]rune

 	var runeIndex uint32
 	byteCount := 0
 	dataSz := uint32(len(data))

 	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
 	for _, s := range byteSections {
 		byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
 	}
 	var runeSectionBoundaries []uint32

 	endRune := s.runeCount
 	for ; len(data) > 0; runeIndex++ {
 		c, sz := utf8.DecodeRune(data)
 		if sz > 1 {
 			s.isPlainASCII = false
 		}
 		data = data[sz:]

 		runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c

 		if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
 			s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
 		}
 		for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
 			runeSectionBoundaries = append(runeSectionBoundaries,
 				endRune+uint32(runeIndex))
 			byteSectionBoundaries = byteSectionBoundaries[1:]
 		}

 		byteCount += sz

 		if runeIndex < 2 {
 			continue
 		}

 		ng := runesToNGram(runeGram)
 		lastOff := s.lastOffsets[ng]
 		newOff := endRune + uint32(runeIndex) - 2

 		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
 		s.postings[ng] = append(s.postings[ng], buf[:m]...)
 		s.lastOffsets[ng] = newOff
 	}
 	s.runeCount += runeIndex

 	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
 		return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
 	}

 	// Handle symbol definition that ends at file end. This can
 	// happen for labels at the end of .bat files.

 	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
 		runeSectionBoundaries = append(runeSectionBoundaries,
 			endRune+runeIndex)
 		byteSectionBoundaries = byteSectionBoundaries[1:]
 	}
 	runeSecs := make([]DocumentSection, 0, len(byteSections))
 	for i := 0; i < len(runeSectionBoundaries); i += 2 {
 		runeSecs = append(runeSecs, DocumentSection{
 			Start: runeSectionBoundaries[i],
 			End:   runeSectionBoundaries[i+1],
 		})
 	}

 	s.endRunes = append(s.endRunes, s.runeCount)
 	s.endByte += dataSz
 	return &dest, runeSecs, nil
 }

 // IndexBuilder builds a single index shard.
 type IndexBuilder struct {
 	contentStrings  []*searchableString
 	nameStrings     []*searchableString
 	docSections     [][]DocumentSection
 	runeDocSections []DocumentSection

 	checksums []byte

 	branchMasks []uint64
 	subRepos    []uint32

 	contentPostings *postingsBuilder
 	namePostings    *postingsBuilder

 	// root repository
 	repo Repository

 	// name to index.
 	subRepoIndices map[string]uint32

 	// language => language code
 	languageMap map[string]byte

 	// languages codes
 	languages []byte
 }

 func (d *Repository) verify() error {
 	for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
 		if _, err := template.New("").Parse(t); err != nil {
 			return err
 		}
 	}
 	return nil
 }

 // ContentSize returns the number of content bytes so far ingested.
 func (b *IndexBuilder) ContentSize() uint32 {
 	// Add the name too so we don't skip building index if we have
 	// lots of empty files.
 	return b.contentPostings.endByte + b.namePostings.endByte
 }

 // NewIndexBuilder creates a fresh IndexBuilder. The passed in
 // Repository contains repo metadata, and may be set to nil.
 func NewIndexBuilder(r *Repository) (*IndexBuilder, error) {
 	b := &IndexBuilder{
 		contentPostings: newPostingsBuilder(),
 		namePostings:    newPostingsBuilder(),
 		languageMap:     map[string]byte{},
 	}

 	if r == nil {
 		r = &Repository{}
 	}
 	if err := b.setRepository(r); err != nil {
 		return nil, err
 	}
 	return b, nil
 }

 func (b *IndexBuilder) setRepository(desc *Repository) error {
 	if len(b.contentStrings) > 0 {
 		return fmt.Errorf("setRepository called after adding files")
 	}
 	if err := desc.verify(); err != nil {
 		return err
 	}

 	if len(desc.Branches) > 64 {
 		return fmt.Errorf("too many branches")
 	}

 	b.repo = *desc

 	// copy subrepomap without root
 	b.repo.SubRepoMap = map[string]*Repository{}
 	for k, v := range desc.SubRepoMap {
 		if k != "" {
 			b.repo.SubRepoMap[k] = v
 		}
 	}

 	b.populateSubRepoIndices()
 	return nil
 }

 type DocumentSection struct {
 	Start, End uint32
 }

 // Document holds a document (file) to index.
 type Document struct {
 	Name              string
 	Content           []byte
 	Branches          []string
 	SubRepositoryPath string
 	Language          string

 	// If set, something is wrong with the file contents, and this
 	// is the reason it wasn't indexed.
 	SkipReason string

 	// Document sections for symbols. Offsets should use bytes.
 	Symbols []DocumentSection
 }

 type docSectionSlice []DocumentSection

 func (m docSectionSlice) Len() int           { return len(m) }
 func (m docSectionSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 func (m docSectionSlice) Less(i, j int) bool { return m[i].Start < m[j].Start }

 // AddFile is a convenience wrapper for Add
 func (b *IndexBuilder) AddFile(name string, content []byte) error {
 	return b.Add(Document{Name: name, Content: content})
 }

 // CheckText returns a reason why the given contents are probably not source texts.
 func CheckText(content []byte, maxTrigramCount int) error {
 	if len(content) == 0 {
 		return nil
 	}

 	if len(content) < ngramSize {
 		return fmt.Errorf("file size smaller than %d", ngramSize)
 	}

 	trigrams := map[ngram]struct{}{}

 	var cur [3]rune
 	byteCount := 0
 	for len(content) > 0 {
 		if content[0] == 0 {
 			return fmt.Errorf("binary data at byte offset %d", byteCount)
 		}

 		r, sz := utf8.DecodeRune(content)
 		content = content[sz:]
 		byteCount += sz

 		cur[0], cur[1], cur[2] = cur[1], cur[2], r
 		if cur[0] == 0 {
 			// start of file.
 			continue
 		}

 		trigrams[runesToNGram(cur)] = struct{}{}
 		if len(trigrams) > maxTrigramCount {
 			// probably not text.
 			return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
 		}
 	}
 	return nil
 }

 func (b *IndexBuilder) populateSubRepoIndices() {
 	if b.subRepoIndices != nil {
 		return
 	}
 	paths := []string{""}
 	for k := range b.repo.SubRepoMap {
 		paths = append(paths, k)
 	}
 	sort.Strings(paths)
 	b.subRepoIndices = make(map[string]uint32, len(paths))
 	for i, p := range paths {
 		b.subRepoIndices[p] = uint32(i)
 	}
 }

 const notIndexedMarker = "NOT-INDEXED: "

 // Add a file which only occurs in certain branches.
 func (b *IndexBuilder) Add(doc Document) error {
 	hasher := crc64.New(crc64.MakeTable(crc64.ISO))

 	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
 		doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
 		doc.Language = "binary"
 	}

 	if doc.SkipReason != "" {
 		doc.Content = []byte(notIndexedMarker + doc.SkipReason)
 		doc.Symbols = nil
 		if doc.Language == "" {
 			doc.Language = "skipped"
 		}
 	}

 	sort.Sort(docSectionSlice(doc.Symbols))
 	var last DocumentSection
 	for i, s := range doc.Symbols {
 		if i > 0 {
 			if last.End > s.Start {
 				return fmt.Errorf("sections overlap")
 			}
 		}
 		last = s
 	}
 	if last.End > uint32(len(doc.Content)) {
 		return fmt.Errorf("section goes past end of content")
 	}

 	if doc.SubRepositoryPath != "" {
 		rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
 		if err != nil || rel == doc.Name {
 			return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
 		}
 	}
 	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
 	if err != nil {
 		return err
 	}
 	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
 	if err != nil {
 		return err
 	}

 	subRepoIdx, ok := b.subRepoIndices[doc.SubRepositoryPath]
 	if !ok {
 		return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
 	}

 	var mask uint64
 	for _, br := range doc.Branches {
 		m := b.branchMask(br)
 		if m == 0 {
 			return fmt.Errorf("no branch found for %s", br)
 		}
 		mask |= m
 	}

 	b.subRepos = append(b.subRepos, subRepoIdx)

 	hasher.Write(doc.Content)

 	b.contentStrings = append(b.contentStrings, docStr)
 	b.runeDocSections = append(b.runeDocSections, runeSecs...)

 	b.nameStrings = append(b.nameStrings, nameStr)
 	b.docSections = append(b.docSections, doc.Symbols)
 	b.branchMasks = append(b.branchMasks, mask)
 	b.checksums = append(b.checksums, hasher.Sum(nil)...)

 	langCode, ok := b.languageMap[doc.Language]
 	if !ok {
 		if len(b.languageMap) >= 255 {
 			return fmt.Errorf("too many languages")
 		}
 		langCode = byte(len(b.languageMap))
 		b.languageMap[doc.Language] = langCode
 	}
 	b.languages = append(b.languages, langCode)

 	return nil
 }

 func (b *IndexBuilder) branchMask(br string) uint64 {
 	for i, b := range b.repo.Branches {
 		if b.Name == br {
 			return uint64(1) << uint(i)
 		}
 	}
 	return 0
 }
	// Copyright 2016 Google Inc. All rights reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package zoekt

	import (
	"bytes"
	"encoding/binary"
	"fmt"
	"hash/crc64"
	"html/template"
	"log"
	"path/filepath"
	"sort"
	"unicode/utf8"
	)

	var _ = log.Println

	const ngramSize = 3

	type searchableString struct {
	data []byte
	}

	// Filled by the linker (see build-deploy.sh)
	var Version string

	// Store character (unicode codepoint) offset (in bytes) this often.
	const runeOffsetFrequency = 100

	type postingsBuilder struct {
	postings map[ngram][]byte
	lastOffsets map[ngram]uint32

	// To support UTF-8 searching, we must map back runes to byte
	// offsets. As a first attempt, we sample regularly. The
	// precise offset can be found by walking from the recorded
	// offset to the desired rune.
	runeOffsets []uint32
	runeCount uint32

	isPlainASCII bool

	endRunes []uint32
	endByte uint32
	}

	func newPostingsBuilder() *postingsBuilder {
	return &postingsBuilder{
	postings: map[ngram][]byte{},
	lastOffsets: map[ngram]uint32{},
	isPlainASCII: true,
	}
	}

	// Store trigram offsets for the given UTF-8 data. The
	// DocumentSections must correspond to rune boundaries in the UTF-8
	// data.
	func (s postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (searchableString, []DocumentSection, error) {
	dest := searchableString{
	data: data,
	}
	var buf [8]byte
	var runeGram [3]rune

	var runeIndex uint32
	byteCount := 0
	dataSz := uint32(len(data))

	byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections))
	for _, s := range byteSections {
	byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End)
	}
	var runeSectionBoundaries []uint32

	endRune := s.runeCount
	for ; len(data) > 0; runeIndex++ {
	c, sz := utf8.DecodeRune(data)
	if sz > 1 {
	s.isPlainASCII = false
	}
	data = data[sz:]

	runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c

	if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 {
	s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount))
	}
	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
	runeSectionBoundaries = append(runeSectionBoundaries,
	endRune+uint32(runeIndex))
	byteSectionBoundaries = byteSectionBoundaries[1:]
	}

	byteCount += sz

	if runeIndex < 2 {
	continue
	}

	ng := runesToNGram(runeGram)
	lastOff := s.lastOffsets[ng]
	newOff := endRune + uint32(runeIndex) - 2

	m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
	s.postings[ng] = append(s.postings[ng], buf[:m]...)
	s.lastOffsets[ng] = newOff
	}
	s.runeCount += runeIndex

	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) {
	return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0])
	}

	// Handle symbol definition that ends at file end. This can
	// happen for labels at the end of .bat files.

	for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) {
	runeSectionBoundaries = append(runeSectionBoundaries,
	endRune+runeIndex)
	byteSectionBoundaries = byteSectionBoundaries[1:]
	}
	runeSecs := make([]DocumentSection, 0, len(byteSections))
	for i := 0; i < len(runeSectionBoundaries); i += 2 {
	runeSecs = append(runeSecs, DocumentSection{
	Start: runeSectionBoundaries[i],
	End: runeSectionBoundaries[i+1],
	})
	}

	s.endRunes = append(s.endRunes, s.runeCount)
	s.endByte += dataSz
	return &dest, runeSecs, nil
	}

	// IndexBuilder builds a single index shard.
	type IndexBuilder struct {
	contentStrings []*searchableString
	nameStrings []*searchableString
	docSections [][]DocumentSection
	runeDocSections []DocumentSection

	checksums []byte

	branchMasks []uint64
	subRepos []uint32

	contentPostings *postingsBuilder
	namePostings *postingsBuilder

	// root repository
	repo Repository

	// name to index.
	subRepoIndices map[string]uint32

	// language => language code
	languageMap map[string]byte

	// languages codes
	languages []byte
	}

	func (d *Repository) verify() error {
	for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} {
	if _, err := template.New("").Parse(t); err != nil {
	return err
	}
	}
	return nil
	}

	// ContentSize returns the number of content bytes so far ingested.
	func (b *IndexBuilder) ContentSize() uint32 {
	// Add the name too so we don't skip building index if we have
	// lots of empty files.
	return b.contentPostings.endByte + b.namePostings.endByte
	}

	// NewIndexBuilder creates a fresh IndexBuilder. The passed in
	// Repository contains repo metadata, and may be set to nil.
	func NewIndexBuilder(r Repository) (IndexBuilder, error) {
	b := &IndexBuilder{
	contentPostings: newPostingsBuilder(),
	namePostings: newPostingsBuilder(),
	languageMap: map[string]byte{},
	}

	if r == nil {
	r = &Repository{}
	}
	if err := b.setRepository(r); err != nil {
	return nil, err
	}
	return b, nil
	}

	func (b IndexBuilder) setRepository(desc Repository) error {
	if len(b.contentStrings) > 0 {
	return fmt.Errorf("setRepository called after adding files")
	}
	if err := desc.verify(); err != nil {
	return err
	}

	if len(desc.Branches) > 64 {
	return fmt.Errorf("too many branches")
	}

	b.repo = *desc

	// copy subrepomap without root
	b.repo.SubRepoMap = map[string]*Repository{}
	for k, v := range desc.SubRepoMap {
	if k != "" {
	b.repo.SubRepoMap[k] = v
	}
	}

	b.populateSubRepoIndices()
	return nil
	}

	type DocumentSection struct {
	Start, End uint32
	}

	// Document holds a document (file) to index.
	type Document struct {
	Name string
	Content []byte
	Branches []string
	SubRepositoryPath string
	Language string

	// If set, something is wrong with the file contents, and this
	// is the reason it wasn't indexed.
	SkipReason string

	// Document sections for symbols. Offsets should use bytes.
	Symbols []DocumentSection
	}

	type docSectionSlice []DocumentSection

	func (m docSectionSlice) Len() int { return len(m) }
	func (m docSectionSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
	func (m docSectionSlice) Less(i, j int) bool { return m[i].Start < m[j].Start }

	// AddFile is a convenience wrapper for Add
	func (b *IndexBuilder) AddFile(name string, content []byte) error {
	return b.Add(Document{Name: name, Content: content})
	}

	// CheckText returns a reason why the given contents are probably not source texts.
	func CheckText(content []byte, maxTrigramCount int) error {
	if len(content) == 0 {
	return nil
	}

	if len(content) < ngramSize {
	return fmt.Errorf("file size smaller than %d", ngramSize)
	}

	trigrams := map[ngram]struct{}{}

	var cur [3]rune
	byteCount := 0
	for len(content) > 0 {
	if content[0] == 0 {
	return fmt.Errorf("binary data at byte offset %d", byteCount)
	}

	r, sz := utf8.DecodeRune(content)
	content = content[sz:]
	byteCount += sz

	cur[0], cur[1], cur[2] = cur[1], cur[2], r
	if cur[0] == 0 {
	// start of file.
	continue
	}

	trigrams[runesToNGram(cur)] = struct{}{}
	if len(trigrams) > maxTrigramCount {
	// probably not text.
	return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
	}
	}
	return nil
	}

	func (b *IndexBuilder) populateSubRepoIndices() {
	if b.subRepoIndices != nil {
	return
	}
	paths := []string{""}
	for k := range b.repo.SubRepoMap {
	paths = append(paths, k)
	}
	sort.Strings(paths)
	b.subRepoIndices = make(map[string]uint32, len(paths))
	for i, p := range paths {
	b.subRepoIndices[p] = uint32(i)
	}
	}

	const notIndexedMarker = "NOT-INDEXED: "

	// Add a file which only occurs in certain branches.
	func (b *IndexBuilder) Add(doc Document) error {
	hasher := crc64.New(crc64.MakeTable(crc64.ISO))

	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
	doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
	doc.Language = "binary"
	}

	if doc.SkipReason != "" {
	doc.Content = []byte(notIndexedMarker + doc.SkipReason)
	doc.Symbols = nil
	if doc.Language == "" {
	doc.Language = "skipped"
	}
	}

	sort.Sort(docSectionSlice(doc.Symbols))
	var last DocumentSection
	for i, s := range doc.Symbols {
	if i > 0 {
	if last.End > s.Start {
	return fmt.Errorf("sections overlap")
	}
	}
	last = s
	}
	if last.End > uint32(len(doc.Content)) {
	return fmt.Errorf("section goes past end of content")
	}

	if doc.SubRepositoryPath != "" {
	rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)
	if err != nil \|\| rel == doc.Name {
	return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath)
	}
	}
	docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
	if err != nil {
	return err
	}
	nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)
	if err != nil {
	return err
	}

	subRepoIdx, ok := b.subRepoIndices[doc.SubRepositoryPath]
	if !ok {
	return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath)
	}

	var mask uint64
	for _, br := range doc.Branches {
	m := b.branchMask(br)
	if m == 0 {
	return fmt.Errorf("no branch found for %s", br)
	}
	mask \|= m
	}

	b.subRepos = append(b.subRepos, subRepoIdx)

	hasher.Write(doc.Content)

	b.contentStrings = append(b.contentStrings, docStr)
	b.runeDocSections = append(b.runeDocSections, runeSecs...)

	b.nameStrings = append(b.nameStrings, nameStr)
	b.docSections = append(b.docSections, doc.Symbols)
	b.branchMasks = append(b.branchMasks, mask)
	b.checksums = append(b.checksums, hasher.Sum(nil)...)

	langCode, ok := b.languageMap[doc.Language]
	if !ok {
	if len(b.languageMap) >= 255 {
	return fmt.Errorf("too many languages")
	}
	langCode = byte(len(b.languageMap))
	b.languageMap[doc.Language] = langCode
	}
	b.languages = append(b.languages, langCode)

	return nil
	}

	func (b *IndexBuilder) branchMask(br string) uint64 {
	for i, b := range b.repo.Branches {
	if b.Name == br {
	return uint64(1) << uint(i)
	}
	}
	return 0
	}