dociter.go - zoekt - Git at Google

 // Copyright 2016 Google Inc. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package zoekt

 import (
 	"bytes"
 	"fmt"
 	"log"
 	"sort"

 	"github.com/google/zoekt/query"
 )

 var _ = log.Println

 // candidateMatch is a candidate match for a substring.
 type candidateMatch struct {
 	caseSensitive bool
 	fileName      bool

 	substrBytes   []byte
 	substrLowered []byte

 	file uint32

 	// Offsets are relative to the start of the filename or file contents.
 	runeOffset  uint32
 	byteOffset  uint32
 	byteMatchSz uint32
 }

 func (m *candidateMatch) String() string {
 	return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
 }

 func (m *candidateMatch) matchContent(content []byte) bool {
 	if m.caseSensitive {
 		comp := bytes.Compare(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))]) == 0
 		return comp
 	} else {
 		// It is tempting to try a simple ASCII based
 		// comparison if possible, but we need more
 		// information. Simple ASCII chars have unicode upper
 		// case variants (the ASCII 'k' has the Kelvin symbol
 		// as upper case variant). We can only degrade to
 		// ASCII if we are sure that both the corpus and the
 		// query is ASCII only
 		return caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
 	}
 }

 // line returns the line holding the match. If the match starts with
 // the newline ending line M, we return M.  The line is characterized
 // by its linenumber (base-1, byte index of line start, byte index of
 // line end).  The line end is the index of a newline, or the filesize
 // (if matching the last line of the file.)
 func (m *candidateMatch) line(newlines []uint32, fileSize uint32) (lineNum, lineStart, lineEnd int) {
 	idx := sort.Search(len(newlines), func(n int) bool {
 		return newlines[n] >= m.byteOffset
 	})

 	end := int(fileSize)
 	if idx < len(newlines) {
 		end = int(newlines[idx])
 	}

 	start := 0
 	if idx > 0 {
 		start = int(newlines[idx-1] + 1)
 	}

 	return idx + 1, start, end
 }

 type docIterator interface {
 	// TODO - reconsider this name? Or don't get all the
 	// candidateMatch in one go?
 	next() []*candidateMatch
 	coversContent() bool
 }

 type ngramDocIterator struct {
 	ng1, ng2 ngram
 	query    *query.Substring

 	leftPad  uint32
 	rightPad uint32
 	distance uint32
 	first    []uint32
 	last     []uint32

 	fileIdx int
 	ends    []uint32

 	// The ngram matches cover the pattern, so no need to check
 	// contents.
 	_coversContent bool
 }

 func (s *ngramDocIterator) coversContent() bool {
 	return s._coversContent
 }

 func (s *ngramDocIterator) next() []*candidateMatch {
 	patBytes := []byte(s.query.Pattern)
 	lowerPatBytes := toLower(patBytes)

 	var candidates []*candidateMatch
 	for {
 		if len(s.first) == 0 || len(s.last) == 0 {
 			break
 		}
 		p1 := s.first[0]
 		p2 := s.last[0]

 		for s.fileIdx < len(s.ends) && s.ends[s.fileIdx] <= p1 {
 			s.fileIdx++
 		}

 		if p1+s.distance < p2 {
 			s.first = s.first[1:]
 		} else if p1+s.distance > p2 {
 			s.last = s.last[1:]
 		} else {
 			s.first = s.first[1:]
 			s.last = s.last[1:]

 			var fileStart uint32
 			if s.fileIdx > 0 {
 				fileStart = s.ends[s.fileIdx-1]
 			}
 			if p1 < s.leftPad+fileStart || p1+s.distance+ngramSize+s.rightPad > s.ends[s.fileIdx] {
 				continue
 			}

 			cand := &candidateMatch{
 				caseSensitive: s.query.CaseSensitive,
 				fileName:      s.query.FileName,
 				substrBytes:   patBytes,
 				substrLowered: lowerPatBytes,
 				// TODO - this is wrong for casefolding searches.
 				byteMatchSz: uint32(len(lowerPatBytes)),
 				file:        uint32(s.fileIdx),
 				runeOffset:  p1 - fileStart - s.leftPad,
 			}
 			candidates = append(candidates, cand)
 		}
 	}
 	return candidates
 }
	// Copyright 2016 Google Inc. All rights reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package zoekt

	import (
	"bytes"
	"fmt"
	"log"
	"sort"

	"github.com/google/zoekt/query"
	)

	var _ = log.Println

	// candidateMatch is a candidate match for a substring.
	type candidateMatch struct {
	caseSensitive bool
	fileName bool

	substrBytes []byte
	substrLowered []byte

	file uint32

	// Offsets are relative to the start of the filename or file contents.
	runeOffset uint32
	byteOffset uint32
	byteMatchSz uint32
	}

	func (m *candidateMatch) String() string {
	return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
	}

	func (m *candidateMatch) matchContent(content []byte) bool {
	if m.caseSensitive {
	comp := bytes.Compare(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))]) == 0
	return comp
	} else {
	// It is tempting to try a simple ASCII based
	// comparison if possible, but we need more
	// information. Simple ASCII chars have unicode upper
	// case variants (the ASCII 'k' has the Kelvin symbol
	// as upper case variant). We can only degrade to
	// ASCII if we are sure that both the corpus and the
	// query is ASCII only
	return caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
	}
	}

	// line returns the line holding the match. If the match starts with
	// the newline ending line M, we return M. The line is characterized
	// by its linenumber (base-1, byte index of line start, byte index of
	// line end). The line end is the index of a newline, or the filesize
	// (if matching the last line of the file.)
	func (m *candidateMatch) line(newlines []uint32, fileSize uint32) (lineNum, lineStart, lineEnd int) {
	idx := sort.Search(len(newlines), func(n int) bool {
	return newlines[n] >= m.byteOffset
	})

	end := int(fileSize)
	if idx < len(newlines) {
	end = int(newlines[idx])
	}

	start := 0
	if idx > 0 {
	start = int(newlines[idx-1] + 1)
	}

	return idx + 1, start, end
	}

	type docIterator interface {
	// TODO - reconsider this name? Or don't get all the
	// candidateMatch in one go?
	next() []*candidateMatch
	coversContent() bool
	}

	type ngramDocIterator struct {
	ng1, ng2 ngram
	query *query.Substring

	leftPad uint32
	rightPad uint32
	distance uint32
	first []uint32
	last []uint32

	fileIdx int
	ends []uint32

	// The ngram matches cover the pattern, so no need to check
	// contents.
	_coversContent bool
	}

	func (s *ngramDocIterator) coversContent() bool {
	return s._coversContent
	}

	func (s ngramDocIterator) next() []candidateMatch {
	patBytes := []byte(s.query.Pattern)
	lowerPatBytes := toLower(patBytes)

	var candidates []*candidateMatch
	for {
	if len(s.first) == 0 \|\| len(s.last) == 0 {
	break
	}
	p1 := s.first[0]
	p2 := s.last[0]

	for s.fileIdx < len(s.ends) && s.ends[s.fileIdx] <= p1 {
	s.fileIdx++
	}

	if p1+s.distance < p2 {
	s.first = s.first[1:]
	} else if p1+s.distance > p2 {
	s.last = s.last[1:]
	} else {
	s.first = s.first[1:]
	s.last = s.last[1:]

	var fileStart uint32
	if s.fileIdx > 0 {
	fileStart = s.ends[s.fileIdx-1]
	}
	if p1 < s.leftPad+fileStart \|\| p1+s.distance+ngramSize+s.rightPad > s.ends[s.fileIdx] {
	continue
	}

	cand := &candidateMatch{
	caseSensitive: s.query.CaseSensitive,
	fileName: s.query.FileName,
	substrBytes: patBytes,
	substrLowered: lowerPatBytes,
	// TODO - this is wrong for casefolding searches.
	byteMatchSz: uint32(len(lowerPatBytes)),
	file: uint32(s.fileIdx),
	runeOffset: p1 - fileStart - s.leftPad,
	}
	candidates = append(candidates, cand)
	}
	}
	return candidates
	}