blob: 74bfd5b7cc85e173ea72563f82826049b4587e62 [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zoekt
import (
"bytes"
"fmt"
"log"
"sort"
"github.com/google/zoekt/query"
)
var _ = log.Println
// candidateMatch is a candidate match for a substring.
type candidateMatch struct {
caseSensitive bool
fileName bool
substrBytes []byte
substrLowered []byte
file uint32
// Offsets are relative to the start of the filename or file contents.
runeOffset uint32
byteOffset uint32
byteMatchSz uint32
}
func (m *candidateMatch) String() string {
return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
}
func (m *candidateMatch) matchContent(content []byte) bool {
if m.caseSensitive {
comp := bytes.Compare(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))]) == 0
return comp
} else {
// It is tempting to try a simple ASCII based
// comparison if possible, but we need more
// information. Simple ASCII chars have unicode upper
// case variants (the ASCII 'k' has the Kelvin symbol
// as upper case variant). We can only degrade to
// ASCII if we are sure that both the corpus and the
// query is ASCII only
return caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
}
}
// line returns the line holding the match. If the match starts with
// the newline ending line M, we return M. The line is characterized
// by its linenumber (base-1, byte index of line start, byte index of
// line end). The line end is the index of a newline, or the filesize
// (if matching the last line of the file.)
func (m *candidateMatch) line(newlines []uint32, fileSize uint32) (lineNum, lineStart, lineEnd int) {
idx := sort.Search(len(newlines), func(n int) bool {
return newlines[n] >= m.byteOffset
})
end := int(fileSize)
if idx < len(newlines) {
end = int(newlines[idx])
}
start := 0
if idx > 0 {
start = int(newlines[idx-1] + 1)
}
return idx + 1, start, end
}
type docIterator interface {
// TODO - reconsider this name? Or don't get all the
// candidateMatch in one go?
next() []*candidateMatch
coversContent() bool
}
type ngramDocIterator struct {
ng1, ng2 ngram
query *query.Substring
leftPad uint32
rightPad uint32
distance uint32
first []uint32
last []uint32
fileIdx int
ends []uint32
// The ngram matches cover the pattern, so no need to check
// contents.
_coversContent bool
}
func (s *ngramDocIterator) coversContent() bool {
return s._coversContent
}
func (s *ngramDocIterator) next() []*candidateMatch {
patBytes := []byte(s.query.Pattern)
lowerPatBytes := toLower(patBytes)
var candidates []*candidateMatch
for {
if len(s.first) == 0 || len(s.last) == 0 {
break
}
p1 := s.first[0]
p2 := s.last[0]
for s.fileIdx < len(s.ends) && s.ends[s.fileIdx] <= p1 {
s.fileIdx++
}
if p1+s.distance < p2 {
s.first = s.first[1:]
} else if p1+s.distance > p2 {
s.last = s.last[1:]
} else {
s.first = s.first[1:]
s.last = s.last[1:]
var fileStart uint32
if s.fileIdx > 0 {
fileStart = s.ends[s.fileIdx-1]
}
if p1 < s.leftPad+fileStart || p1+s.distance+ngramSize+s.rightPad > s.ends[s.fileIdx] {
continue
}
cand := &candidateMatch{
caseSensitive: s.query.CaseSensitive,
fileName: s.query.FileName,
substrBytes: patBytes,
substrLowered: lowerPatBytes,
// TODO - this is wrong for casefolding searches.
byteMatchSz: uint32(len(lowerPatBytes)),
file: uint32(s.fileIdx),
runeOffset: p1 - fileStart - s.leftPad,
}
candidates = append(candidates, cand)
}
}
return candidates
}