blob: 62f61103fbe65cd6fbe81caa29557125b15813a2 [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zoekt
import (
"encoding/binary"
"unicode"
"unicode/utf8"
)
func generateCaseNgrams(g ngram) []ngram {
asRunes := ngramToRunes(g)
variants := make([]ngram, 0, 8)
cur := asRunes
for {
for i := 0; i < 3; i++ {
next := unicode.SimpleFold(cur[i])
cur[i] = next
if next != asRunes[i] {
break
}
}
variants = append(variants, runesToNGram(cur))
if cur == asRunes {
break
}
}
return variants
}
func toLower(in []byte) []byte {
out := make([]byte, 0, len(in))
var buf [4]byte
for _, c := range string(in) {
i := utf8.EncodeRune(buf[:], unicode.ToLower(c))
out = append(out, buf[:i]...)
}
return out
}
// compare 'lower' and 'mixed', where lower is the needle. 'mixed' may
// be larger than 'lower'. Returns whether there was a match, and if
// yes, the byte size of the match.
func caseFoldingEqualsRunes(lower, mixed []byte) (int, bool) {
matchTotal := 0
for len(lower) > 0 && len(mixed) > 0 {
lr, lsz := utf8.DecodeRune(lower)
lower = lower[lsz:]
mr, msz := utf8.DecodeRune(mixed)
mixed = mixed[msz:]
matchTotal += msz
if lr != unicode.ToLower(mr) {
return 0, false
}
}
return matchTotal, len(lower) == 0
}
type ngram uint64
func runesToNGram(b [ngramSize]rune) ngram {
return ngram(uint64(b[0])<<42 | uint64(b[1])<<21 | uint64(b[2]))
}
func bytesToNGram(b []byte) ngram {
return runesToNGram([ngramSize]rune{rune(b[0]), rune(b[1]), rune(b[2])})
}
func stringToNGram(s string) ngram {
return bytesToNGram([]byte(s))
}
func ngramToBytes(n ngram) []byte {
rs := ngramToRunes(n)
return []byte{byte(rs[0]), byte(rs[1]), byte(rs[2])}
}
const runeMask = 1<<21 - 1
func ngramToRunes(n ngram) [ngramSize]rune {
return [ngramSize]rune{rune((n >> 42) & runeMask), rune((n >> 21) & runeMask), rune(n & runeMask)}
}
func (n ngram) String() string {
rs := ngramToRunes(n)
return string(rs[:])
}
type runeNgramOff struct {
ngram ngram
byteSize uint32 // size of ngram
byteOff uint32
runeOff uint32
}
func splitNGrams(str []byte) []runeNgramOff {
var runeGram [3]rune
var off [3]uint32
var runeCount int
result := make([]runeNgramOff, 0, len(str))
var i uint32
chars := -1
for len(str) > 0 {
chars++
r, sz := utf8.DecodeRune(str)
str = str[sz:]
runeGram[0] = runeGram[1]
off[0] = off[1]
runeGram[1] = runeGram[2]
off[1] = off[2]
runeGram[2] = r
off[2] = uint32(i)
i += uint32(sz)
runeCount++
if runeCount < ngramSize {
continue
}
ng := runesToNGram(runeGram)
result = append(result, runeNgramOff{
ngram: ng,
byteSize: i - off[0],
byteOff: off[0],
runeOff: uint32(chars),
})
}
return result
}
const (
_classChar = 0
_classDigit = iota
_classPunct = iota
_classOther = iota
_classSpace = iota
)
func byteClass(c byte) int {
if (c >= 'a' && c <= 'z') || c >= 'A' && c <= 'Z' {
return _classChar
}
if c >= '0' && c <= '9' {
return _classDigit
}
switch c {
case ' ', '\n':
return _classSpace
case '.', ',', ';', '"', '\'':
return _classPunct
default:
return _classOther
}
}
func marshalDocSections(secs []DocumentSection) []byte {
ints := make([]uint32, 0, len(secs)*2)
for _, s := range secs {
ints = append(ints, uint32(s.Start), uint32(s.End))
}
return toSizedDeltas(ints)
}
func unmarshalDocSections(in []byte, buf []DocumentSection) (secs []DocumentSection) {
// TODO - ints is unnecessary garbage here.
ints := fromSizedDeltas(in, nil)
if cap(buf) >= len(ints)/2 {
buf = buf[:0]
} else {
buf = make([]DocumentSection, 0, len(ints)/2)
}
for len(ints) > 0 {
buf = append(buf, DocumentSection{ints[0], ints[1]})
ints = ints[2:]
}
return buf
}
type ngramSlice []ngram
func (p ngramSlice) Len() int { return len(p) }
func (p ngramSlice) Less(i, j int) bool {
return p[i] < p[j]
}
func (p ngramSlice) Swap(i, j int) {
p[i], p[j] = p[j], p[i]
}
func toSizedDeltas(offsets []uint32) []byte {
var enc [8]byte
deltas := make([]byte, 0, len(offsets)*2)
m := binary.PutUvarint(enc[:], uint64(len(offsets)))
deltas = append(deltas, enc[:m]...)
var last uint32
for _, p := range offsets {
delta := p - last
last = p
m := binary.PutUvarint(enc[:], uint64(delta))
deltas = append(deltas, enc[:m]...)
}
return deltas
}
func fromSizedDeltas(data []byte, ps []uint32) []uint32 {
sz, m := binary.Uvarint(data)
data = data[m:]
if cap(ps) < int(sz) {
ps = make([]uint32, 0, sz)
} else {
ps = ps[:0]
}
var last uint32
for len(data) > 0 {
delta, m := binary.Uvarint(data)
offset := last + uint32(delta)
last = offset
data = data[m:]
ps = append(ps, offset)
}
return ps
}
func fromDeltas(data []byte, buf []uint32) []uint32 {
buf = buf[:0]
if cap(buf) < len(data)/2 {
buf = make([]uint32, 0, len(data)/2)
}
var last uint32
for len(data) > 0 {
delta, m := binary.Uvarint(data)
offset := last + uint32(delta)
last = offset
data = data[m:]
buf = append(buf, offset)
}
return buf
}