ctags: skip long lines

Currently we fail to index a repository if it contains a very large
symbol, because bufio.Scanner returns bufio.ErrTooLong. The bufio
documentation recommends using bufio.Reader when more control over large
tokens is needed, which is what this commit implements.

We introduce a scanner struct which mimics a minimal subset of the
bufio.Scanner API. It uses bufio.Reader directly, taking care to skip lines
that are too long or empty. Any line longer than 4096 bytes (including the
newline) is skipped.

Examples of repositories with very large symbols:
- https://github.com/noshotz/eldorito
- https://github.com/corretto/corretto-8
- https://github.com/ifsnow/facebooklivereaction

Change-Id: Ic60f39c79f63363ef2d4175c4d6de7baae90355c
diff --git a/ctags/json.go b/ctags/json.go
index 0e3a9e5..980b54b 100644
--- a/ctags/json.go
+++ b/ctags/json.go
@@ -33,7 +33,7 @@
 type ctagsProcess struct {
 	cmd     *exec.Cmd
 	in      io.WriteCloser
-	out     *bufio.Scanner
+	out     *scanner
 	outPipe io.ReadCloser
 }
 
@@ -58,7 +58,7 @@
 	proc := ctagsProcess{
 		cmd:     cmd,
 		in:      in,
-		out:     bufio.NewScanner(out),
+		out:     &scanner{r: bufio.NewReaderSize(out, 4096)},
 		outPipe: out,
 	}
 
@@ -82,15 +82,14 @@
 
 func (p *ctagsProcess) read(rep *reply) error {
 	if !p.out.Scan() {
-		// Some errors (eg. token too long) do not kill the
-		// parser. We would deadlock if we waited for the
-		// process to exit.
+		// Some errors do not kill the parser. We would deadlock if we waited
+		// for the process to exit.
 		err := p.out.Err()
 		p.Close()
 		return err
 	}
 	if debug {
-		log.Printf("read %s", p.out.Text())
+		log.Printf("read %q", p.out.Bytes())
 	}
 
 	// See https://github.com/universal-ctags/ctags/issues/1493
@@ -100,7 +99,7 @@
 
 	err := json.Unmarshal(p.out.Bytes(), rep)
 	if err != nil {
-		return fmt.Errorf("unmarshal(%s): %v", p.out.Text(), err)
+		return fmt.Errorf("unmarshal(%q): %v", p.out.Bytes(), err)
 	}
 	return nil
 }
@@ -188,6 +187,52 @@
 	return es, nil
 }
 
+// scanner reads lines from r through a minimal bufio.Scanner-like API (Scan,
+// Bytes, Err). Instead of failing with bufio.ErrTooLong it skips any line
+// that does not fit in r's buffer, and it also skips lines that are empty
+// once the trailing "\n" or "\r\n" has been removed.
+type scanner struct {
+	r    *bufio.Reader // input; a line must fit in its buffer or it is skipped
+	line []byte        // line found by the latest Scan, line ending removed
+	err  error         // error from the latest read, io.EOF included; non-nil ends scanning
+}
+
+func (s *scanner) Scan() bool { // advances to the next usable line; false when there is none
+	if s.err != nil { // an earlier call already hit io.EOF or a read error
+		return false
+	}
+
+	var (
+		err  error
+		line []byte
+	)
+
+	for err == nil && len(line) == 0 { // loop past skipped (too long or empty) lines
+		line, err = s.r.ReadSlice('\n')
+		for err == bufio.ErrBufferFull {
+			// line overflows r's buffer: blank it and drain up to its newline (or an error)
+			line = nil
+			_, err = s.r.ReadSlice('\n')
+		}
+		line = bytes.TrimSuffix(line, []byte{'\n'}) // drop "\n", then the "\r" of a "\r\n" ending
+		line = bytes.TrimSuffix(line, []byte{'\r'})
+	}
+
+	s.line, s.err = line, err // err can be io.EOF next to a final unterminated line
+	return len(line) > 0
+}
+
+func (s *scanner) Bytes() []byte {
+	return s.line // latest scanned line; aliases r's buffer, so valid only until the next Scan
+}
+
+func (s *scanner) Err() error {
+	if s.err == io.EOF { // reaching the end of input is not reported as a failure
+		return nil
+	}
+	return s.err // nil, or the read error that stopped Scan
+}
+
 type Parser interface {
 	Parse(name string, content []byte) ([]*Entry, error)
 }
diff --git a/ctags/json_test.go b/ctags/json_test.go
index 4a6891d..c6f1d3c 100644
--- a/ctags/json_test.go
+++ b/ctags/json_test.go
@@ -15,9 +15,13 @@
 package ctags
 
 import (
+	"bufio"
 	"os/exec"
 	"reflect"
+	"strings"
 	"testing"
+
+	"github.com/google/go-cmp/cmp"
 )
 
 func TestJSON(t *testing.T) {
@@ -104,3 +108,36 @@
 		}
 	}
 }
+
+func TestScanner(t *testing.T) {
+	size := 20 // reader buffer size; lines that need more are expected to be skipped
+
+	input := strings.Join([]string{
+		"aaaaaaaaa",
+		strings.Repeat("B", 3*size+3), // longer than the buffer: skipped
+		strings.Repeat("C", size) + strings.Repeat("D", size+1), // longer than the buffer: skipped
+		"",                          // empty: skipped
+		strings.Repeat("e", size-1), // fills the buffer exactly with its "\n": kept
+		"f\r",                       // "\r\n" ending: the "\r" is trimmed
+		"gg",                        // last line, no trailing newline: kept
+	}, "\n")
+	want := []string{
+		"aaaaaaaaa",
+		strings.Repeat("e", size-1),
+		"f",
+		"gg",
+	}
+
+	var got []string
+	r := &scanner{r: bufio.NewReaderSize(strings.NewReader(input), size)}
+	for r.Scan() {
+		got = append(got, string(r.Bytes())) // copy: Bytes is reused by the next Scan
+	}
+	if err := r.Err(); err != nil {
+		t.Fatal(err)
+	}
+
+	if !cmp.Equal(got, want) {
+		t.Errorf("mismatch (-want +got):\n%s", cmp.Diff(want, got))
+	}
+}