// Copyright (C) 2019 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.googlesource.gerrit.plugins.copyright.lib;

import com.google.common.base.Preconditions;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnmappableCharacterException;
import java.util.ArrayList;
import java.util.Collections;

/**
 * Class for reading character streams to scan for copyright declarations while indexing the
 * newlines for quick line number lookups.
 *
 * <p>Interprets the bytes of the input source as UTF-8 when it can. Reinterprets some non-UTF-8
 * bytes that empirically appear in or near copyrights. In many cases, these correspond to the
 * low-byte of a UTF-16 character stored as-is without requisite escaping for UTF-8. In other cases,
 * these are just characters from other arbitrary code pages.
 *
 * <p>Replaces all other non-UTF-8 (i.e. binary) bytes with '?' because it matches neither name,
 * whitespace, nor comment characters and expresses appropriate uncertainty.
 *
 * <p>While reading, the escapes {@code &quot;}, {@code &#34;}, {@code <var>} and {@code </var>}
 * are each collapsed to a single double-quote character. Character counts and the line index
 * always describe the text after these replacements, i.e. the text handed to the caller.
 */
public class IndexedLineReader implements Readable, Closeable {

  public static final int BUFFER_SIZE = 2048;
  private static final int INITIAL_LINES_CAPACITY = 1024;
  private static final int FALLBACK_BUFFER_SIZE = 16;

  private final String name; // identifies input source
  private final InputStream source; // raw data (bytes) to read
  private final ByteBuffer bb; // io buffer, kept in read mode: position..limit are undecoded bytes

  // Decoded characters waiting to be handed out by the next read(), kept in read mode.
  private CharBuffer pending;
  // Number of leading characters of `pending` already counted in currChar/lineIndex.
  private int pendingIndexed;

  private int currChar; // Count of previously read chars.
  private int currLine; // Count of previously read newlines.

  private final ArrayList<Integer> lineIndex; // Count of chars up to end of each line.

  private final CharsetDecoder decoder; // Converts UTF-8 bytes to chars.
  private boolean atEof; // False until entire source is read.

  /** Character count at which the first non-UTF-8 byte was found, or -1 if none so far. */
  public int firstBinary;

  /** Running total of the malformed-input lengths reported by the decoder. */
  public int numBinary;

  /**
   * @param name Identifies the input source.
   * @param size Hints number of bytes in source. Use -1 if unknown.
   * @param source Input source of bytes (usually UTF-8 encoded) to scan.
   */
  public IndexedLineReader(String name, long size, InputStream source) {
    this.name = name;
    this.source = source;

    // Never allocate more than BUFFER_SIZE, and never less than 8 bytes.
    int bufferSize = (size < 1 || size > BUFFER_SIZE) ? BUFFER_SIZE : (int) size;
    bb = ByteBuffer.wrap(new byte[Math.max(bufferSize, 8)]);
    bb.flip(); // start out empty in read mode

    pending = CharBuffer.allocate(FALLBACK_BUFFER_SIZE);
    pending.flip(); // start out empty in read mode
    pendingIndexed = 0;

    currChar = 0;
    currLine = 0;

    // Guess one line per 30 bytes when the size hint is usable.
    int initialLines =
        (size < 30 || size > 30L * INITIAL_LINES_CAPACITY)
            ? INITIAL_LINES_CAPACITY
            : (int) (size / 30);
    lineIndex = new ArrayList<>(initialLines);
    lineIndex.add(0);

    firstBinary = -1;
    numBinary = 0;

    decoder =
        StandardCharsets.UTF_8
            .newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
  }

  /**
   * Attempts to read characters into the specified character buffer. The buffer is used as a
   * repository of characters as-is: the only changes made are the results of a put operation. No
   * flipping or rewinding of the buffer is performed.
   *
   * <p>An escape sequence that is cut off at the end of the decoded data is held back until more
   * input shows whether it completes. It is passed through unchanged when no more input can
   * follow it (end of input, or a non-UTF-8 byte comes next) or when holding it back would leave
   * nothing to return.
   *
   * @param dest The buffer into which the read characters are put. Must have room for at least 2
   *     characters.
   * @return The number of {@code char} values added to the buffer, or -1 if this source of
   *     characters is at its end.
   * @throws IOException if an I/O error occurs
   * @throws NullPointerException if dest is null
   * @throws IllegalArgumentException if dest has room for fewer than 2 characters
   * @throws java.nio.ReadOnlyBufferException if dest is a read only buffer
   */
  @Override
  public int read(CharBuffer dest) throws IOException {
    if (dest == null) {
      throw new NullPointerException("dest");
    }
    if (dest.remaining() < 2) {
      throw new IllegalArgumentException("dest must have room for at least 2 chars");
    }
    if (atEof && !pending.hasRemaining() && !bb.hasRemaining()) {
      // At end with nothing left in the buffers -- time to indicate EOF.
      return -1;
    }
    final int start = dest.position();
    // Characters of dest before this index are already counted in currChar/lineIndex.
    int scanned = start;
    try {
      if (pending.hasRemaining()) {
        // Hand out previously decoded characters first: all of them or enough to fill dest.
        int n = Math.min(dest.remaining(), pending.remaining());
        int alreadyIndexed = Math.min(n, pendingIndexed);
        for (int k = 0; k < n; k++) {
          dest.put(pending.get());
        }
        pendingIndexed -= alreadyIndexed;
        scanned += alreadyIndexed;
      }
      while (dest.hasRemaining()) {
        CoderResult cr = decoder.decode(bb, dest, atEof);
        // A truncated escape at the tail can only complete if more characters can follow it.
        scanned = indexChars(dest, scanned, !atEof && !cr.isError());
        if (cr.isUnderflow()) { // all bytes decoded -- read more if possible.
          if (atEof) {
            break;
          }
          refill();
          decoder.reset();
        } else if (cr.isOverflow()) {
          // dest filled, or dest has space for 1 character but the next byte sequence needs a
          // surrogate pair. Something was added already because dest started with room for 2.
          break;
        } else {
          // not valid utf-8 sequence -- binary file or other code page...
          if (!dest.hasRemaining()) {
            // No room for the substitute; leave the byte (and its accounting) for the next call.
            break;
          }
          if (firstBinary < 0) {
            firstBinary = currChar;
          }
          numBinary += cr.length();
          // Consume one byte at a time; the substitute gets indexed by the next scan.
          dest.put(substituteFor(bb.get(bb.position())));
          bb.position(bb.position() + 1);
          decoder.reset();
        }
      }
      // Index whatever was appended after the last scan (pending chars that filled dest, or a
      // substitute that filled dest).
      scanned = indexChars(dest, scanned, !atEof);
      if (scanned < dest.position()) {
        if (scanned > start) {
          // Keep the truncated escape for the next call. `pending` holds no indexed characters
          // here: an unscanned tail implies every indexed pending character was already copied.
          pushBack(dest, scanned, dest.position());
          dest.position(scanned);
        } else {
          // Holding it back would return 0 with no way to make progress; pass it through.
          indexChars(dest, scanned, false);
        }
      }
      return dest.position() - start;
    } catch (CharacterCodingException e) {
      throw binaryFile(e);
    } catch (IOException e) {
      throw ioException(e);
    }
  }

  /**
   * Reads a string from the file up to the next delimiter `delim` (or until eof if no delimiter)
   * appending the string to buffer `sb`.
   *
   * <p>Resulting string does not include the delimiter. Characters that were read past the
   * delimiter are kept for the next call to this method or to {@link #read(CharBuffer)}.
   *
   * @param delim The string delimiter. e.g. '\n' or '\000'
   * @param sb A string builder into which the string is read without the delimiter.
   * @return The number of characters read from the stream including the delimiter, or -1 if
   *     nothing could be read.
   * @throws IOException if an I/O error occurs
   */
  public int readString(char delim, StringBuilder sb) throws IOException {
    CharBuffer chunk = CharBuffer.allocate(FALLBACK_BUFFER_SIZE);
    int nRead = 0;
    int tries = 3; // tolerate a few empty reads before giving up
    while (true) {
      chunk.clear();
      int n = read(chunk);
      chunk.flip();
      if (n < 0) {
        return nRead == 0 ? -1 : nRead;
      }
      if (n == 0) {
        tries--;
        if (tries < 1) {
          return nRead == 0 ? -1 : nRead;
        }
        continue;
      }
      while (chunk.hasRemaining()) {
        char c = chunk.get();
        nRead++;
        if (c == delim) {
          unput(chunk);
          return nRead;
        }
        sb.append(c);
      }
    }
  }

  /** Closes the underlying input source. */
  @Override
  public void close() throws IOException {
    source.close();
  }

  /**
   * Returns the line number containing the given character position, `charPosn`.
   *
   * <p>The position is looked up among the cumulative character counts recorded at the end of
   * each line: the result is the 1-based number of the first line whose recorded count is at
   * least `charPosn`.
   */
  public int getLineNumber(int charPosn) {
    int index = Collections.binarySearch(lineIndex, charPosn);
    if (index < 0) { // binarySearch returns inexact matches as negative indexes.
      index = -index - 1;
    }
    return index + 1;
  }

  /**
   * Counts and line-indexes the characters of `dest` from `from` up to its position, collapsing
   * the recognized escapes to a double-quote on the way.
   *
   * @param dest Buffer whose characters before its position are the ones to scan.
   * @param from Index of the first character that is not yet counted.
   * @param mayGrow Whether more characters may still be appended; if so, scanning stops in front
   *     of an escape that is cut off at the end instead of passing it through.
   * @return Index of the first character left uncounted; equals the position of `dest` when
   *     everything was scanned.
   */
  private int indexChars(CharBuffer dest, int from, boolean mayGrow) {
    int i = from;
    while (i < dest.position()) { // position shrinks whenever an escape is collapsed
      char c = dest.get(i);
      if (c == '&' || c == '<') {
        boolean whole =
            (c == '&')
                ? replaceAt(dest, i, "&quot;", '"') && replaceAt(dest, i, "&#34;", '"')
                : replaceAt(dest, i, "<var>", '"') && replaceAt(dest, i, "</var>", '"');
        if (!whole && mayGrow) {
          return i;
        }
      }
      currChar++;
      if (c == '\n') {
        lineIndex.set(currLine, currChar);
        currLine++;
        lineIndex.add(currChar);
      } else {
        lineIndex.set(currLine, currChar);
      }
      i++;
    }
    return i;
  }

  /**
   * Moves undecoded bytes to the front of the io buffer and reads more from the source behind
   * them. Marks end of input when the source is exhausted, and also stops reading once more
   * binary bytes than newlines have been seen.
   */
  private void refill() throws IOException {
    bb.compact();
    int n =
        (numBinary > currLine)
            ? -1
            : source.read(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
    if (n > 0) {
      bb.position(bb.position() + n);
    }
    bb.flip();
    if (n < 0) {
      atEof = true;
    }
  }

  /**
   * Chooses the character that stands in for a byte that is not part of valid UTF-8.
   *
   * <p>There is no need to try to translate all binary data -- some is just binary. Empirically
   * the non-UTF-8 characters below sometimes appear in or near copyrights. In some cases, the file
   * may be encoded with a different code page, or a UTF character above 128 may have been stored
   * without proper escaping. Making these substitutions improves readability of extracted matches
   * and licenses.
   */
  private static char substituteFor(byte b) {
    char c = '?'; // By default, replace binary data with '?'
    // The range U+00c0 to U+00ff are mostly accented characters, which require escaping in
    // UTF-8. The low-order byte sometimes appears without escaping. Just interpreting as chars
    // works in files that include them in copyrights, and doesn't matter when they appear in
    // other binary sequences...
    if (b >= (byte) 0xc0 && b <= (byte) 0xff) {
      c = (char) (b & 0xff);
    }
    switch (b) {
      case (byte) 0: // preserve nul character
        c = '\000';
        break;
      case (byte) 0x87: // sometimes appears where one might expect bullet
      case (byte) 0xb7: // middle-dot could be bullet -- unescaped U+00b7
        c = '*'; // treat bullets the same as comment character '*' -- ignored as whitespace
        break;
      case (byte) 0x85: // sometimes appears where one might expect (TM)
      case (byte) 0x99: // sometimes appears where one might expect (TM)
        c = '\u2122'; // trade mark sign
        break;
      case (byte) 0xa0: // non-breaking space -- unescaped U+00a0
      case (byte) 0xa7: // section symbol -- unescaped U+00a7
      case (byte) 0xad: // soft hyphen -- unescaped U+00ad
      case (byte) 0xb6: // pilcrow or paragraph symbol -- unescaped U+00b6
        // treat as white space
        c = ' ';
        break;
      case (byte) 0xa9: // copyright -- unescaped U+00a9
        c = '\u00a9';
        break;
      case (byte) 0xae: // registered -- unescaped U+00ae
        c = '\u00ae';
        break;
      case (byte) 0x94: // sometimes appears in place of o-umlaut in Bjorn
        c = '\u00f6';
        break;
      default:
        break;
    }
    return c;
  }

  /** Computes the 1-based column of the current character count within line `lineNumber`. */
  private int columnOf(int lineNumber) {
    int index = lineNumber - 1;
    // Characters before the line start are those counted up to the end of the previous line.
    int lineStart = index <= 0 ? 0 : lineIndex.get(Math.min(index, lineIndex.size()) - 1);
    return currChar - lineStart + 1;
  }

  /** Wrap a CharacterCodingException with a BinaryFileException describing file, line, etc. */
  private BinaryFileException binaryFile(CharacterCodingException cause) {
    int lineNumber = getLineNumber(currChar);
    int length = 0;
    if (cause instanceof MalformedInputException) {
      length = ((MalformedInputException) cause).getInputLength();
    } else if (cause instanceof UnmappableCharacterException) {
      length = ((UnmappableCharacterException) cause).getInputLength();
    }
    StringBuilder sb = new StringBuilder();
    sb.append(name);
    // Append the offending bytes in hex, never looking past the buffered input.
    for (int i = 0; i < length && bb.position() + i < bb.limit(); i++) {
      sb.append(String.format(" %02x", bb.get(bb.position() + i)));
    }
    return new BinaryFileException(
        sb.toString(), currChar, lineNumber, columnOf(lineNumber), cause);
  }

  /** Wrap an IOException with a description of the current file, line number and column number. */
  private LineReaderIOException ioException(IOException cause) {
    int lineNumber = getLineNumber(currChar);
    return new LineReaderIOException(
        "IndexedLineReaderIOException " + cause.getMessage() + " " + name,
        currChar,
        lineNumber,
        columnOf(lineNumber),
        cause);
  }

  /**
   * Puts the characters of `src` at indexes `from` (inclusive) to `to` (exclusive) in front of the
   * pending characters so that they are handed out first by the next read.
   */
  private void pushBack(CharBuffer src, int from, int to) {
    int total = (to - from) + pending.remaining();
    CharBuffer merged = CharBuffer.allocate(Math.max(FALLBACK_BUFFER_SIZE, total));
    for (int k = from; k < to; k++) {
      merged.put(src.get(k));
    }
    merged.put(pending);
    merged.flip();
    pending = merged;
  }

  /**
   * Save the remaining characters from `cb` for later. They came out of {@link #read(CharBuffer)}
   * and so are already counted; they are marked as such to keep them from being counted again.
   */
  private void unput(CharBuffer cb) {
    int n = cb.remaining();
    pushBack(cb, cb.position(), cb.limit());
    cb.position(cb.limit());
    pendingIndexed += n;
  }

  /**
   * Conditionally replaces `prefix` when found at `position` in `cb` with `replacement` char.
   *
   * <p>Only the characters before the position of `cb` are considered. On replacement, the
   * characters behind the prefix are shifted down and the position of `cb` is reduced accordingly.
   *
   * @return false if the characters from `position` to the end match the start of `prefix` but
   *     end before `prefix` is complete; true if `prefix` was replaced or does not match.
   */
  private static boolean replaceAt(CharBuffer cb, int position, String prefix, char replacement) {
    for (int i = 0; i < prefix.length(); i++) {
      if (position + i >= cb.position()) {
        return false;
      }
      if (cb.get(position + i) != prefix.charAt(i)) {
        return true;
      }
    }
    cb.put(position, replacement);
    int dst = position + 1;
    int src = position + prefix.length();
    while (src < cb.position()) {
      cb.put(dst, cb.get(src));
      src++;
      dst++;
    }
    cb.position(dst);
    return true;
  }

  /** Describes an IO error at a specific location in a file. */
  public static class LineReaderIOException extends IOException {
    private final int charPosn;
    private final int lineNumber;
    private final int column;

    LineReaderIOException(
        String message, int charPosn, int lineNumber, int column, Throwable cause) {
      super(message, cause);
      this.charPosn = charPosn;
      this.lineNumber = lineNumber;
      this.column = column;
    }

    /** Returns the original message followed by the line, column and character offset. */
    @Override
    public String getMessage() {
      StringBuilder m = new StringBuilder();
      m.append(super.getMessage())
          .append(" line ")
          .append(lineNumber)
          .append(" col ")
          .append(column)
          .append(" offset ")
          .append(charPosn);
      return m.toString();
    }
  }

  /** Thrown when a binary file is detected. */
  public static class BinaryFileException extends LineReaderIOException {
    BinaryFileException(
        String fileName, int charPosn, int lineNumber, int column, Throwable cause) {
      super("Binary file: " + fileName, charPosn, lineNumber, column, cause);
    }
  }
}
