| // Copyright (C) 2019 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package com.googlesource.gerrit.plugins.copyright.lib; |
| |
| import com.google.common.base.Preconditions; |
| import java.io.Closeable; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.nio.BufferOverflowException; |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.CharacterCodingException; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CoderResult; |
| import java.nio.charset.CodingErrorAction; |
| import java.nio.charset.MalformedInputException; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.charset.UnmappableCharacterException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| |
| /** |
| * Class for reading character streams to scan for copyright declarations while indexing the |
| * newlines for quick line number lookups. |
| * |
| * <p>Interprets the bytes of the input source as UTF-8 when it can. Reinterprets some non-UTF-8 |
| * bytes that empirically appear in or near copyrights. In many cases, these correspond to the |
| * low-byte of a UTF-16 character stored as-is without requisite escaping for UTF-8. In other cases, |
| * these are just characters from other arbitrary code pages. |
| * |
| * <p>Replaces all other non-UTF-8 (i.e. binary) bytes with '?' because it matches neither name, |
| * whitespace, nor comment charactes and expresses appropriate uncertainty. |
| */ |
| public class IndexedLineReader implements Readable, Closeable { |
| |
| public static final int BUFFER_SIZE = 2048; |
| private static final int INITIAL_LINES_CAPACITY = 1024; |
| private static final int FALLBACK_BUFFER_SIZE = 16; |
| |
| private String name; // identifies input source |
| private InputStream source; // raw data (bytes) to read |
| private ByteBuffer bb; // io buffer |
| |
| private CharBuffer cb; // Decoded but unread characters. |
| |
| private int currChar; // Count of previously read chars. |
| private int currLine; // Count of previously read newlines. |
| |
| private ArrayList<Integer> lineIndex; // Count of chars up to end of each line. |
| |
| private CharsetDecoder decoder; // Converts UTF-8 bytes to chars. |
| private boolean atEof; // False until entire source is read. |
| |
| public int firstBinary; |
| public int numBinary; |
| |
| /** |
| * @param name Identifies the input source. |
| * @param size Hints number of bytes in source. Use -1 if unknown. |
| * @param source Input source of bytes (usually UTF-8 encoded) to scan. |
| */ |
| public IndexedLineReader(String name, long size, InputStream source) { |
| this.name = name; |
| this.source = source; |
| |
| int bufferSize = size < 1 || size > BUFFER_SIZE ? BUFFER_SIZE : (int) size; |
| bb = ByteBuffer.wrap(new byte[bufferSize > 8 ? bufferSize : 8]); |
| bb.flip(); |
| |
| cb = CharBuffer.allocate(FALLBACK_BUFFER_SIZE); |
| cb.flip(); |
| |
| currChar = 0; |
| |
| int initialLines = |
| size < 30 || size > 30 * INITIAL_LINES_CAPACITY ? INITIAL_LINES_CAPACITY : (int) size / 30; |
| lineIndex = new ArrayList<>(initialLines); |
| lineIndex.add(0); |
| |
| firstBinary = -1; |
| numBinary = 0; |
| |
| decoder = |
| StandardCharsets.UTF_8 |
| .newDecoder() |
| .onMalformedInput(CodingErrorAction.REPORT) |
| .onUnmappableCharacter(CodingErrorAction.REPORT); |
| } |
| |
| /** |
| * Attempts to read characters into the specified character buffer. The buffer is used as a |
| * repository of characters as-is: the only changes made are the results of a put operation. No |
| * flipping or rewinding of the buffer is performed. |
| * |
| * @param dest The buffer into which the read characters are put. |
| * @return The number of {@code char} values added to the buffer, or -1 if this source of |
| * characters is at its end. |
| * @throws IOException if an I/O error occurs |
| * @throws NullPointerException if dest is null |
| * @throws java.nio.ReadOnlyBufferException if dest is a read only buffer |
| */ |
| @Override |
| @SuppressWarnings("ReferenceEquality") |
| public int read(CharBuffer dest) throws IOException { |
| Preconditions.checkNotNull(dest); |
| Preconditions.checkArgument(dest.remaining() >= 2); |
| try { |
| int nPrev = 0; |
| if (atEof && !this.cb.hasRemaining() && !bb.hasRemaining()) { |
| // At end with nothing left in the buffers -- time to indicate EOF. |
| return -1; |
| } |
| if (!dest.hasRemaining()) { |
| throw new BufferOverflowException(); |
| } |
| int nRead = 0; |
| if (this.cb.hasRemaining() && dest != this.cb) { |
| // Copy the previously decoded characters (either all of them or enough to fill dest) into |
| // dest. |
| nPrev = Math.min(dest.remaining(), this.cb.remaining()); |
| dest.put(this.cb); |
| } |
| while (dest.hasRemaining()) { |
| int oldCharOffset = dest.position() - nPrev; |
| nPrev = 0; |
| CoderResult cr = decoder.decode(bb, dest, atEof); |
| nRead += dest.position() - oldCharOffset; |
| // Scan decoded characters to index the line endings. |
| for (int i = oldCharOffset; i < dest.position(); i++) { |
| char c = dest.array()[dest.arrayOffset() + i]; |
| currChar++; |
| if (c == '\n') { |
| lineIndex.set(currLine, currChar); |
| currLine++; |
| lineIndex.add(currChar); |
| } else if (c == '&') { |
| if (!replaceAt(dest, i, """, '"')) { |
| nRead -= cutAt(dest, i); |
| return nRead; |
| } |
| if (!replaceAt(dest, i, """, '"')) { |
| nRead -= cutAt(dest, i); |
| return nRead; |
| } |
| } else if (c == '<') { |
| if (!replaceAt(dest, i, "<var>", '"')) { |
| nRead -= cutAt(dest, i); |
| return nRead; |
| } |
| if (!replaceAt(dest, i, "</var>", '"')) { |
| nRead -= cutAt(dest, i); |
| return nRead; |
| } |
| } |
| lineIndex.set(currLine, currChar); |
| } |
| if (cr.isUnderflow()) { // all bytes decoded -- read more if possible. |
| if (atEof) { |
| break; |
| } |
| bb.compact(); |
| int n = |
| (numBinary > currLine) |
| ? -1 |
| : source.read(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining()); |
| if (n > 0) { |
| bb.position(bb.position() + n); |
| } |
| bb.flip(); |
| if (n < 0) { |
| atEof = true; |
| } |
| decoder.reset(); |
| continue; |
| } else if (cr.isOverflow()) { |
| // dest filled or dest has space for 1 character, but next byte sequence to decode is a |
| // surrogate pair requiring 2 characters to represent. |
| if (nRead == 0) { |
| // Presumably a surrogate pair -- need to buffer the un-read 2nd character of the pair. |
| this.cb.clear(); |
| int oldPosition = bb.position(); |
| decoder.reset(); |
| cr = decoder.decode(bb, this.cb, false); |
| int n = bb.position() - oldPosition; |
| this.cb.flip(); |
| if (n == 0 || !this.cb.hasRemaining()) { |
| // cr must be an error i.e. next byte not part of valid UTF-8 character. |
| dest.put('?'); |
| bb.position(bb.position() + 1); |
| } else { |
| dest.put(this.cb.get()); |
| } |
| nRead++; |
| } |
| break; |
| } else if (cr.isError()) { |
| // not valid utf-8 sequence -- binary file or other code page... |
| if (firstBinary < 0) { |
| firstBinary = currChar; |
| } |
| numBinary += cr.length(); |
| nRead += cr.length(); |
| if (!dest.hasRemaining()) { |
| break; |
| } |
| byte b = bb.array()[bb.arrayOffset() + bb.position()]; |
| char c = '?'; // By default, replace binary data with '?' |
| |
| // There is no need to try to translate all binary data -- some is just binary. |
| // |
| // Empirically the non-UTF-8 characters below sometimes appear in or near copyrights. |
| // In some cases, the file may be encoded with a different code page, or a UTF |
| // character above 128 may have been stored without proper escaping. Making these |
| // substitutions improves readability of extracted matches and licenses. |
| // |
| // The range U+00c0 to U+00ff are mostly accented characters, which require escaping in |
| // UTF-8. The low-order byte sometimes appears without escaping -- perhaps this |
| // corresponds to a different code page? In any case, just interpreting as chars works in |
| // files that include them in copyrights, and doesn't matter when they appear in other |
| // binary sequences... |
| if (b >= (byte) 0xc0 && b <= (byte) 0xff) { |
| c = (char) ('\u0000' | (b & 0xff)); |
| } |
| switch (b) { |
| case (byte) 0: // preserve nul character |
| c = '\000'; |
| break; |
| case (byte) 0x87: // sometimes appears where one might expect bullet |
| case (byte) 0xb7: // middle-dot could be bullet -- unescaped U+00b7 |
| c = '*'; // treat bullets the same as comment character '*' -- ignored as whitespace |
| break; |
| case (byte) 0x85: // sometimes appears where one might expect (TM) |
| case (byte) 0x99: // sometimes appears where one might expect (TM) |
| c = 'â„¢'; |
| break; |
| case (byte) 0xa0: // non-breaking space -- unescaped U+00a0 |
| case (byte) 0xa7: // section symbol -- unescapd U+00a7 |
| case (byte) 0xad: // soft hyphen -- unescaped U+00ad |
| case (byte) 0xb6: // pilcrow or paragraph symbol -- unescaped U+00b6 |
| // treat as white space |
| c = ' '; |
| break; |
| case (byte) 0xa9: // copright -- unescaped U+00a9 |
| c = '©'; |
| break; |
| case (byte) 0xae: // registered -- unescaped U+00ae |
| c = '®'; |
| break; |
| case (byte) 0x94: // sometimes appears in place of ö in Björn |
| c = 'ö'; |
| break; |
| } |
| dest.put(c); |
| bb.position(bb.position() + 1); |
| decoder.reset(); |
| continue; |
| } |
| assert false : "Unexpected CoderResult state: " + cr.toString(); |
| } |
| return nRead; |
| } catch (CharacterCodingException e) { |
| throw binaryFile(e); |
| } catch (IOException e) { |
| throw ioException(e); |
| } |
| } |
| |
| /** |
| * Reads a string from the file up to the next delimiter `delim` (or until eof if no delimiter) |
| * appending the string to buffer `sb`. |
| * |
| * <p>Resulting string does not include the delimiter. |
| * |
| * @param delim The string delimiter. e.g. '\n' or '\000' |
| * @param sb A string builder into which the string is read without the delimiter. |
| * @return The number of characters read from the stream including the delimiter. |
| */ |
| public int readString(char delim, StringBuilder sb) throws IOException { |
| char[] buf = new char[FALLBACK_BUFFER_SIZE]; |
| CharBuffer cb = CharBuffer.wrap(buf); |
| if (this.cb.hasRemaining()) { |
| cb.put(this.cb); |
| } |
| cb.flip(); |
| int nRead = 0; |
| int tries = 3; |
| while (true) { |
| while (cb.hasRemaining()) { |
| char c = cb.get(); |
| nRead++; |
| if (c == delim) { |
| unput(cb); |
| return nRead; |
| } |
| sb.append(c); |
| } |
| cb.clear(); |
| int n = read(cb); |
| cb.flip(); |
| if (n < 0) { |
| if (nRead == 0) { |
| return -1; |
| } |
| break; |
| } else if (n == 0) { |
| tries--; |
| if (tries < 1) { |
| if (nRead == 0) { |
| return -1; |
| } |
| break; |
| } |
| } |
| } |
| return nRead; |
| } |
| |
| @Override |
| public void close() throws IOException { |
| source.close(); |
| } |
| |
| /** Returns the line number containing the given character position, `charPosn`. */ |
| public int getLineNumber(int charPosn) { |
| int index = Collections.binarySearch(lineIndex, charPosn); |
| if (index < 0) { // binarySearch returns inexact matches as negative indexes. |
| index = -index - 1; |
| } |
| return index + 1; |
| } |
| |
| /** Wrap a CharacterCodingException with a BinaryFileException describing file, line, etc. */ |
| private BinaryFileException binaryFile(CharacterCodingException cause) { |
| int lineNumber = getLineNumber(currChar); |
| int index = lineNumber - 1; |
| int column = (index == 0 ? currChar : currChar - lineIndex.get(index - 1)) + 1; |
| int length = 0; |
| if (cause instanceof MalformedInputException) { |
| MalformedInputException me = (MalformedInputException) cause; |
| length = me.getInputLength(); |
| } else if (cause instanceof UnmappableCharacterException) { |
| UnmappableCharacterException ue = (UnmappableCharacterException) cause; |
| length = ue.getInputLength(); |
| } |
| StringBuffer sb = new StringBuffer(); |
| sb.append(name); |
| for (int i = 0; i < length; i++) { |
| sb.append(String.format(" %02x", bb.array()[bb.arrayOffset() + bb.position() + i])); |
| } |
| return new BinaryFileException(sb.toString(), currChar, lineNumber, column, cause); |
| } |
| |
| /** Wrap an IOException with a description of the current file, line number and column number. */ |
| private LineReaderIOException ioException(IOException cause) { |
| int lineNumber = getLineNumber(currChar); |
| int index = lineNumber - 1; |
| int column = (index == 0 ? currChar : currChar - lineIndex.get(index)) + 1; |
| return new LineReaderIOException( |
| "IndexedLineReaderIOException " + cause.getMessage() + " " + name, |
| currChar, |
| lineNumber, |
| column, |
| cause); |
| } |
| |
| /** Cut the current buffer `cb` at `position` putting the rest in `this.cb`. */ |
| @SuppressWarnings("ReferenceEquality") |
| private int cutAt(CharBuffer cb, int position) { |
| if (cb == this.cb) { |
| throw new BufferOverflowException(); |
| } |
| int nCut = cb.position() - position; |
| this.cb.clear(); |
| this.cb.put(cb.array(), cb.arrayOffset() + position, nCut); |
| cb.position(position); |
| this.cb.flip(); |
| return nCut; |
| } |
| |
| /** Save the remaining characters from `cb` onto `this.cb` for later. */ |
| private void unput(CharBuffer cb) { |
| if (!this.cb.hasRemaining()) { |
| this.cb.clear(); |
| this.cb.put(cb); |
| this.cb.flip(); |
| return; |
| } |
| // Shift `this.cb` and prepend `cb` |
| int len = cb.remaining(); |
| if (this.cb.limit() + len > this.cb.capacity()) { |
| throw new BufferOverflowException(); |
| } |
| this.cb.limit(this.cb.limit() + len); |
| for (int i = this.cb.limit() - len - 1; i >= this.cb.position(); i--) { |
| this.cb.put(i + len, this.cb.get(i)); |
| } |
| for (int i = 0; i < len; i++) { |
| this.cb.put(this.cb.position() + i, cb.get()); |
| } |
| } |
| |
| /** Conditionally replaces `prefix` when found at `position` in `cb` with `replacement` char. */ |
| private static boolean replaceAt(CharBuffer cb, int position, String prefix, char replacement) { |
| for (int i = 0; i < prefix.length(); i++) { |
| if (position + i >= cb.position()) { |
| return false; |
| } |
| if (cb.get(position + i) != prefix.charAt(i)) { |
| return true; |
| } |
| } |
| cb.put(position, replacement); |
| int dst = position + 1; |
| int src = position + prefix.length(); |
| while (src < cb.position()) { |
| cb.put(dst, cb.get(src)); |
| src++; |
| dst++; |
| } |
| cb.position(dst); |
| return true; |
| } |
| |
| /** Describes an IO error at a specific location in a file. */ |
| public static class LineReaderIOException extends IOException { |
| private int charPosn; |
| private int lineNumber; |
| private int column; |
| |
| LineReaderIOException( |
| String message, int charPosn, int lineNumber, int column, Throwable cause) { |
| super(message, cause); |
| this.charPosn = charPosn; |
| this.lineNumber = lineNumber; |
| this.column = column; |
| } |
| |
| @Override |
| public String getMessage() { |
| StringBuffer m = new StringBuffer(); |
| m.append(super.getMessage()) |
| .append(" line ") |
| .append(lineNumber) |
| .append(" col ") |
| .append(column) |
| .append(" offset ") |
| .append(charPosn); |
| return m.toString(); |
| } |
| } |
| |
| /** Thrown when a binary file is detected. */ |
| public static class BinaryFileException extends LineReaderIOException { |
| BinaryFileException( |
| String fileName, int charPosn, int lineNumber, int column, Throwable cause) { |
| super("Binary file: " + fileName, charPosn, lineNumber, column, cause); |
| } |
| } |
| } |