// Copyright (C) 2019 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.googlesource.gerrit.plugins.copyright.lib;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Streams;
import java.io.IOException;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Immutable file scanner for copyrights classifying the copyright matches it finds.
 *
 * <p>In general, configure the first-party (1p) and forbidden owners, and any generic owner matches
 * get classified as third-party (3p) automatically. Generally, only configure third-party (3p)
 * owners that the generic pattern will not match for some reason.
 *
 * <p>Licenses are different. Unknown licenses get identified as unknown and treated the same as
 * forbidden. Configure all of the known first-party (1p), third-party (3p) and forbidden licenses.
 *
 * <p>Configure the scanner using simplified regular expressions. The scanner will replace sequences
 * of whitespace with a regular sub-expression matching sequences of whitespace or comment
 * characters. Because the scanner makes this transformation, avoid including whitespace inside
 * character classes.
 *
 * <p>e.g. use "Android Open(?: |-)Source Project" not "Android Open[- ]Source Project"
 *
 * <p>When classifying matches as 1p, 3p or forbidden, the scanner looks for complete matches of
 * configured patterns. i.e. "re.match()" not "re.find()"
 *
 * <p>It's useful to include wildcards in configured patterns to match sub-sequences in generic
 * matches, but these can cause excessive backtracking leading to performance problems or even stack
 * exhaustion. The scanner replaces the wildcards '.*' and '.+' with expressions matching a more
 * limited set of characters for a shorter length that will generally match what is expected.
 *
 * <p>This allows simple configuration patterns like ".*Licen[cs]ed under the Apache Licen[cs]e,?"
 * without the risks normally caused by wildcard patterns.
 */
public final class CopyrightScanner {

  // Compiled configuration. Every field is assigned exactly once, in the private constructor.
  private final Pattern copyright; // Full regular expression for scanner to match.
  private final ImmutableList<Pattern> firstPartyLicenses; // Match 1p licenses.
  private final ImmutableList<Pattern> thirdPartyLicenses; // Match 3p licenses.
  private final ImmutableList<Pattern> forbiddenLicenses; // Match forbidden licenses.
  private final ImmutableList<Pattern> firstPartyOwners; // Match 1p authors/owners.
  private final ImmutableList<Pattern> thirdPartyOwners; // Match 3p authors/owners.
  private final ImmutableList<Pattern> forbiddenOwners; // Match forbidden authors/owners.
  private final ImmutableList<Pattern> contractWords; // Match license words.
  private final ImmutableList<Pattern> excludePatterns; // Exclude normalized matches when found.

  // Most files that have a copyright or license declaration have 1 of them -- or at most 2 or 3.
  // NOTICE files can have thousands all derived from other files in the repository. No need to find
  // them all. Picked a small multiple of the expected number of licenses per file to catch any
  // long-tail files without wasting much effort on derivative NOTICE files etc.
  //
  // Used both to stop the search early and to cap the number of unknown-license matches collected.
  private static final int MATCH_THRESHOLD = 10;

  // Maximum number of characters read from a source and searched for declarations.
  //
  // Determined empirically by scanning millions of files on several hosts and looking at the offset
  // of the first matched copyright or license declaration. A couple .cpp files have copyright
  // declarations near the end of the file for some function or class copied from a third party.
  //
  // The only files where the first match appeared later than 230k or so were a few multi-gigabyte
  // build images derived entirely from other files in the repository. Picked a power of 2 large
  // enough to report all or virtually all of the source files with copyright declarations; even if
  // it doesn't report all of the declarations in the largest source files.
  //
  // There is an obvious trade-off for performance here. Increasing the maximum search length beyond
  // this threshold makes little or no difference for detecting problematic licenses, but does
  // increase scan durations at least linearly for larger files. Reducing the maximum search length
  // significantly below this threshold increases the risk a problematic license will go undetected.
  private static final int MAX_SEARCH_LENGTH = 256 * 1024;

  // All of the MAX parameters below have been chosen empirically similar to MAX_SEARCH_LENGTH to
  // minimize computing cost while still catching virtually all of the important matches.
  //
  // The constants below simply alias the corresponding definitions in CopyrightPatterns.

  /** Maximum length of consecutive text characters to match. */
  private static final int MAX_NAME_LENGTH = CopyrightPatterns.MAX_NAME_LENGTH;
  /** Maximum number of potential names to match. */
  private static final int MAX_NAME_REPETITION = CopyrightPatterns.MAX_NAME_REPETITION;
  /** Maximum length of consecutive space/comment characters to match. */
  private static final int MAX_SPACE_LENGTH = CopyrightPatterns.MAX_SPACE_LENGTH;
  /** Maximum repetition of potential dates to match. Might have to revisit this in future. */
  private static final int MAX_DATE_REPETITION = CopyrightPatterns.MAX_DATE_REPETITION;

  /** Regular expression matching whitespace or a comment character. */
  private static final String WS = CopyrightPatterns.WS;
  /** Regular expression matching whitespace, a comment character, or punctuation. */
  private static final String WSPCT = CopyrightPatterns.WSPCT;
  /** Regular expression matching a web address. */
  private static final String URL = CopyrightPatterns.URL;
  /** Regular expression matching a text or email address. */
  public static final String NAME = CopyrightPatterns.NAME;
  /** Regular expression matching an UPPERCASE text. */
  public static final String UPPER_NAME = CopyrightPatterns.UPPER_NAME;
  /** Regular expression matching a Proper Case text. */
  public static final String PROPER_NAME = CopyrightPatterns.PROPER_NAME;
  /** Regular expression matching any text, email address, or quote character. */
  private static final String ANY_CHAR = CopyrightPatterns.ANY_CHAR;
  /** Regular expression matching any text, email address, or quoted string. */
  public static final String ANY_WORD = CopyrightPatterns.ANY_WORD;

  /**
   * Regular expressions to match arbitrary contract words.
   *
   * <p>Purposefully pushed the definition of common contract words to the lowest levels of the
   * library to make it difficult--but not impossible--to customize the word list.
   *
   * <p>There are many words one can think of that are common to license contracts that do not
   * appear here. For example, "grant" and "permission" lead to many false positives due to their
   * use associated with ACLs and visibility etc. The word "contributed" appears so many times in
   * .xml files in the Android code base that it adds significant latency and had to be removed.
   *
   * <p>Most license declarations will have multiple of these words so if a particular word causes a
   * problem in a particular code base, it is probably okay to remove it for all code bases without
   * too large a reduction in true positives. But please, check first.
   *
   * <p>Take care adding new words to make sure they do increase the number of true positives
   * without causing other problems. Remember that the existing word list was arrived at empirically
   * by adding many candidates and then pruning.
   *
   * <p>If the word lists really must diverge among different code bases, make the 2nd constructor
   * public, and provide different word lists at a higher level.
   *
   * <p>Each entry is a simplified regular expression: the constructor passes it through the same
   * transformation as the configured known patterns before the entries are joined as alternatives
   * in the scanner's combined expression.
   */
  private static final ImmutableList<String> CONTRACT_WORDS =
      ImmutableList.of(
          // Text between a `license>` tag and a closing tag whose name ends in license/licence.
          "license>[-\\s\\p{Z}\\p{L}\\p{N}\\p{P}\\p{S}\\p{M}<>/#*.]{1,512}?</[^>]{0,25}licen[cs]e",
          "agree(?:s|d|ment)?",
          "amendments?",
          "applicable laws?",
          "any manner",
          // author/authored etc. followed by 2 to 5 capitalized (case-sensitive) words.
          "auth?or(?:s|ed|ship)?:?(?-i: \\p{Lu}\\p{Ll}*){2,5}",
          "breach",
          // `by`, with an optional leading verb, followed by an @handle.
          "(?:(?:required|return|allocated|allowed|contributed|copyrighted|generated|provided"
              + "|raised|understandable|used|written) )?by:? @[-\\p{L}\\p{N}._]+",
          // `by`, with an optional leading verb, followed by something shaped like an email address.
          "(?:(?:required|return|allocated|allowed|contributed|copyrighted|generated|provided"
              + "|raised|understandable|used|written) )?by:? [-\\p{L}\\p{N}._]+@[-\\p{L}\\p{N}._]+",
          // `by`, with an optional leading verb, followed by 2 to 5 capitalized words.
          "(?:(?:required|return|allocated|allowed|contributed|copyrighted|generated|provided"
              + "|raised|understandable|used|written) )?by:?(?-i: \\p{Lu}\\p{Ll}*){2,5}",
          "charge for",
          "constitut(?:e|es|ed|ing)",
          "contract(?:s|ed|ing|ual|ually)?",
          // The bare word `contributed` was removed -- frequent appearance in large .xml files
          // increases latency. It remains only as an optional prefix of the `by` entries above.
          "contribut(?:e|es|or|ors|ion|ions)",
          "copyleft",
          "\\p{L}+ copyright(?:able)? \\p{L}+",
          "damages",
          "derivative",
          "disclaim(?:s|ed|er)?",
          "endorsements?",
          " [(]?EUPL[)]? ",
          "exemplary",
          "expressly",
          "fitness",
          "govern(?:s|ed|ing)?",
          "here(?:by|under)",
          "herein(?:after)?",
          "however caused",
          "incidental",
          "infring(?:e|es|ed|ing)",
          "injury",
          "jurisdictions?",
          "lawful",
          "liable",
          "liabilit(?:ies|y)",
          // The negative lookahead skips `license:` / `licence:` when followed by a colon.
          "(?:re)?licen[cs](?:e(?![:])|es|ed|ing|or)",
          "litigation",
          "merchantability",
          "must agree",
          "negligen(?:ce|t)",
          "no event",
          "no provision",
          "(?:non|un)enforce(?:s|d|able|ability)?",
          "nonexclusive",
          "notwithstanding",
          "obligations?",
          "otherwise agreed",
          "perpetu(?:al|ity)",
          "phonorecords?",
          "prior written",
          "provisions",
          "public domain",
          // Case-sensitive: up to 5 leading names followed by PUBLIC LICENSE / Public License.
          "(?-i:(?:" + UPPER_NAME + " ){0,5}PUBLIC LICEN[CS]E)",
          "(?-i:(?:" + PROPER_NAME + " ){0,5}Public Licen[cs]e)",
          "punitive",
          "pursuant",
          "redistribut(?:e|ion)",
          "right to",
          "royalties",
          "set forth",
          " [(]?SISSL[)]? ",
          "SPDX-License-Identifier[:]?",
          "stoppage",
          "terms and conditions",
          "the laws of",
          "third party",
          "tort(?:s|ious)?",
          "trademark",
          "waive(?:s|d|r)?",
          "warrant(?:s|y|ee|ed|ing)?",
          "whatsoever");

  /**
   * Creates a scanner from simplified regular expressions, using the built-in contract word list.
   *
   * <p>Any argument may be null, which is treated the same as an empty list.
   *
   * @param firstPartyLicenses expressions matching first-party (1p) licenses.
   * @param thirdPartyLicenses expressions matching third-party (3p) licenses.
   * @param forbiddenLicenses expressions matching forbidden licenses.
   * @param firstPartyOwners expressions matching first-party (1p) authors/owners.
   * @param thirdPartyOwners expressions matching third-party (3p) authors/owners.
   * @param forbiddenOwners expressions matching forbidden authors/owners.
   * @param excludePatterns regular expressions compiled as-is (not transformed like the others)
   *     and searched for in normalized matches; a match containing any of them is dropped.
   */
  public CopyrightScanner(
      Iterable<String> firstPartyLicenses,
      Iterable<String> thirdPartyLicenses,
      Iterable<String> forbiddenLicenses,
      Iterable<String> firstPartyOwners,
      Iterable<String> thirdPartyOwners,
      Iterable<String> forbiddenOwners,
      Iterable<String> excludePatterns) {
    this(
        firstPartyLicenses,
        thirdPartyLicenses,
        forbiddenLicenses,
        firstPartyOwners,
        thirdPartyOwners,
        forbiddenOwners,
        excludePatterns,
        CONTRACT_WORDS);
  }

  private CopyrightScanner(
      Iterable<String> firstPartyLicenses,
      Iterable<String> thirdPartyLicenses,
      Iterable<String> forbiddenLicenses,
      Iterable<String> firstPartyOwners,
      Iterable<String> thirdPartyOwners,
      Iterable<String> forbiddenOwners,
      Iterable<String> excludePatterns,
      Iterable<String> contractWords) {
    ImmutableList.Builder<Pattern> b = ImmutableList.builder();
    if (firstPartyLicenses != null) {
      for (String license : firstPartyLicenses) {
        b.add(patternizeKnownMatch(license));
      }
    }
    this.firstPartyLicenses = b.build();
    b = ImmutableList.builder();
    if (thirdPartyLicenses != null) {
      for (String license : thirdPartyLicenses) {
        b.add(patternizeKnownMatch(license));
      }
    }
    this.thirdPartyLicenses = b.build();
    b = ImmutableList.builder();
    if (forbiddenLicenses != null) {
      for (String license : forbiddenLicenses) {
        b.add(patternizeKnownMatch(license));
      }
    }
    this.forbiddenLicenses = b.build();
    b = ImmutableList.builder();
    if (firstPartyOwners != null) {
      for (String owner : firstPartyOwners) {
        b.add(patternizeKnownMatch(owner));
      }
    }
    this.firstPartyOwners = b.build();
    b = ImmutableList.builder();
    if (thirdPartyOwners != null) {
      for (String owner : thirdPartyOwners) {
        b.add(patternizeKnownMatch(owner));
      }
    }
    this.thirdPartyOwners = b.build();
    b = ImmutableList.builder();
    if (forbiddenOwners != null) {
      for (String owner : forbiddenOwners) {
        b.add(patternizeKnownMatch(owner));
      }
    }
    this.forbiddenOwners = b.build();
    b = ImmutableList.builder();
    for (String word : contractWords) {
      b.add(patternizeKnownMatch(word));
    }
    this.contractWords = b.build();
    Preconditions.checkArgument(!this.contractWords.isEmpty());
    b = ImmutableList.builder();
    if (excludePatterns != null) {
      for (String pattern : excludePatterns) {
        b.add(Pattern.compile(pattern)); // not transformed because applies to normalized matches
      }
    }
    this.excludePatterns = b.build();
    this.copyright = buildPattern();
  }

  /**
   * Compares scanners by configuration rather than by identity.
   *
   * <p>{@link Pattern} does not override {@code equals}, so comparing compiled patterns directly
   * only succeeds for the very same instance. The source text of the patterns is compared
   * instead. The combined search pattern alone is not sufficient: it joins the known license and
   * owner patterns without recording which list each came from, and it does not contain the
   * exclude patterns at all, so those lists are compared individually as well.
   *
   * @param other the object to compare with; may be null.
   * @return true when {@code other} is a scanner built from equivalent patterns.
   */
  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    if (!(other instanceof CopyrightScanner)) { // also rejects null
      return false;
    }
    CopyrightScanner that = (CopyrightScanner) other;
    return copyright.pattern().equals(that.copyright.pattern())
        && patternSources(firstPartyLicenses).equals(patternSources(that.firstPartyLicenses))
        && patternSources(thirdPartyLicenses).equals(patternSources(that.thirdPartyLicenses))
        && patternSources(forbiddenLicenses).equals(patternSources(that.forbiddenLicenses))
        && patternSources(firstPartyOwners).equals(patternSources(that.firstPartyOwners))
        && patternSources(thirdPartyOwners).equals(patternSources(that.thirdPartyOwners))
        && patternSources(forbiddenOwners).equals(patternSources(that.forbiddenOwners))
        && patternSources(excludePatterns).equals(patternSources(that.excludePatterns));
  }

  /**
   * Hashes the source text of the combined search pattern.
   *
   * <p>Scanners that are equal necessarily share that text, so equal scanners hash alike.
   */
  @Override
  public int hashCode() {
    return copyright.pattern().hashCode();
  }

  /** Returns the source text of each pattern, in order, for value comparison. */
  private static ArrayList<String> patternSources(Iterable<Pattern> patterns) {
    ArrayList<String> sources = new ArrayList<>();
    for (Pattern pattern : patterns) {
      sources.add(pattern.pattern());
    }
    return sources;
  }

  /** Matches normalized text that negates a copyright, e.g. text starting "no copyright". */
  private static final Pattern NEGATED_COPYRIGHT = Pattern.compile("(?i)no copyright(?:able)?.*");

  /** Matches common implementation comments using the word `by`, e.g. text starting "used by ". */
  private static final Pattern IMPLEMENTATION_BY_COMMENT =
      Pattern.compile(
          "(?i:required|return|allocated|allowed|generated|provided|raised|understandable"
              + "|used) by .*");

  /**
   * Scans `source` for copyright notices returning found license/author/owner information.
   *
   * <p>At most {@code MAX_SEARCH_LENGTH} characters are read from the source, in a single read.
   * Words from unknown contracts are reported only when no license capture group matched.
   *
   * @param name Arbitrary string identifying the source. Usually a filename. Must not be null.
   * @param size Hint regarding the expected size of the input source. Use -1 if unknown. Values
   *     below 1 or above the maximum search length fall back to the maximum search length.
   * @param source The source input stream with line endings indexed for lookup. Must not be null.
   * @return the list of matches found in the input stream -- never null.
   * @throws IOException if reading from `source` fails.
   * @throws NullPointerException if `name` or `source` is null.
   */
  public ImmutableList<Match> findMatches(String name, long size, IndexedLineReader source)
      throws IOException {
    Preconditions.checkNotNull(name);
    Preconditions.checkNotNull(source);

    ImmutableList.Builder<Match> builder = ImmutableList.builder();

    // Accumulates unknown licenses in case no known matches found.
    ArrayList<Match> unknowns = new ArrayList<>();

    // Allocate a character buffer using the size hint.
    int searchLength = size < 1 || size > MAX_SEARCH_LENGTH ? MAX_SEARCH_LENGTH : (int) size;
    char[] content = new char[searchLength > 2 ? searchLength : 2]; // minimum 2 chars required
    CharBuffer cb = CharBuffer.wrap(content);

    // Read the input into the character buffer.
    source.read(cb);
    cb.flip(); // Switch from tracking available space to read into to tracking amount read.

    int numUnknown = 0; // track number of contract words from unknown licenses found
    int numLicenses = 0; // track number of licenses versus owners added to the builder
    int numLicenseGroups = // First 2 or 3 captured groups are licenses. Rest are author/owner.
        firstPartyLicenses.isEmpty() && thirdPartyLicenses.isEmpty() && forbiddenLicenses.isEmpty()
            ? 2
            : 3;

    Matcher matcher = copyright.matcher(cb);
    while (matcher.find()) {
      MatchResult mr = matcher.toMatchResult();
      int numBuilt = 0; // track number of matches added to the builder
      for (int i = 1; i <= mr.groupCount(); i++) { // group 0 is entire match not a specific group
        String license = normalizeLicense(mr.group(i));
        if (license == null || license.trim().isEmpty() || isExcluded(license)) {
          continue;
        }
        String owner = normalizeOwner(license);

        // Known licenses take precedence, then unknown licenses, then owners.
        PartyType party = knownLicenseParty(license);
        boolean isLicense = true;
        if (party == null) {
          if (i <= numLicenseGroups || mentionsLicense(license)) {
            party = PartyType.UNKNOWN; // unknown licenses classified as unknown
          } else { // remainder of groups are owner/author copyrights
            isLicense = false;
            party = knownOwnerParty(owner);
            if (party == null) {
              party = PartyType.THIRD_PARTY; // unknown authors classified as third party.
            }
          }
        }

        // The reported text is the entire match; the location is that of the capture group.
        String text = normalizeLicense(mr.group());
        int startLine = source.getLineNumber(mr.start(i));
        int endLine = source.getLineNumber(mr.end(i));
        if (isLicense) {
          builder.add(
              new Match(
                  party, MatchType.LICENSE, text, startLine, endLine, mr.start(i), mr.end(i)));
          numLicenses++;
        } else {
          builder.add(new Match(party, text, startLine, endLine, mr.start(i), mr.end(i)));
        }
        numBuilt++;
      }
      // If no capture group has content, the entire match is a word from an unknown contract.
      // Don't bother accumulating unknown contract matches after known patterns detected.
      if (numLicenses == 0 && numBuilt == 0 && numUnknown <= MATCH_THRESHOLD) {
        String license = normalizeLicense(mr.group());
        if (NEGATED_COPYRIGHT.matcher(license).matches()) { // exclude negated match
          continue;
        }
        if (isExcluded(license)) {
          continue;
        }
        // Exclude common implementation comments using the word `by`. "contributed by",
        // "copyrighted by" and "written by" are not in this list and so remain reportable.
        if (IMPLEMENTATION_BY_COMMENT.matcher(license).matches()) {
          continue;
        }
        int startLine = source.getLineNumber(mr.start());
        int endLine = source.getLineNumber(mr.end());
        String owner = normalizeOwner(license);

        // NOTE(review): known licenses found on this path do not increment numLicenses, so
        // accumulated unknowns are still reported alongside them -- confirm this is intended.
        PartyType party = knownLicenseParty(license);
        if (party != null) {
          builder.add(
              new Match(
                  party, MatchType.LICENSE, license, startLine, endLine, mr.start(), mr.end()));
          continue;
        }
        party = knownOwnerParty(owner);
        if (party != null) {
          builder.add(new Match(party, license, startLine, endLine, mr.start(), mr.end()));
          continue;
        }

        Match priorMatch = !unknowns.isEmpty() ? Iterables.getLast(unknowns) : null;
        // If close to an earlier match (within 6 lines or 300 chars), extend the match to include
        // the new word.
        if (priorMatch != null
            && (startLine - priorMatch.endLine < 6 || mr.start() - priorMatch.end < 300)) {
          priorMatch.text = priorMatch.text + "..." + license;
          priorMatch.endLine = endLine;
          priorMatch.end = mr.end();
        } else {
          // Otherwise, create a new match.
          if (numUnknown < MATCH_THRESHOLD) {
            unknowns.add(
                new Match(
                    PartyType.UNKNOWN,
                    MatchType.LICENSE,
                    license,
                    startLine,
                    endLine,
                    mr.start(),
                    mr.end()));
          }
          numUnknown++;
        }
      }
      // Stop the search early if enough known patterns already matched.
      if (numBuilt >= MATCH_THRESHOLD) {
        break;
      }
    }
    // Return unknown contracts only when found and no known patterns matched.
    if (numLicenses == 0) {
      builder.addAll(unknowns);
    }
    return builder.build();
  }

  /**
   * Classifies `license` against the configured licenses, checking forbidden, then third-party,
   * then first-party. Returns null when no configured license pattern matches.
   */
  private PartyType knownLicenseParty(String license) {
    if (isForbiddenLicense(license)) {
      return PartyType.FORBIDDEN;
    }
    if (isThirdPartyLicense(license)) {
      return PartyType.THIRD_PARTY;
    }
    if (isFirstPartyLicense(license)) {
      return PartyType.FIRST_PARTY;
    }
    return null;
  }

  /**
   * Classifies `owner` against the configured owners, checking forbidden, then third-party, then
   * first-party. Returns null when no configured owner pattern matches.
   */
  private PartyType knownOwnerParty(String owner) {
    if (isForbiddenOwner(owner)) {
      return PartyType.FORBIDDEN;
    }
    if (isThirdPartyOwner(owner)) {
      return PartyType.THIRD_PARTY;
    }
    if (isFirstPartyOwner(owner)) {
      return PartyType.FIRST_PARTY;
    }
    return null;
  }

  /** Returns true when `text` contains "license" or "licence", ignoring case. */
  private static boolean mentionsLicense(String text) {
    // Locale.ROOT keeps the check independent of the default locale (e.g. Turkish dotless i).
    String lowered = text.toLowerCase(Locale.ROOT);
    return lowered.contains("license") || lowered.contains("licence");
  }

  /**
   * Constructs the search pattern incorporating the known matches into the generic regular
   * expression.
   *
   * <p>The first 2 or 3 match groups correspond to license matches. If the configuration specifies
   * known license patterns (1p, 3p or forbidden), the 1st match group will include these matches.
   *
   * <p>If the configuration specifies no known license patterns, the 1st and 2nd match groups will
   * include matches to the generic license pattern. Otherwise, the 2nd and 3rd match groups will
   * include these.
   *
   * <p>Subsequent match groups are all copyright author/owner matches.
   *
   * <p>The arbitrary contract words expression uses a non-capturing group. If none of the other
   * match groups contain any content, the entire match is treated as an unknown license word.
   *
   * <p>While assembling, repetition bounds are written with the placeholders {@code msl},
   * {@code mnr} and {@code mdr}; these are replaced by the corresponding MAX constants just before
   * the expression is compiled.
   *
   * @return the compiled, case-insensitive search pattern.
   */
  private Pattern buildPattern() {
    // Alternation of all contract word expressions.
    StringBuilder words = new StringBuilder();
    for (Pattern word : contractWords) {
      if (words.length() > 0) {
        words.append('|');
      }
      words.append(word);
    }

    // Owner sub-expression: optional "by" and "the" followed by a single capture group containing
    // the known owner alternatives and, last, the generic sequence-of-names alternative.
    StringBuilder owners = new StringBuilder();
    owners.append("(?:by");
    owners.append(WS);
    owners.append("{1,msl})?(?:the");
    owners.append(WS);
    owners.append("{1,msl})?("); // owner expression always captured here
    for (Pattern owner : thirdPartyOwners) {
      String s = owner.toString();
      // Drop a literal leading or trailing `.*` / `.+` from the known pattern's text.
      int start = s.startsWith(".*") || s.startsWith(".+") ? 2 : 0;
      int end = s.endsWith(".*") || s.endsWith(".+") ? s.length() - 2 : s.length();
      owners.append(owner.toString().substring(start, end));
      owners.append('|');
    }
    for (Pattern owner : firstPartyOwners) {
      String s = owner.toString();
      int start = s.startsWith(".*") || s.startsWith(".+") ? 2 : 0;
      int end = s.endsWith(".*") || s.endsWith(".+") ? s.length() - 2 : s.length();
      owners.append(owner.toString().substring(start, end));
      owners.append('|');
    }
    for (Pattern owner : forbiddenOwners) {
      String s = owner.toString();
      int start = s.startsWith(".*") || s.startsWith(".+") ? 2 : 0;
      int end = s.endsWith(".*") || s.endsWith(".+") ? s.length() - 2 : s.length();
      owners.append(owner.toString().substring(start, end));
      owners.append('|');
    }
    owners.append("(?:");
    owners.append(NAME);
    owners.append("(?:");
    owners.append(WS);
    owners.append("{1,msl}");
    owners.append(NAME);
    owners.append("){0,mnr}))"); // end of owner capture

    // One of the frequent objections to regular expressions is the objection that long or complex
    // expressions are difficult to read, and they are. Avoid changes to the expressions below. If
    // given a choice between making a change below or adding a few "known owner"/"known license"
    // patterns to the configuration, bias toward configuration.
    //
    // If that is not possible, one of the most difficult tasks when maintaining these expressions
    // is balancing the parentheses and braces at the appropriate parts. The author of the below
    // expression added a System.err.println() statement to output:
    //   pattern.toString().replaceAll("([(](?:[?][:])?)", "$1\n").replaceAll("([)])", "\n$1")
    // inserting newlines after opening parentheses and before closing parentheses. The output
    // was then fed through an awk script to indent the nested expressions:

    /* awk '
         BEGIN {
           p="";
         }
         $0 ~ /^[)].*$/ {
           p=substr(p,1, length(p)-2);
         }
         {
           print p $0;
         }
         $0 ~ /[(]([?][:])?$/ {
           p=p "  ";
         }
       '
    */
    // From that output, it was possible to see where parentheses balanced and what changes to make
    // to edit the expression correctly. Not for the fainthearted.
    StringBuilder sb = new StringBuilder();

    // Optional known licence capture.
    if (!firstPartyLicenses.isEmpty()
        || !thirdPartyLicenses.isEmpty()
        || !forbiddenLicenses.isEmpty()) {
      sb.append("("); // start of optional 1st captured match group
      sb.append(
          Streams.concat(
                  thirdPartyLicenses.stream(),
                  firstPartyLicenses.stream(),
                  forbiddenLicenses.stream())
              .map(
                  input -> {
                    if (input == null) {
                      return "";
                    }
                    // Same leading/trailing wildcard removal as for the known owners above.
                    String s = input.toString();
                    int start = s.startsWith(".*") || s.startsWith(".+") ? 2 : 0;
                    int end = s.endsWith(".*") || s.endsWith(".+") ? s.length() - 2 : s.length();
                    return input.toString().substring(start, end);
                  })
              .collect(Collectors.joining("|")));
      sb.append(")|"); // end of optional 1st captured group and | to introduce 2nd captured group.
    }

    // Other license captures. -- ends with License
    sb.append("(?:is"); // not captured -- helps confirm license but interferes with matching 1p,3p
    sb.append(WS);
    sb.append("{1,msl}(?:distributed|provided)");
    sb.append(WS);
    sb.append("{1,msl}under(?:");
    sb.append(WS);
    sb.append("{1,msl}(?:the|this))?");
    sb.append(WS);
    sb.append("{1,msl}((?:"); // start of 1st or 2nd captured match group
    sb.append(NAME);
    sb.append(WS);
    sb.append(
        "{1,msl}){1,mnr}?licen[cs]e))[,.;]{0,3}(?![:])"); // end of 1st or 2nd captured match group

    // Other license captures. -- Line starting with License:
    sb.append(
        "|(?-ms:licen[cs]e:[\\p{Z}\\s]{1,msl}("); // start of the 2nd or 3rd captured match group
    sb.append(NAME);
    sb.append("(?:[\\p{Z}\\s]{1,msl}");
    sb.append(NAME);
    sb.append("){0,mnr})\\n)"); // end of 2nd or 3rd captured match group

    // "Author is" copyright capture.
    sb.append("|\\b(?:(?:the"); // not captured--helps confirm but interferes with 1p, 3p, forbidden
    sb.append(WS);
    sb.append("{1,msl}author");
    sb.append(WS);
    sb.append("{1,msl}of");
    sb.append(WS);
    sb.append("{1,msl}this");
    sb.append(WS);
    sb.append("{1,msl}software");
    sb.append(WS);
    sb.append("{1,msl}is|\\b(?:(?:principal");
    sb.append(WS);
    sb.append("{1,msl})?author:?))");
    sb.append(WS);
    sb.append("{1,msl}");
    sb.append(owners.toString()); // owner pattern includes capture group
    sb.append(")");

    // Copyright+year(s)+owner copyright capture.
    sb.append("|(?:"); // not captured -- helps confirm but interferes with 1p, 3p, forbidden
    sb.append(WS);
    sb.append("{0,msl}(?:[(]c[)]|&copy;|©)");
    sb.append(WS);
    sb.append("{0,msl})?(?:(?:copy(?:right|left)(?:");
    sb.append(WS);
    sb.append("{1,msl}notice)?(?:");
    sb.append(WS);
    sb.append("{0,msl}(?:[(]c[)]|&copy;|©))?)|(?:[(]c[)]|&copy;|©))");
    sb.append(WS);
    sb.append("{1,msl}(?:");
    sb.append("[\\p{N}]{2,4}(?:"); // year(s)+owner
    sb.append(WSPCT);
    sb.append("{1,msl}(?:and");
    sb.append(WSPCT);
    sb.append("{1,msl})?[\\p{N}]{2,4}){0,mdr}(?:"); // allows pre-y2k 2-digit years
    sb.append(WSPCT);
    sb.append("{1,msl}(?:present|now))?");
    sb.append(WSPCT);
    sb.append("{1,msl}");
    sb.append(owners.toString()); // owner pattern includes capture group
    sb.append("|"); // owner+year(s)
    sb.append(owners.toString()); // owner pattern includes capture group
    sb.append(WS);
    sb.append("{1,msl}[\\p{N}]{2,4}(?:"); // allows pre-y2k 2-digit years
    sb.append(WSPCT);
    sb.append("{1,msl}(?:and");
    sb.append(WSPCT);
    sb.append("{1,msl})?[\\p{N}]{2,4}){0,mdr}(?:"); // allows pre-y2k 2-digit years
    sb.append(WSPCT);
    sb.append("{1,msl}(?:present|now))?");
    sb.append(")(?:(?:portions)?");
    sb.append(WS);
    sb.append("{0,msl}(?:[(]c[)]|&copy;|©)?");
    sb.append(WS);
    sb.append("{1,msl}copy(?:right|left)(?:");
    sb.append(WS);
    sb.append("{0,msl}(?:[(]c[)]|&copy;|©))?");
    sb.append(WS);
    sb.append("{1,msl}(?:");
    sb.append("[\\p{N}]{2,4}(?:"); // year(s)+owner
    sb.append(WSPCT);
    sb.append("{1,msl}(?:and");
    sb.append(WSPCT);
    sb.append("{1,msl})?[\\p{N}]{2,4}){0,mdr}(?:"); // allows pre-y2k 2-digit years
    sb.append(WSPCT);
    sb.append("{1,msl}(?:present|now))?");
    sb.append(WSPCT);
    sb.append("{1,msl}");
    sb.append(owners.toString()); // owner pattern (repeated) includes capture group
    sb.append("|"); // owner+year(s)
    sb.append(owners.toString()); // owner pattern (repeated) includes capture group
    sb.append(WS);
    sb.append("{1,msl}[\\p{N}]{2,4}(?:"); // allows pre-y2k 2-digit years
    sb.append(WSPCT);
    sb.append("{1,msl}(?:and");
    sb.append(WSPCT);
    sb.append("{1,msl})?[\\p{N}]{2,4}){0,mdr}(?:"); // allows pre-y2k 2-digit years
    sb.append(WSPCT);
    sb.append("{1,msl}(?:present|now))?");
    sb.append(")){0,5}"); // captures 0 to 5 additional author/owner declarations

    // Detect contract words to detect unknown licenses.
    sb.append("|(?:(?:[<\\p{Pi}]|\\b)(?:"); // unknown licenses use non-capturing group
    sb.append(words);
    sb.append(")(?:");
    sb.append(WS);
    sb.append("(?:");
    sb.append(words);
    sb.append(")){0,mnr}(?:[>,.;:\\p{Pf}]|\\b))");

    // Substitute the numeric limits for the repetition placeholders, then compile.
    return Pattern.compile(
        sb.toString()
            .replaceAll("[,]mnl[}]", "," + MAX_NAME_LENGTH + "}")
            .replaceAll("[,]msl[}]", "," + MAX_SPACE_LENGTH + "}")
            .replaceAll("[,]mnr[}]", "," + MAX_NAME_REPETITION + "}")
            .replaceAll("[,]mdr[}]", "," + MAX_DATE_REPETITION + "}"),
        Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.UNICODE_CASE | Pattern.DOTALL);
  }

  /**
   * Checks whether any of the exclude patterns occurs somewhere inside {@code match}.
   *
   * <p>Unlike the owner and license checks, this uses {@link Matcher#find()}, so a pattern only has
   * to match a portion of the text rather than all of it. No null check is performed here.
   *
   * @param match the text to test against the exclude patterns
   * @return true if at least one exclude pattern is found within {@code match}
   */
  private boolean isExcluded(String match) {
    boolean excluded = false;
    for (Pattern exclusion : excludePatterns) {
      if (exclusion.matcher(match).find()) {
        excluded = true;
        break;
      }
    }
    return excluded;
  }

  /**
   * Checks {@code owner} against the first-party owner patterns.
   *
   * @param owner the owner text to test; may be null or empty
   * @return true if {@code owner} is non-empty and at least one first-party owner pattern matches
   *     it in its entirety; false otherwise
   */
  private boolean isFirstPartyOwner(String owner) {
    boolean known = false;
    // Null and empty owners never count as a match.
    if (owner != null && !owner.isEmpty()) {
      for (Pattern candidate : firstPartyOwners) {
        if (candidate.matcher(owner).matches()) {
          known = true;
          break;
        }
      }
    }
    return known;
  }

  /**
   * Checks {@code owner} against the forbidden owner patterns.
   *
   * @param owner the owner text to test; may be null or empty
   * @return true if {@code owner} is non-empty and at least one forbidden owner pattern matches it
   *     in its entirety; false otherwise
   */
  private boolean isForbiddenOwner(String owner) {
    boolean forbidden = false;
    // Null and empty owners never count as a match.
    if (owner != null && !owner.isEmpty()) {
      for (Pattern candidate : forbiddenOwners) {
        if (candidate.matcher(owner).matches()) {
          forbidden = true;
          break;
        }
      }
    }
    return forbidden;
  }

  /**
   * Checks {@code owner} against the third-party owner patterns.
   *
   * @param owner the owner text to test; may be null or empty
   * @return true if {@code owner} is non-empty and at least one third-party owner pattern matches
   *     it in its entirety; false otherwise
   */
  private boolean isThirdPartyOwner(String owner) {
    boolean known = false;
    // Null and empty owners never count as a match.
    if (owner != null && !owner.isEmpty()) {
      for (Pattern candidate : thirdPartyOwners) {
        if (candidate.matcher(owner).matches()) {
          known = true;
          break;
        }
      }
    }
    return known;
  }

  /**
   * Checks {@code license} against the first-party license patterns.
   *
   * <p>No null or empty check is performed here, in contrast to the owner checks.
   *
   * @param license the license text to test
   * @return true if at least one first-party license pattern matches {@code license} in its
   *     entirety
   */
  private boolean isFirstPartyLicense(String license) {
    boolean known = false;
    for (Pattern candidate : firstPartyLicenses) {
      if (candidate.matcher(license).matches()) {
        known = true;
        break;
      }
    }
    return known;
  }

  /**
   * Checks {@code license} against the forbidden license patterns.
   *
   * <p>No null or empty check is performed here, in contrast to the owner checks.
   *
   * @param license the license text to test
   * @return true if at least one forbidden license pattern matches {@code license} in its entirety
   */
  private boolean isForbiddenLicense(String license) {
    boolean forbidden = false;
    for (Pattern candidate : forbiddenLicenses) {
      if (candidate.matcher(license).matches()) {
        forbidden = true;
        break;
      }
    }
    return forbidden;
  }

  /**
   * Checks {@code license} against the third-party license patterns.
   *
   * <p>No null or empty check is performed here, in contrast to the owner checks.
   *
   * @param license the license text to test
   * @return true if at least one third-party license pattern matches {@code license} in its
   *     entirety
   */
  private boolean isThirdPartyLicense(String license) {
    boolean known = false;
    for (Pattern candidate : thirdPartyLicenses) {
      if (candidate.matcher(license).matches()) {
        known = true;
        break;
      }
    }
    return known;
  }

  /**
   * Converts a known matching pattern written in a simplified regular expression language into a
   * regular expression treating comment characters as whitespace and replacing unlimited wildcard
   * expressions with expressions using a limited set of characters and a limited quantifier.
   *
   * <p>A wildcard ({@code .*} or {@code .+}) at the very start or very end of {@code match} is
   * emitted as an unlimited {@code .*}. Wildcards in the interior are replaced with bounded
   * repetitions of {@code ANY_CHAR}. Runs of whitespace become bounded runs of {@code WS}, and
   * whitespace followed by {@code ?} becomes an optional bounded run.
   *
   * <p>A pattern consisting solely of {@code .*} or {@code .+} compiles to {@code .*}.
   *
   * @param match the simplified regular expression; must be non-null and non-empty
   * @return the compiled pattern (case-insensitive, multiline, Unicode-case, dot-all)
   * @throws NullPointerException if {@code match} is null
   * @throws IllegalArgumentException if {@code match} is empty or fails one of the capturing-group
   *     or character-class checks below
   */
  private static Pattern patternizeKnownMatch(String match) {
    Preconditions.checkNotNull(match);
    Preconditions.checkArgument(!match.isEmpty(), "Non-empty pattern required.");
    // Disallow capture groups which will interfere with 1p, 3p, or forbidden classification.
    Preconditions.checkArgument(
        !match.matches("(?:^|.*[^\\[])[(][^?](?:[^:].*|$)"),
        "Capturing group found in /" + match + "/. Use non-capturing (?:...) instead of (...).");
    // Disallow spaces inside character classes because they will get replaced.
    // NOTE(review): as written, this expression only flags a whitespace character that sits
    // immediately before the closing `]` (e.g. `[- ]` is rejected, `[ -]` is not) -- confirm
    // whether that is intended.
    Preconditions.checkArgument(
        !match.matches(".*\\[[^]]*[\\s\\p{Z}][]].*"),
        "Character class with space in /" + match + "/. Use (?: |...) instead of space in [...].");

    // A leading wildcard is kept unlimited. Note a leading `.+` is also emitted as `.*`.
    String prefix = "";
    if (match.startsWith(".*") || match.startsWith(".+")) {
      prefix = ".*";
    }
    // Look for the trailing wildcard only in what is left after the prefix. Otherwise a pattern
    // that is nothing but `.*` or `.+` would have the same two characters claimed as both prefix
    // and suffix, and the substring below would be asked for a negative-length range.
    String remainder = match.substring(prefix.length());
    String suffix = "";
    if (remainder.endsWith(".*") || remainder.endsWith(".+")) {
      suffix = ".*";
    }
    String interior = remainder.substring(0, remainder.length() - suffix.length());

    // Replace unlimited "any char" wildcards that can cost too much backtracking with patterns that
    // match a smaller subset of characters with more limited quantifiers.
    //
    // The literal space after `(?:` is intentional: it is itself rewritten by the final whitespace
    // replacement below. Backslashes are doubled because these strings are used as regex
    // replacement text, where a single backslash is an escape character.
    String boundedChars = "(?: " + ANY_CHAR + "{1," + MAX_NAME_LENGTH + "}){";
    String zeroOrMore = (boundedChars + "0," + MAX_NAME_REPETITION + "}").replace("\\", "\\\\");
    String oneOrMore = (boundedChars + "1," + MAX_NAME_REPETITION + "}").replace("\\", "\\\\");

    // Replace any sequence of whitespace with a regular expression to match any non-empty sequence
    // of whitespace or comment characters (or a possibly-empty one when followed by `?`).
    String escapedWs = WS.replace("\\", "\\\\");
    String optionalSpace = escapedWs + "{0," + MAX_SPACE_LENGTH + "}";
    String requiredSpace = escapedWs + "{1," + MAX_SPACE_LENGTH + "}";

    // The order of these replacements matters: wildcards first, then optional whitespace, then
    // the remaining whitespace.
    String bounded =
        interior
            .replaceAll("[.][*]", zeroOrMore)
            .replaceAll("[.][+]", oneOrMore)
            .replaceAll("[\\s\\p{Z}]+[?]", optionalSpace)
            .replaceAll("[\\s\\p{Z}]+", requiredSpace);

    return Pattern.compile(
        prefix + bounded + suffix,
        Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.UNICODE_CASE | Pattern.DOTALL);
  }

  /**
   * Collapses every run of whitespace and comment characters in {@code match} into one space, while
   * copying URLs and end-tag openers ({@code </}) through untouched because they often contain
   * `/` or `#` as non-comment characters. The result is trimmed.
   *
   * @param match the raw matched text; may be null
   * @return the normalized text, or null if {@code match} is null
   */
  private static String normalizeLicense(String match) {
    if (match == null) {
      return null;
    }
    String gapPattern = WS + "+";
    Matcher preserved = Pattern.compile(URL + "|[<][/]").matcher(match); // URL or end-tag
    StringBuilder normalized = new StringBuilder();
    int cursor = 0;
    while (preserved.find()) {
      // Normalize the text between the previous preserved span and this one.
      if (preserved.start() > cursor) {
        String gap = match.substring(cursor, preserved.start());
        normalized.append(gap.replaceAll(gapPattern, " "));
      }
      // Copy the URL or end-tag opener verbatim.
      normalized.append(preserved.group());
      cursor = preserved.end();
    }
    // Normalize whatever follows the last preserved span.
    if (cursor < match.length()) {
      String tail = match.substring(cursor);
      normalized.append(tail.replaceAll(gapPattern, " "));
    }
    return normalized.toString().trim();
  }

  /**
   * Matches a single space followed by one of the common words or phrases that mark the start of
   * text that is not part of an author/owner name. Compiled once rather than on every call.
   */
  private static final Pattern OWNER_SUFFIX_PATTERN =
      Pattern.compile(
          "(?i)[ ](?:all rights|(?:the|this) [^ ]+(?: [^ ]+){0,2} (?:is|assumes|may)"
              + "|permission|copyright|version \\p{N}|for conditions|include |include$"
              + "|modification|however|open source license|please (?:use|read)|libname"
              + "|if defined|usage|this is free|added|generic|redistribution|ifdef|ifndef"
              + "|for (?:more|terms)|copying and|you (?:may|can)|released under|see the"
              + "|full source|freedom to use|this program and|distributed|https?|unit ?test"
              + "|import|static|by obtaining|by using|by copying|example|namespace|config\\b"
              + "|public (?:static|final|class)|package (?:org|com)|[^ ]+ is hereby)");

  /**
   * Strips common non-author/owner suffixes that get picked up unintentionally from previously
   * normalized license with sequences of whitespace and comment characters replaced with a single
   * space preserving URLS, which often contain `/` or `#` as non-comment characters.
   *
   * <p>The generic license pattern always ends by matching the word `license` or stops at the end
   * of the line so it does not pick up spurious additional text. The generic owner pattern does not
   * end in a specific word so it often includes spurious additional words like #ifdef or #ifndef,
   * which interfere when comparing the match against known author/owner patterns. This method
   * strips the most common non-author/owner words from the end of the match.
   *
   * <p>Everything from the first stop word onward is dropped. If the text starts with a stop word
   * (for example {@code " copyright"}), the result is the empty string.
   *
   * @param license the normalized matched text; may be null
   * @return the text preceding the first stop word, the whole text if there is none, or null if
   *     {@code license} is null
   */
  private static String normalizeOwner(String license) {
    if (license == null) {
      return null;
    }
    // Taking the text before the first match is what `split(regex)[0]` yields, except that it
    // also works when nothing but stop words is present. In that case `split` discards every
    // (empty) segment and returns an empty array, so indexing it would throw.
    Matcher stopWord = OWNER_SUFFIX_PATTERN.matcher(license);
    return stopWord.find() ? license.substring(0, stopWord.start()) : license;
  }

  /**
   * Identifies the relevant party as 1p, 3p, forbidden, or unknown.
   *
   * <p>Per the scanner's class documentation, unknown licenses are treated the same as forbidden.
   */
  public enum PartyType {
    /** First party (1p). */
    FIRST_PARTY,
    /** Third party (3p). */
    THIRD_PARTY,
    /** Not recognized as first party, third party, or forbidden. */
    UNKNOWN,
    /** Forbidden. */
    FORBIDDEN,
  }

  /** Identifies whether text matched by author/owner pattern or by license pattern. */
  public enum MatchType {
    /** The text was matched by the author/owner pattern. */
    AUTHOR_OWNER,
    /** The text was matched by the license pattern. */
    LICENSE,
  }

  /**
   * A single copyright author/owner or license hit found in the input stream.
   *
   * <p>Each hit carries the party classification ({@code FIRST_PARTY}, {@code THIRD_PARTY},
   * {@code FORBIDDEN}, or {@code UNKNOWN}), whether it is an {@code AUTHOR_OWNER} or a {@code
   * LICENSE} hit, the normalized matched text, and where in the file the text was found.
   *
   * <p>All fields are public and mutable.
   */
  public static class Match {
    /** Classifies relevant party as 1p, 3p, forbidden, or unknown. */
    public PartyType partyType;
    /** Classifies match as author/owner or as license. */
    public MatchType matchType;
    /** Matched text with spaces and comment characters replaced by a single space. */
    public String text;
    /** The line number in the file where the match starts. */
    public int startLine;
    /** The line number in the file where the match ends. */
    public int endLine;
    /** The character offset into the file where the match starts. */
    public int start;
    /** The character offset into the file where the match ends. */
    public int end;

    /**
     * Creates a match with every field given explicitly.
     *
     * @param partyType the party classification
     * @param matchType whether this is an author/owner or a license match
     * @param text the normalized matched text
     * @param startLine the line number where the match starts
     * @param endLine the line number where the match ends
     * @param start the character offset where the match starts
     * @param end the character offset where the match ends
     */
    public Match(
        PartyType partyType,
        MatchType matchType,
        String text,
        int startLine,
        int endLine,
        int start,
        int end) {
      this.partyType = partyType;
      this.matchType = matchType;
      this.text = text;
      this.startLine = startLine;
      this.endLine = endLine;
      this.start = start;
      this.end = end;
    }

    /**
     * Creates an {@code AUTHOR_OWNER} match; shorthand for the full constructor with {@link
     * MatchType#AUTHOR_OWNER}.
     *
     * @param partyType the party classification
     * @param text the normalized matched text
     * @param startLine the line number where the match starts
     * @param endLine the line number where the match ends
     * @param start the character offset where the match starts
     * @param end the character offset where the match ends
     */
    public Match(PartyType partyType, String text, int startLine, int endLine, int start, int end) {
      this(partyType, MatchType.AUTHOR_OWNER, text, startLine, endLine, start, end);
    }
  }
}
