| // Copyright (C) 2019 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package com.googlesource.gerrit.plugins.copyright.lib; |
| |
| import com.google.common.base.Preconditions; |
| import com.google.common.collect.ImmutableList; |
| import com.google.common.collect.Iterables; |
| import com.google.common.collect.Streams; |
| import java.io.IOException; |
| import java.nio.CharBuffer; |
| import java.util.ArrayList; |
| import java.util.Locale; |
| import java.util.regex.MatchResult; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import java.util.stream.Collectors; |
| |
| /** |
| * Immutable file scanner for copyrights classifying the copyright matches it finds. |
| * |
| * <p>In general, configure the first-party (1p) and forbidden owners, and any generic owner matches |
| * get classified as third-party (3p) automatically. Generally, only configure third-party (3p) |
| * owners that the generic pattern will not match for some reason. |
| * |
| * <p>Licenses are different. Unknown licenses get identified as unknown and treated the same as |
| * forbidden. Configure all of the known first-party (1p), third-party (3p) and forbidden licenses. |
| * |
| * <p>Configure the scanner using simplified regular expressions. The scanner will replace sequences |
| * of whitespace with a regular sub-expression matching sequences of whitespace or comment |
| * characters. Because the scanner makes this transformation, avoid including whitespace inside |
| * character classes. |
| * |
| * <p>e.g. use "Android Open(?: |-)Source Project" not "Android Open[- ]Source Project" |
| * |
| * <p>When classifying matches as 1p, 3p or forbidden, the scanner looks for complete matches of |
| * configured patterns. i.e. "re.match()" not "re.find()" |
| * |
| * <p>It's useful to include wildcards in configured patterns to match sub-sequences in generic |
| * matches, but these can cause excessive backtracking leading to performance problems or even stack |
| * exhaustion. The scanner replaces the wildcards '.*' and '.+' with expressions matching a more |
| * limited set of characters for a shorter length that will generally match what is expected. |
| * |
| * <p>This allows simple configuration patterns like ".*Licen[cs]ed under the Apache Licen[cs]e,?" |
| * without the risks normally caused by wildcard patterns. |
| */ |
| public final class CopyrightScanner { |
| |
| // Compiled patterns. Every field is final and assigned exactly once by the private constructor. |
| private final Pattern copyright; // Full regular expression for scanner to match. |
| private final ImmutableList<Pattern> firstPartyLicenses; // Match 1p licenses. |
| private final ImmutableList<Pattern> thirdPartyLicenses; // Match 3p licenses. |
| private final ImmutableList<Pattern> forbiddenLicenses; // Match forbidden licenses. |
| private final ImmutableList<Pattern> firstPartyOwners; // Match 1p authors/owners. |
| private final ImmutableList<Pattern> thirdPartyOwners; // Match 3p authors/owners. |
| private final ImmutableList<Pattern> forbiddenOwners; // Match forbidden authors/owners. |
| private final ImmutableList<Pattern> contractWords; // Match license words. |
| private final ImmutableList<Pattern> excludePatterns; // Exclude when found in normalized match. |
| |
| // Most files that have a copyright or license declaration have 1 of them -- or at most 2 or 3. |
| // NOTICE files can have thousands all derived from other files in the repository. No need to find |
| // them all. Picked a small multiple of the expected number of licenses per file to catch any |
| // long-tail files without wasting much effort on derivative NOTICE files etc. |
| private static final int MATCH_THRESHOLD = 10; |
| |
| // Determined empirically by scanning millions of files on several hosts and looking at the offset |
| // of the first matched copyright or license declaration. A couple .cpp files have copyright |
| // declarations near the end of the file for some function or class copied from a third party. |
| // |
| // The only files where the first match appeared later than 230k or so were a few multi-gigabyte |
| // build images derived entirely from other files in the repository. Picked a power of 2 large |
| // enough to report all or virtually all of the source files with copyright declarations; even if |
| // it doesn't report all of the declarations in the largest source files. |
| // |
| // There is an obvious trade-off for performance here. Increasing the maximum search length beyond |
| // this threshold makes little or no difference for detecting problematic licenses, but does |
| // increase scan durations at least linearly for larger files. Reducing the maximum search length |
| // significantly below this threshold increases the risk a problematic license will go undetected. |
| private static final int MAX_SEARCH_LENGTH = 256 * 1024; |
| |
| // All of the MAX parameters below have been chosen empirically similar to MAX_SEARCH_LENGTH to |
| // minimize computing cost while still catching virtually all of the important matches. |
| // buildPattern() substitutes them for the symbolic quantifier bounds mnl, msl, mnr and mdr. |
| |
| /** Maximum length of consecutive text characters to match. */ |
| private static final int MAX_NAME_LENGTH = CopyrightPatterns.MAX_NAME_LENGTH; |
| /** Maximum number of potential names to match. */ |
| private static final int MAX_NAME_REPETITION = CopyrightPatterns.MAX_NAME_REPETITION; |
| /** Maximum length of consecutive space/comment characters to match. */ |
| private static final int MAX_SPACE_LENGTH = CopyrightPatterns.MAX_SPACE_LENGTH; |
| /** Maximum repetition of potential dates to match. Might have to revisit this in future. */ |
| private static final int MAX_DATE_REPETITION = CopyrightPatterns.MAX_DATE_REPETITION; |
| |
| /** Regular expression matching whitespace or a comment character. */ |
| private static final String WS = CopyrightPatterns.WS; |
| /** Regular expression matching whitespace, a comment character, or punctuation. */ |
| private static final String WSPCT = CopyrightPatterns.WSPCT; |
| /** Regular expression matching a web address. */ |
| private static final String URL = CopyrightPatterns.URL; |
| /** Regular expression matching a text or email address. */ |
| public static final String NAME = CopyrightPatterns.NAME; |
| /** Regular expression matching an UPPERCASE text. */ |
| public static final String UPPER_NAME = CopyrightPatterns.UPPER_NAME; |
| /** Regular expression matching a Proper Case text. */ |
| public static final String PROPER_NAME = CopyrightPatterns.PROPER_NAME; |
| /** Regular expression matching any text, email address, or quote character. */ |
| private static final String ANY_CHAR = CopyrightPatterns.ANY_CHAR; |
| /** Regular expression matching any text, email address, or quoted string. */ |
| public static final String ANY_WORD = CopyrightPatterns.ANY_WORD; |
| |
| /** |
| * Regular expressions to match arbitrary contract words. |
| * |
| * <p>Purposefully pushed the definition of common contract words to the lowest levels of the |
| * library to make it difficult--but not impossible--to customize the word list. |
| * |
| * <p>There are many words one can think of that are common to license contracts that do not |
| * appear here. For example, "grant" and "permission" lead to many false positives due to their |
| * use associated with ACLs and visibility etc. The word "contributed" appears so many times in |
| * .xml files in the Android code base that it adds significant latency and had to be removed. |
| * |
| * <p>Most license declarations will have multiple of these words so if a particular word causes a |
| * problem in a particular code base, it is probably okay to remove it for all code bases without |
| * too large a reduction in true positives. But please, check first. |
| * |
| * <p>Take care adding new words to make sure they do increase the number of true positives |
| * without causing other problems. Remember that the existing word list was arrived at empirically |
| * by adding many candidates and then pruning. |
| * |
| * <p>If the word lists really must diverge among different code bases, make the 2nd constructor |
| * public, and provide different word lists at a higher level. |
| * |
| * <p>Each entry is passed through patternizeKnownMatch() by the constructor and the results are |
| * joined with '|' inside the composite pattern, which is compiled case-insensitively. |
| */ |
| private static final ImmutableList<String> CONTRACT_WORDS = |
| ImmutableList.of( |
| "agree(?:s|d|ment)?", |
| "amendments?", |
| "applicable laws?", |
| "any manner", |
| // (?-i: ...) switches case-insensitivity off: requires 2 to 5 words that each start with an |
| // uppercase letter followed only by lowercase letters. |
| "auth?or(?:s|ed|ship)?:?(?-i: \\p{Lu}\\p{Ll}*){2,5}", |
| "breach", |
| // The three `by` expressions below differ only in what follows `by`: an @handle, an email |
| // address, or a Proper Case name. The optional leading verb becomes part of the matched text, |
| // which findMatches() inspects when handling unknown matches. |
| "(?:(?:required|return|allocated|allowed|contributed|copyrighted|generated|provided" |
| + "|raised|understandable|used|written) )?by:? @[-\\p{L}\\p{N}._]+", |
| "(?:(?:required|return|allocated|allowed|contributed|copyrighted|generated|provided" |
| + "|raised|understandable|used|written) )?by:? [-\\p{L}\\p{N}._]+@[-\\p{L}\\p{N}._]+", |
| "(?:(?:required|return|allocated|allowed|contributed|copyrighted|generated|provided" |
| + "|raised|understandable|used|written) )?by:?(?-i: \\p{Lu}\\p{Ll}*){2,5}", |
| "charge for", |
| "constitut(?:e|es|ed|ing)", |
| "contract(?:s|ed|ing|ual|ually)?", |
| // contributed removed -- frequent appearance in large .xml files increases latency |
| "contribut(?:e|es|or|ors|ion|ions)", |
| "copyleft", |
| "\\p{L}+ copyright(?:able)? \\p{L}+", |
| "damages", |
| "derivative", |
| "disclaim(?:s|ed|er)?", |
| "endorsements?", |
| // The spaces around the acronym are part of the expression. |
| " [(]?EUPL[)]? ", |
| "exemplary", |
| "expressly", |
| "fitness", |
| "govern(?:s|ed|ing)?", |
| "here(?:by|under)", |
| "herein(?:after)?", |
| "however caused", |
| "incidental", |
| "infring(?:e|es|ed|ing)", |
| "injury", |
| "jurisdictions?", |
| "lawful", |
| "liable", |
| "liabilit(?:ies|y)", |
| // The (?![:]) lookahead keeps "license:" / "licence:" from matching as a contract word. |
| "(?:re)?licen[cs](?:e(?![:])|es|ed|ing|or)", |
| "litigation", |
| "merchantability", |
| "must agree", |
| "negligen(?:ce|t)", |
| "no event", |
| "no provision", |
| "(?:non|un)enforce(?:s|d|able|ability)?", |
| "nonexclusive", |
| "notwithstanding", |
| "obligations?", |
| "otherwise agreed", |
| "perpetu(?:al|ity)", |
| "phonorecords?", |
| "prior written", |
| "provisions", |
| "public domain", |
| // Case-sensitive: up to 5 UPPERCASE (or Proper Case) words preceding the license phrase. |
| "(?-i:(?:" + UPPER_NAME + " ){0,5}PUBLIC LICEN[CS]E)", |
| "(?-i:(?:" + PROPER_NAME + " ){0,5}Public Licen[cs]e)", |
| "punitive", |
| "pursuant", |
| "redistribut(?:e|ion)", |
| "right to", |
| "royalties", |
| "set forth", |
| // The spaces around the acronym are part of the expression. |
| " [(]?SISSL[)]? ", |
| "SPDX-License-Identifier[:]?", |
| "stoppage", |
| "terms and conditions", |
| "the laws of", |
| "third party", |
| "tort(?:s|ious)?", |
| "trademark", |
| "waive(?:s|d|r)?", |
| "warrant(?:s|y|ee|ed|ing)?", |
| "whatsoever"); |
| |
| /** |
| * Constructs a scanner from simplified regular expressions using the built-in contract words. |
| * |
| * <p>Delegates to the private constructor passing CONTRACT_WORDS as the word list. A null |
| * argument is treated the same as an empty one. |
| * |
| * @param firstPartyLicenses expressions matching known first-party (1p) licenses. |
| * @param thirdPartyLicenses expressions matching known third-party (3p) licenses. |
| * @param forbiddenLicenses expressions matching known forbidden licenses. |
| * @param firstPartyOwners expressions matching known first-party (1p) authors/owners. |
| * @param thirdPartyOwners expressions matching known third-party (3p) authors/owners. |
| * @param forbiddenOwners expressions matching known forbidden authors/owners. |
| * @param excludePatterns regular expressions compiled as-is; a match whose normalized text |
| * contains any of them gets excluded from the results. |
| */ |
| public CopyrightScanner( |
| Iterable<String> firstPartyLicenses, |
| Iterable<String> thirdPartyLicenses, |
| Iterable<String> forbiddenLicenses, |
| Iterable<String> firstPartyOwners, |
| Iterable<String> thirdPartyOwners, |
| Iterable<String> forbiddenOwners, |
| Iterable<String> excludePatterns) { |
| this( |
| firstPartyLicenses, |
| thirdPartyLicenses, |
| forbiddenLicenses, |
| firstPartyOwners, |
| thirdPartyOwners, |
| forbiddenOwners, |
| excludePatterns, |
| CONTRACT_WORDS); |
| } |
| |
| /** |
|  * Compiles all configured expressions and then assembles the composite search pattern. |
|  * |
|  * <p>License, owner and contract-word expressions go through patternizeKnownMatch(). Exclude |
|  * patterns are compiled unchanged because they apply to normalized matches. Any of the first |
|  * seven arguments may be null, which is treated as empty. |
|  * |
|  * @param contractWords expressions for arbitrary contract words; must be non-null and non-empty. |
|  * @throws NullPointerException if `contractWords` is null. |
|  * @throws IllegalArgumentException if `contractWords` is empty. |
|  */ |
| private CopyrightScanner( |
|     Iterable<String> firstPartyLicenses, |
|     Iterable<String> thirdPartyLicenses, |
|     Iterable<String> forbiddenLicenses, |
|     Iterable<String> firstPartyOwners, |
|     Iterable<String> thirdPartyOwners, |
|     Iterable<String> forbiddenOwners, |
|     Iterable<String> excludePatterns, |
|     Iterable<String> contractWords) { |
|   this.firstPartyLicenses = compileKnownMatches(firstPartyLicenses); |
|   this.thirdPartyLicenses = compileKnownMatches(thirdPartyLicenses); |
|   this.forbiddenLicenses = compileKnownMatches(forbiddenLicenses); |
|   this.firstPartyOwners = compileKnownMatches(firstPartyOwners); |
|   this.thirdPartyOwners = compileKnownMatches(thirdPartyOwners); |
|   this.forbiddenOwners = compileKnownMatches(forbiddenOwners); |
| |
|   // Unlike the optional lists above, the contract words are mandatory. |
|   Preconditions.checkNotNull(contractWords, "contractWords"); |
|   this.contractWords = compileKnownMatches(contractWords); |
|   Preconditions.checkArgument( |
|       !this.contractWords.isEmpty(), "At least one contract word is required."); |
| |
|   ImmutableList.Builder<Pattern> excludes = ImmutableList.builder(); |
|   if (excludePatterns != null) { |
|     for (String pattern : excludePatterns) { |
|       excludes.add(Pattern.compile(pattern)); // not transformed: applies to normalized matches |
|     } |
|   } |
|   this.excludePatterns = excludes.build(); |
| |
|   // Must come last: buildPattern() reads the lists assigned above. |
|   this.copyright = buildPattern(); |
| } |
| |
| /** Converts each simplified expression with patternizeKnownMatch(); null yields an empty list. */ |
| private static ImmutableList<Pattern> compileKnownMatches(Iterable<String> expressions) { |
|   ImmutableList.Builder<Pattern> compiled = ImmutableList.builder(); |
|   if (expressions != null) { |
|     for (String expression : expressions) { |
|       compiled.add(patternizeKnownMatch(expression)); |
|     } |
|   } |
|   return compiled.build(); |
| } |
| |
| /** |
|  * Compares scanners by configuration: the text and flags of the composite pattern and of every |
|  * configured pattern list. |
|  * |
|  * <p>java.util.regex.Pattern does not override equals(), so comparing Pattern objects directly |
|  * would only ever succeed for the very same instance. |
|  */ |
| @Override |
| public boolean equals(Object other) { |
|   if (this == other) { |
|     return true; |
|   } |
|   if (!(other instanceof CopyrightScanner)) { // also false for null |
|     return false; |
|   } |
|   return equalitySignature().equals(((CopyrightScanner) other).equalitySignature()); |
| } |
| |
| /** Hashes the composite pattern text, which is part of what equals() compares. */ |
| @Override |
| public int hashCode() { |
|   return copyright.pattern().hashCode(); |
| } |
| |
| /** Flattens the flags and text of all patterns into a list of strings usable for equality. */ |
| private ArrayList<String> equalitySignature() { |
|   ArrayList<String> signature = new ArrayList<>(); |
|   signature.add(copyright.flags() + ":" + copyright.pattern()); |
|   ImmutableList<ImmutableList<Pattern>> groups = |
|       ImmutableList.of( |
|           firstPartyLicenses, |
|           thirdPartyLicenses, |
|           forbiddenLicenses, |
|           firstPartyOwners, |
|           thirdPartyOwners, |
|           forbiddenOwners, |
|           contractWords, |
|           excludePatterns); |
|   for (ImmutableList<Pattern> group : groups) { |
|     signature.add("#" + group.size()); // size marker keeps the list boundaries unambiguous |
|     for (Pattern pattern : group) { |
|       signature.add(pattern.flags() + ":" + pattern.pattern()); |
|     } |
|   } |
|   return signature; |
| } |
| |
| /** |
| * Scans `source` for copyright notices returning found license/author/owner information. |
| * |
| * @param name Arbitrary string identifying the source. Usually a filename. |
| * @param size Hint regarding the expected size of the input source. Use -1 if unknown. |
| * @param source The source input stream with line endings indexed for lookup. |
| * @return the list of matches found in the input stream -- never null. |
| */ |
| public ImmutableList<Match> findMatches(String name, long size, IndexedLineReader source) |
| throws IOException { |
| Preconditions.checkNotNull(name); |
| Preconditions.checkNotNull(source); |
| |
| ImmutableList.Builder<Match> builder = ImmutableList.builder(); |
| |
| // Accumulates unknown licenses in case no known matches found. |
| ArrayList<Match> unknowns = new ArrayList<>(); |
| |
| // Allocate a character buffer using the size hint. |
| int searchLength = size < 1 || size > MAX_SEARCH_LENGTH ? MAX_SEARCH_LENGTH : (int) size; |
| char[] content = new char[searchLength > 2 ? searchLength : 2]; // minimum 2 chars required |
| CharBuffer cb = CharBuffer.wrap(content); |
| |
| // Read the input into the character buffer. |
| source.read(cb); |
| cb.flip(); // Switch from tracking available space to read into to tracking amount read. |
| |
| int numUnknown = 0; // track number of contract words from unknown licenses found |
| int numLicenses = 0; // track number of licenses versus owners added to the builder |
| int numLicenseGroups = // First 2 or 3 captured groups are licenses. Rest are author/owner. |
| firstPartyLicenses.isEmpty() && thirdPartyLicenses.isEmpty() && forbiddenLicenses.isEmpty() |
| ? 2 |
| : 3; |
| |
| Matcher matcher = copyright.matcher(cb); |
| while (matcher.find()) { |
| MatchResult mr = matcher.toMatchResult(); |
| int numBuilt = 0; // track number of matches added to the builder |
| for (int i = 1; i <= mr.groupCount(); i++) { // group 0 is entire match not a specific group |
| String license = normalizeLicense(mr.group(i)); |
| if (license == null || license.trim().isEmpty() || isExcluded(license)) { |
| continue; |
| } |
| String owner = normalizeOwner(license); |
| if (isForbiddenLicense(license)) { |
| builder.add( |
| new Match( |
| PartyType.FORBIDDEN, |
| MatchType.LICENSE, |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| numLicenses++; |
| } else if (isThirdPartyLicense(license)) { |
| builder.add( |
| new Match( |
| PartyType.THIRD_PARTY, |
| MatchType.LICENSE, |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| numLicenses++; |
| } else if (isFirstPartyLicense(license)) { |
| builder.add( |
| new Match( |
| PartyType.FIRST_PARTY, |
| MatchType.LICENSE, |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| numLicenses++; |
| } else if (i <= numLicenseGroups) { // first 2 or 3 groups are licenses |
| builder.add( |
| new Match( |
| PartyType.UNKNOWN, // unknown licenses classified as unknown |
| MatchType.LICENSE, |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| numLicenses++; |
| } else if (license.toLowerCase().contains("license") |
| || license.toLowerCase().contains("licence")) { |
| builder.add( |
| new Match( |
| PartyType.UNKNOWN, // unknown licenses classified as unknown |
| MatchType.LICENSE, |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| numLicenses++; |
| } else if (isForbiddenOwner(owner)) { |
| builder.add( |
| new Match( |
| PartyType.FORBIDDEN, |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| } else if (isThirdPartyOwner(owner)) { |
| builder.add( |
| new Match( |
| PartyType.THIRD_PARTY, |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| } else if (isFirstPartyOwner(owner)) { |
| builder.add( |
| new Match( |
| PartyType.FIRST_PARTY, |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| } else { // remainder of groups are owner/author copyrights |
| builder.add( |
| new Match( |
| PartyType.THIRD_PARTY, // unknown authors classified as third party. |
| normalizeLicense(mr.group()), |
| source.getLineNumber(mr.start(i)), |
| source.getLineNumber(mr.end(i)), |
| mr.start(i), |
| mr.end(i))); |
| } |
| numBuilt++; |
| } |
| // If no capture group has content, the entire match is a word from an unknown contract. |
| // Don't bother accumulating unknown contract matches after known patterns detected. |
| if (numLicenses == 0 && numBuilt == 0 && numUnknown <= MATCH_THRESHOLD) { |
| String license = normalizeLicense(mr.group()); |
| if (license.matches("(?i)no copyright(?:able)?.*")) { // exclude negated match |
| continue; |
| } |
| if (isExcluded(license)) { |
| continue; |
| } |
| if (license.matches( // exclude common implementation comments using the word `by` |
| "(?i:required|return|allocated|allowed|generated|provided|raised|understandable" |
| + "|used) by .*")) {} |
| int startLine = source.getLineNumber(mr.start()); |
| int endLine = source.getLineNumber(mr.end()); |
| String owner = normalizeOwner(license); |
| if (isForbiddenLicense(license)) { |
| builder.add( |
| new Match( |
| PartyType.FORBIDDEN, |
| MatchType.LICENSE, |
| license, |
| startLine, |
| endLine, |
| mr.start(), |
| mr.end())); |
| numBuilt++; |
| continue; |
| } else if (isThirdPartyLicense(license)) { |
| builder.add( |
| new Match( |
| PartyType.THIRD_PARTY, |
| MatchType.LICENSE, |
| license, |
| startLine, |
| endLine, |
| mr.start(), |
| mr.end())); |
| numBuilt++; |
| continue; |
| } else if (isFirstPartyLicense(license)) { |
| builder.add( |
| new Match( |
| PartyType.FIRST_PARTY, |
| MatchType.LICENSE, |
| license, |
| startLine, |
| endLine, |
| mr.start(), |
| mr.end())); |
| numBuilt++; |
| continue; |
| } else if (isForbiddenOwner(owner)) { |
| builder.add( |
| new Match(PartyType.FORBIDDEN, license, startLine, endLine, mr.start(), mr.end())); |
| numBuilt++; |
| continue; |
| } else if (isThirdPartyOwner(owner)) { |
| builder.add( |
| new Match(PartyType.THIRD_PARTY, license, startLine, endLine, mr.start(), mr.end())); |
| numBuilt++; |
| continue; |
| } else if (isFirstPartyOwner(owner)) { |
| builder.add( |
| new Match(PartyType.FIRST_PARTY, license, startLine, endLine, mr.start(), mr.end())); |
| numBuilt++; |
| continue; |
| } |
| Match priorMatch = !unknowns.isEmpty() ? Iterables.getLast(unknowns) : null; |
| // If close to an earlier match (within 6 lines or 300 chars), extend the match to include |
| // the new word. |
| if (priorMatch != null |
| && (startLine - priorMatch.endLine < 6 || mr.start() - priorMatch.end < 300)) { |
| priorMatch.text = priorMatch.text + "..." + license; |
| priorMatch.endLine = endLine; |
| priorMatch.end = mr.end(); |
| } else { |
| // Otherwise, create a new match. |
| if (numUnknown < MATCH_THRESHOLD) { |
| unknowns.add( |
| new Match( |
| PartyType.UNKNOWN, |
| MatchType.LICENSE, |
| license, |
| startLine, |
| endLine, |
| mr.start(), |
| mr.end())); |
| } |
| numUnknown++; |
| } |
| } |
| // Stop the search early if enough known patterns already matched. |
| if (numBuilt >= MATCH_THRESHOLD) { |
| break; |
| } |
| } |
| // Return unknown contracts only when found and no known patterns matched. |
| if (numLicenses == 0) { |
| builder.addAll(unknowns); |
| } |
| return builder.build(); |
| } |
| |
| /** |
| * Constructs the search pattern incorporating the known matches into the generic regular |
| * expression. |
| * |
| * <p>The first 2 or 3 match groups correspond to license matches. If the configuration specifies |
| * known license patterns (1p, 3p or forbidden), the 1st match group will include these matches. |
| * |
| * <p>If the configuration specifies no known license patterns, the 1st and 2nd match groups will |
| * include matches to the generic license pattern. Otherwise, the 2nd and 3rd match groups will |
| * include these. |
| * |
| * <p>Subsequent match groups are all copyright author/owner matches. |
| * |
| * <p>The arbitrary contract words expression uses a non-capturing group. If none of the other |
| * match groups contain any content, the entire match is treated as an unknown license word. |
| * |
| * @return the composite pattern compiled with the CASE_INSENSITIVE, MULTILINE, UNICODE_CASE and |
| * DOTALL flags. |
| */ |
| private Pattern buildPattern() { |
| // Alternation of all contract word expressions. |
| StringBuilder words = new StringBuilder(); |
| for (Pattern word : contractWords) { |
| if (words.length() > 0) { |
| words.append('|'); |
| } |
| words.append(word); |
| } |
| |
| // Owner sub-expression: optional "by" and "the" prefixes followed by a single capture group |
| // listing the known owners first and the generic sequence of names as the last alternative. |
| StringBuilder owners = new StringBuilder(); |
| owners.append("(?:by"); |
| owners.append(WS); |
| owners.append("{1,msl})?(?:the"); |
| owners.append(WS); |
| owners.append("{1,msl})?("); // owner expression always captured here |
| for (Pattern owner : thirdPartyOwners) { |
| // Drop a leading or trailing ".*" / ".+" before embedding the known pattern. |
| String s = owner.toString(); |
| int start = s.startsWith(".*") || s.startsWith(".+") ? 2 : 0; |
| int end = s.endsWith(".*") || s.endsWith(".+") ? s.length() - 2 : s.length(); |
| owners.append(owner.toString().substring(start, end)); |
| owners.append('|'); |
| } |
| for (Pattern owner : firstPartyOwners) { |
| String s = owner.toString(); |
| int start = s.startsWith(".*") || s.startsWith(".+") ? 2 : 0; |
| int end = s.endsWith(".*") || s.endsWith(".+") ? s.length() - 2 : s.length(); |
| owners.append(owner.toString().substring(start, end)); |
| owners.append('|'); |
| } |
| for (Pattern owner : forbiddenOwners) { |
| String s = owner.toString(); |
| int start = s.startsWith(".*") || s.startsWith(".+") ? 2 : 0; |
| int end = s.endsWith(".*") || s.endsWith(".+") ? s.length() - 2 : s.length(); |
| owners.append(owner.toString().substring(start, end)); |
| owners.append('|'); |
| } |
| owners.append("(?:"); |
| owners.append(NAME); |
| owners.append("(?:"); |
| owners.append(WS); |
| owners.append("{1,msl}"); |
| owners.append(NAME); |
| owners.append("){0,mnr}))"); // end of owner capture |
| |
| // One of the frequent objections to regular expressions is the objection that long or complex |
| // expressions are difficult to read, and they are. Avoid changes to the expressions below. If |
| // given a choice between making a change below or adding a few "known owner"/"known license" |
| // patterns to the configuration, bias toward configuration. |
| // |
| // If that is not possible, one of the most difficult tasks when maintaining these expressions |
| // is balancing the parentheses and braces at the appropriate parts. The author of the below |
| // expression added a System.err.println() statement to output: |
| // pattern.toString().replaceall("([(](?:[?][:])?)", "$1\n").replaceall("[)]", "\n$1") |
| // inserting newlines after opening parentheses and before closing parentheses. The output |
| // was then fed through an awk script to indent the nested expressions: |
| |
| /* awk ' |
| BEGIN { |
| p=""; |
| } |
| $0 ~ /^[)].*$/ { |
| p=substr(p,1, length(p)-2); |
| } |
| { |
| print p $0; |
| } |
| $0 ~ /[(]([?][:])?$/ { |
| p=p " "; |
| } |
| ' |
| */ |
| // From that output, it was possible to see where parentheses balanced and what changes to make |
| // to edit the expression correctly. Not for the fainthearted. |
| StringBuilder sb = new StringBuilder(); |
| |
| // Optional known licence capture. |
| if (!firstPartyLicenses.isEmpty() |
| || !thirdPartyLicenses.isEmpty() |
| || !forbiddenLicenses.isEmpty()) { |
| sb.append("("); // start of optional 1st captured match group |
| sb.append( |
| Streams.concat( |
| thirdPartyLicenses.stream(), |
| firstPartyLicenses.stream(), |
| forbiddenLicenses.stream()) |
| .map( |
| input -> { |
| if (input == null) { |
| return ""; |
| } |
| // Same outer-wildcard stripping as applied to the known owners. |
| String s = input.toString(); |
| int start = s.startsWith(".*") || s.startsWith(".+") ? 2 : 0; |
| int end = s.endsWith(".*") || s.endsWith(".+") ? s.length() - 2 : s.length(); |
| return input.toString().substring(start, end); |
| }) |
| .collect(Collectors.joining("|"))); |
| sb.append(")|"); // end of optional 1st captured group and | to introduce 2nd captured group. |
| } |
| |
| // Other license captures. -- ends with License |
| sb.append("(?:is"); // not captured -- helps confirm license but interferes with matching 1p,3p |
| sb.append(WS); |
| sb.append("{1,msl}(?:distributed|provided)"); |
| sb.append(WS); |
| sb.append("{1,msl}under(?:"); |
| sb.append(WS); |
| sb.append("{1,msl}(?:the|this))?"); |
| sb.append(WS); |
| sb.append("{1,msl}((?:"); // start of 1st or 2nd captured match group |
| sb.append(NAME); |
| sb.append(WS); |
| sb.append( |
| "{1,msl}){2,mnr}?licen[cs]e))[,.;]{0,3}(?![:])"); // end of 1st or 2nd captured match group |
| |
| // Other license captures. -- Line starting with License: |
| sb.append("|(?-ms:licen[cs]e:\\s{1,msl}("); // start of the 2nd or 3rd captured match group |
| sb.append(NAME); |
| sb.append("(?:\\s{1,msl}"); |
| sb.append(NAME); |
| sb.append("){0,mnr})\\n)"); // end of 2nd or 3rd captured match group |
| |
| // "Author is" copyright capture. |
| sb.append("|\\b(?:(?:the"); // not captured--helps confirm but interferes with 1p, 3p, forbidden |
| sb.append(WS); |
| sb.append("{1,msl}author"); |
| sb.append(WS); |
| sb.append("{1,msl}of"); |
| sb.append(WS); |
| sb.append("{1,msl}this"); |
| sb.append(WS); |
| sb.append("{1,msl}software"); |
| sb.append(WS); |
| sb.append("{1,msl}is|\\b(?:(?:principal"); |
| sb.append(WS); |
| sb.append("{1,msl})?author:?))"); |
| sb.append(WS); |
| sb.append("{1,msl}"); |
| sb.append(owners.toString()); // owner pattern includes capture group |
| sb.append(")"); |
| |
| // Copyright+year(s)+owner copyright capture. |
| // NOTE(review): the copyright-symbol alternations below list the same © character twice. |
| // Possibly one alternative was meant to be an HTML entity such as &copy; -- confirm. |
| sb.append("|(?:"); // not captured -- helps confirm but interferes with 1p, 3p, forbidden |
| sb.append(WS); |
| sb.append("{0,msl}(?:[(]c[)]|©|©)"); |
| sb.append(WS); |
| sb.append("{0,msl})?(?:(?:copy(?:right|left)(?:"); |
| sb.append(WS); |
| sb.append("{1,msl}notice)?(?:"); |
| sb.append(WS); |
| sb.append("{0,msl}(?:[(]c[)]|©|©))?)|(?:[(]c[)]|©|©))"); |
| sb.append(WS); |
| sb.append("{1,msl}(?:"); |
| sb.append("[\\p{N}]{2,4}(?:"); // year(s)+owner |
| sb.append(WSPCT); |
| sb.append("{1,msl}(?:and"); |
| sb.append(WSPCT); |
| sb.append("{1,msl})?[\\p{N}]{2,4}){0,mdr}(?:"); // allows pre-y2k 2-digit years |
| sb.append(WSPCT); |
| sb.append("{1,msl}(?:present|now))?"); |
| sb.append(WSPCT); |
| sb.append("{1,msl}"); |
| sb.append(owners.toString()); // owner pattern includes capture group |
| sb.append("|"); // owner+year(s) |
| sb.append(owners.toString()); // owner pattern includes capture group |
| sb.append(WS); |
| sb.append("{1,msl}[\\p{N}]{2,4}(?:"); // allows pre-y2k 2-digit years |
| sb.append(WSPCT); |
| sb.append("{1,msl}(?:and"); |
| sb.append(WSPCT); |
| sb.append("{1,msl})?[\\p{N}]{2,4}){0,mdr}(?:"); // allows pre-y2k 2-digit years |
| sb.append(WSPCT); |
| sb.append("{1,msl}(?:present|now))?"); |
| sb.append(")(?:(?:portions)?"); |
| sb.append(WS); |
| sb.append("{0,msl}(?:[(]c[)]|©|©)?"); |
| sb.append(WS); |
| sb.append("{1,msl}copy(?:right|left)(?:"); |
| sb.append(WS); |
| sb.append("{0,msl}(?:[(]c[)]|©|©))?"); |
| sb.append(WS); |
| sb.append("{1,msl}(?:"); |
| sb.append("[\\p{N}]{2,4}(?:"); // year(s)+owner |
| sb.append(WSPCT); |
| sb.append("{1,msl}(?:and"); |
| sb.append(WSPCT); |
| sb.append("{1,msl})?[\\p{N}]{2,4}){0,mdr}(?:"); // allows pre-y2k 2-digit years |
| sb.append(WSPCT); |
| sb.append("{1,msl}(?:present|now))?"); |
| sb.append(WSPCT); |
| sb.append("{1,msl}"); |
| sb.append(owners.toString()); // owner pattern (repeated) includes capture group |
| sb.append("|"); // owner+year(s) |
| sb.append(owners.toString()); // owner pattern (repeated) includes capture group |
| sb.append(WS); |
| sb.append("{1,msl}[\\p{N}]{2,4}(?:"); // allows pre-y2k 2-digit years |
| sb.append(WSPCT); |
| sb.append("{1,msl}(?:and"); |
| sb.append(WSPCT); |
| sb.append("{1,msl})?[\\p{N}]{2,4}){0,mdr}(?:"); // allows pre-y2k 2-digit years |
| sb.append(WSPCT); |
| sb.append("{1,msl}(?:present|now))?"); |
| sb.append(")){0,5}"); // captures 0 to 5 additional author/owner declarations |
| |
| // Detect contract words to detect unknown licenses. |
| sb.append("|(?:(?:\\b|\\p{Pi})(?:"); // unknown licenses use non-capturing group |
| sb.append(words); |
| sb.append(")(?:"); |
| sb.append(WS); |
| sb.append("(?:"); |
| sb.append(words); |
| sb.append(")){0,mnr}(?:\\b|[,.;:\\p{Pf}]))"); |
| |
| // Replace the symbolic quantifier upper bounds (mnl, msl, mnr, mdr) with the configured maxima |
| // before compiling. |
| return Pattern.compile( |
| sb.toString() |
| .replaceAll("[,]mnl[}]", "," + MAX_NAME_LENGTH + "}") |
| .replaceAll("[,]msl[}]", "," + MAX_SPACE_LENGTH + "}") |
| .replaceAll("[,]mnr[}]", "," + MAX_NAME_REPETITION + "}") |
| .replaceAll("[,]mdr[}]", "," + MAX_DATE_REPETITION + "}"), |
| Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.UNICODE_CASE | Pattern.DOTALL); |
| } |
| |
| /** Returns true when any configured exclude pattern is found anywhere within `match`. */ |
| private boolean isExcluded(String match) { |
|   return excludePatterns.stream().anyMatch(exclude -> exclude.matcher(match).find()); |
| } |
| |
| /** Returns true when non-empty `owner` is matched in full by a known first party owner. */ |
| private boolean isFirstPartyOwner(String owner) { |
|   boolean hasOwner = owner != null && !owner.isEmpty(); |
|   return hasOwner |
|       && firstPartyOwners.stream().anyMatch(known -> known.matcher(owner).matches()); |
| } |
| |
| /** Returns true when non-empty `owner` is matched in full by a known forbidden owner. */ |
| private boolean isForbiddenOwner(String owner) { |
|   boolean hasOwner = owner != null && !owner.isEmpty(); |
|   return hasOwner |
|       && forbiddenOwners.stream().anyMatch(known -> known.matcher(owner).matches()); |
| } |
| |
| /** Returns true when `owner` matches any known third party owner. */ |
| private boolean isThirdPartyOwner(String owner) { |
| if (owner == null || owner.isEmpty()) { |
| return false; |
| } |
| for (Pattern p : thirdPartyOwners) { |
| if (p.matcher(owner).matches()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** Returns true when `license` matches any known first party license. */ |
| private boolean isFirstPartyLicense(String license) { |
| for (Pattern p : firstPartyLicenses) { |
| if (p.matcher(license).matches()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** Returns true when `license` matches any known forbidden license. */ |
| private boolean isForbiddenLicense(String license) { |
| for (Pattern p : forbiddenLicenses) { |
| if (p.matcher(license).matches()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** Returns true when `license` matches any known third party license. */ |
| private boolean isThirdPartyLicense(String license) { |
| for (Pattern p : thirdPartyLicenses) { |
| if (p.matcher(license).matches()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Converts a known matching pattern written in a simplified regular expression language into a |
| * regular expression treating comment characters as whitespace and replacing unlimited wildcard |
| * expressions with expressions using a limited set of characters and a limited quantifier. |
| */ |
| private static Pattern patternizeKnownMatch(String match) { |
| Preconditions.checkNotNull(match); |
| Preconditions.checkArgument(!match.isEmpty(), "Non-empty pattern required."); |
| // Disallow capture groups which will interfere with 1p, 3p, or forbidden classification. |
| Preconditions.checkArgument( |
| !match.matches("(?:^|.*[^\\[])[(][^?](?:[^:].*|$)"), |
| "Capturing group found in /" + match + "/. Use non-capturing (?:...) instead of (...)."); |
| // Disallow spaces inside character classes because they will get replaced. |
| Preconditions.checkArgument( |
| !match.matches(".*\\[[^]]*\\s[]].*"), |
| "Character class with space in /" + match + "/. Use (?: |...) instead of space in [...]."); |
| // Replace unlimited "any char" wildcards that can cost too much backtracking with patterns that |
| // match a smaller subset of characters with more limited quantifiers. |
| // |
| // Replace any sequence of whitespace with a regular expression to match any non-empty sequence |
| // of whitespace or comment characters. |
| String prefix = ""; |
| if (match.startsWith(".*")) { |
| prefix = ".*"; |
| } else if (match.startsWith(".+")) { |
| prefix = ".*"; |
| } |
| String suffix = ""; |
| if (match.endsWith(".*")) { |
| suffix = ".*"; |
| } else if (match.endsWith(".+")) { |
| suffix = ".*"; |
| } |
| return Pattern.compile( |
| prefix |
| + match |
| .substring(prefix.length(), match.length() - suffix.length()) |
| .replaceAll( |
| "[.][*]", |
| ("(?: " |
| + ANY_CHAR |
| + "{1," |
| + MAX_NAME_LENGTH |
| + "}){0," |
| + MAX_NAME_REPETITION |
| + "}") |
| .replace("\\", "\\\\")) |
| .replaceAll( |
| "[.][+]", |
| ("(?: " |
| + ANY_CHAR |
| + "{1," |
| + MAX_NAME_LENGTH |
| + "}){1," |
| + MAX_NAME_REPETITION |
| + "}") |
| .replace("\\", "\\\\")) |
| .replaceAll("\\s+[?]", WS.replace("\\", "\\\\") + "{0," + MAX_SPACE_LENGTH + "}") |
| .replaceAll("\\s+", WS.replace("\\", "\\\\") + "{1," + MAX_SPACE_LENGTH + "}") |
| + suffix, |
| Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.UNICODE_CASE | Pattern.DOTALL); |
| } |
| |
| /** |
| * Replaces sequences of whitespace and comment characters with a single space preserving URLs, |
| * which often contain `/` or `#` as non-comment characters. |
| */ |
| private static String normalizeLicense(String match) { |
| if (match == null) { |
| return null; |
| } |
| StringBuilder sb = new StringBuilder(); |
| Matcher m = Pattern.compile(URL).matcher(match); |
| int nextIndex = 0; |
| while (m.find()) { |
| int start = m.start(); |
| if (nextIndex < start) { |
| sb.append(match.substring(nextIndex, start).replaceAll(WS + "+", " ")); |
| } |
| sb.append(m.group()); |
| nextIndex = m.end(); |
| } |
| if (nextIndex < match.length()) { |
| sb.append(match.substring(nextIndex).replaceAll(WS + "+", " ")); |
| } |
| return sb.toString().trim(); |
| } |
| |
| /** |
| * Strips common non-author/owner suffixes that get picked up unintentionally from previously |
| * normalized license with sequences of whitespace and comment characters replaced with a single |
| * space preserving URLS, which often contain `/` or `#` as non-comment characters. |
| * |
| * <p>The generic license pattern always ends by matching the word `license` or stops at the end |
| * of the line so it does not pick up spurious additional text. The generic owner pattern does not |
| * end in a specific word so it often includes spurious additional words like #ifdef or #ifndef, |
| * which interfere when comparing the match against known author/owner patterns. This method |
| * strips the most common non-author/owner words from the end of the match. |
| */ |
| private static String normalizeOwner(String license) { |
| if (license == null) { |
| return null; |
| } |
| return license |
| .split( |
| "(?i)[ ](?:all rights|(?:the|this) [^ ]+(?: [^ ]+){0,2} (?:is|assumes|may)" |
| + "|permission|copyright|version \\p{N}|for conditions|include |include$" |
| + "|modification|however|open source license|please (?:use|read)|libname" |
| + "|if defined|usage|this is free|added|generic|redistribution|ifdef|ifndef" |
| + "|for (?:more|terms)|copying and|you (?:may|can)|released under|see the" |
| + "|full source|freedom to use|this program and|distributed|https?|unit ?test" |
| + "|import|static|by obtaining|by using|by copying|example|namespace|config\\b" |
| + "|public (?:static|final|class)|package (?:org|com)|[^ ]+ is hereby)")[0]; |
| } |
| |
/** Classifies the relevant party of a match: 1p, 3p, unknown, or forbidden. */
public enum PartyType {
  /** First party (1p). */
  FIRST_PARTY,
  /** Third party (3p). */
  THIRD_PARTY,
  /** Party could not be identified. */
  UNKNOWN,
  /** Forbidden party. */
  FORBIDDEN
}
| |
/** Records which pattern produced a match: the author/owner pattern or the license pattern. */
public enum MatchType {
  /** Text matched by the author/owner pattern. */
  AUTHOR_OWNER,
  /** Text matched by the license pattern. */
  LICENSE
}
| |
| /** |
| * Describes a copyright author/owner or license `text` match found in the input stream. |
| * |
| * <p>Identifies the relevant party as `FIRST_PARTY`, `THIRD_PARTY`, `FORBIDDEN`, or `UNKNOWN`. |
| * |
| * <p>Identifies the match as `AUTHOR_OWNER` or `LICENSE`. |
| * |
| * <p>Includes a normalized version of the matched text including where it was found in the file. |
| */ |
| public static class Match { |
| /** Classifies relevant party as 1p, 3p, forbidden, or unknown. */ |
| public PartyType partyType; |
| /** Classifies match as author/owner or as license. */ |
| public MatchType matchType; |
| /** Matched text with spaces and comment characters replaced by a single space. */ |
| public String text; |
| /** The line number in the file where the match starts. */ |
| public int startLine; |
| /** The line number in the file where the match ends. */ |
| public int endLine; |
| /** The character offset into the file where the match starts. */ |
| public int start; |
| /** The character offset into the file where the match ends. */ |
| public int end; |
| |
| public Match(PartyType partyType, String text, int startLine, int endLine, int start, int end) { |
| this(partyType, MatchType.AUTHOR_OWNER, text, startLine, endLine, start, end); |
| } |
| |
| public Match( |
| PartyType partyType, |
| MatchType matchType, |
| String text, |
| int startLine, |
| int endLine, |
| int start, |
| int end) { |
| this.partyType = partyType; |
| this.matchType = matchType; |
| this.text = text; |
| this.startLine = startLine; |
| this.endLine = endLine; |
| this.start = start; |
| this.end = end; |
| } |
| } |
| } |