Check jgit RawText#isBinary before MimeTypes
JGit's isBinary function implements a simple heuristic: it considers
a file binary if and only if one of the first 8000 bytes is NUL.
This is the same heuristic used to decide whether to show a diff as
text.
Keep the MimeTypes-based check so that users can continue to exclude
some file types from the text-based checks. A follow-up change will
allow whitelisting some files or commits using notes in the
refs/notes/uploadvalidator ref, allowing the MimeTypes check to be
dropped.
Change-Id: I2726af4bf3235ee7a03cc06d88c9877f7081ae4b
diff --git a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidator.java b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidator.java
index 975a6cf..add3731 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidator.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidator.java
@@ -39,6 +39,7 @@
import com.google.inject.Inject;
import com.google.inject.name.Named;
+import org.eclipse.jgit.diff.RawText;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.Repository;
@@ -47,6 +48,7 @@
import java.io.BufferedReader;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.MessageFormat;
@@ -161,8 +163,10 @@
Map<String, ObjectId> content = CommitUtils.getChangedContent(repo, c, revWalk);
for (String path : content.keySet()) {
ObjectLoader ol = repo.open(content.get(path));
- if (contentTypeUtil.isBinary(ol, path, cfg)) {
- continue;
+ try (InputStream in = ol.openStream()) {
+ if (RawText.isBinary(in) || contentTypeUtil.isBlacklistedBinaryContentType(ol, path, cfg)) {
+ continue;
+ }
}
checkFileForBlockedKeywords(blockedKeywordPartterns, messages, path, ol);
}
diff --git a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
index a0f8ea4..2fe6479 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
@@ -77,7 +77,7 @@
this.patternCache = patternCache;
}
- public boolean isBinary(ObjectLoader ol, String pathname, PluginConfig cfg)
+ public boolean isBlacklistedBinaryContentType(ObjectLoader ol, String pathname, PluginConfig cfg)
throws IOException, ExecutionException {
try (InputStream is = ol.openStream()) {
return matchesAny(getContentType(is, pathname), getBinaryTypes(cfg));
diff --git a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidator.java b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidator.java
index 00dc478..7f5d79f 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidator.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidator.java
@@ -31,6 +31,7 @@
import com.google.inject.AbstractModule;
import com.google.inject.Inject;
+import org.eclipse.jgit.diff.RawText;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.Repository;
@@ -38,6 +39,7 @@
import org.eclipse.jgit.revwalk.RevWalk;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
@@ -133,8 +135,10 @@
Map<String, ObjectId> content = CommitUtils.getChangedContent(repo, c, revWalk);
for (String path : content.keySet()) {
ObjectLoader ol = repo.open(content.get(path));
- if (contentTypeUtil.isBinary(ol, path, cfg)) {
- continue;
+ try (InputStream in = ol.openStream()) {
+ if (RawText.isBinary(in) || contentTypeUtil.isBlacklistedBinaryContentType(ol, path, cfg)) {
+ continue;
+ }
}
try (InputStreamReader isr = new InputStreamReader(ol.openStream(), StandardCharsets.UTF_8)) {
if (doesInputStreanContainCR(isr)) {
diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md
index 04c04a1..c85039c 100644
--- a/src/main/resources/Documentation/config.md
+++ b/src/main/resources/Documentation/config.md
@@ -76,6 +76,9 @@
Using this option it is possible to configure which content types are
considered binary types.
+ If there is a NUL byte in the first 8000 bytes, the file is considered
+ binary regardless of this setting.
+
To detect content types [Apache Tika library][2] is used.
Content type can be specified as a string, wildcard or a regular expression,