Check jgit RawText#isBinary before MimeTypes

JGit's isBinary function implements a simple heuristic: it considers
a file binary if and only if one of the first 8000 bytes is NUL.
This is the same heuristic used to decide whether to show a diff as
text.

Keep the MimeTypes-based check so that users can continue to exclude
some file types from the text-based checks. A follow-up change will
allow whitelisting some files or commits using notes in the
refs/notes/uploadvalidator ref, allowing the MimeTypes check to be
dropped.

Change-Id: I2726af4bf3235ee7a03cc06d88c9877f7081ae4b
diff --git a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidator.java b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidator.java
index 975a6cf..add3731 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidator.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidator.java
@@ -39,6 +39,7 @@
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
 
+import org.eclipse.jgit.diff.RawText;
 import org.eclipse.jgit.lib.ObjectId;
 import org.eclipse.jgit.lib.ObjectLoader;
 import org.eclipse.jgit.lib.Repository;
@@ -47,6 +48,7 @@
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.text.MessageFormat;
@@ -161,8 +163,10 @@
     Map<String, ObjectId> content = CommitUtils.getChangedContent(repo, c, revWalk);
     for (String path : content.keySet()) {
       ObjectLoader ol = repo.open(content.get(path));
-      if (contentTypeUtil.isBinary(ol, path, cfg)) {
-        continue;
+      try (InputStream in = ol.openStream()) {
+        if (RawText.isBinary(in) || contentTypeUtil.isBlacklistedBinaryContentType(ol, path, cfg)) {
+          continue;
+        }
       }
       checkFileForBlockedKeywords(blockedKeywordPartterns, messages, path, ol);
     }
diff --git a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
index a0f8ea4..2fe6479 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
@@ -77,7 +77,7 @@
     this.patternCache = patternCache;
   }
 
-  public boolean isBinary(ObjectLoader ol, String pathname, PluginConfig cfg)
+  public boolean isBlacklistedBinaryContentType(ObjectLoader ol, String pathname, PluginConfig cfg)
       throws IOException, ExecutionException {
     try (InputStream is = ol.openStream()) {
       return matchesAny(getContentType(is, pathname), getBinaryTypes(cfg));
diff --git a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidator.java b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidator.java
index 00dc478..7f5d79f 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidator.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidator.java
@@ -31,6 +31,7 @@
 import com.google.inject.AbstractModule;
 import com.google.inject.Inject;
 
+import org.eclipse.jgit.diff.RawText;
 import org.eclipse.jgit.lib.ObjectId;
 import org.eclipse.jgit.lib.ObjectLoader;
 import org.eclipse.jgit.lib.Repository;
@@ -38,6 +39,7 @@
 import org.eclipse.jgit.revwalk.RevWalk;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Collections;
@@ -133,8 +135,10 @@
     Map<String, ObjectId> content = CommitUtils.getChangedContent(repo, c, revWalk);
     for (String path : content.keySet()) {
       ObjectLoader ol = repo.open(content.get(path));
-      if (contentTypeUtil.isBinary(ol, path, cfg)) {
-        continue;
+      try (InputStream in = ol.openStream()) {
+        if (RawText.isBinary(in) || contentTypeUtil.isBlacklistedBinaryContentType(ol, path, cfg)) {
+          continue;
+        }
       }
       try (InputStreamReader isr = new InputStreamReader(ol.openStream(), StandardCharsets.UTF_8)) {
         if (doesInputStreanContainCR(isr)) {
diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md
index 04c04a1..c85039c 100644
--- a/src/main/resources/Documentation/config.md
+++ b/src/main/resources/Documentation/config.md
@@ -76,6 +76,9 @@
 	Using this option it is possible to configure which content types are
 	considered binary types.
 
+	If there is a NUL byte in the first 8000 bytes, the file is considered
+	binary regardless of this setting.
+
 	To detect content types [Apache Tika library][2] is used.
 
 	Content type can be specified as a string, wildcard or a regular expression,