Revert "Remove Tika dependency for uploadvalidator"

This reverts commit be977bde860f755b7002019f561f6b16265aeab6.

With change [1], Tika was removed in favor of the default
MimeUtils2 library for detecting mime types. This leads
to a regression where plain ASCII files without a (known) file
extension are falsely identified as "application/octet-stream"
and therefore result in rejected changes.

This change fixes this by reintroducing Tika to restore
previous behavior. However, I'm not sure if this is the
right approach since the issue probably also could affect
Gerrit core which uses MimeUtils2. Since MimeUtils2 is
not under active maintenance, I'm not sure how feasible
a fix would be there.

The following three cases were identified:
cat << EOF > testfile
TEST
EOF

should be: text/plain

cat << EOF > testfile.csv
TEST
EOF

should be: text/csv

cat << EOF > testfile.y
TEST
EOF

should be: text/x-c or text/plain

[1] https://gerrit.googlesource.com/plugins/uploadvalidator/+/be977bde860f755b7002019f561f6b16265aeab6%5E%21/

Issue: 12706
Change-Id: I75d8b85d7be0e11dffeee410d4b3ed3c5dcfc08c
diff --git a/BUILD b/BUILD
index 4a6aab6..2d26f72 100644
--- a/BUILD
+++ b/BUILD
@@ -12,12 +12,16 @@
         "Gerrit-HttpModule: com.googlesource.gerrit.plugins.uploadvalidator.HttpModule",
     ],
     resources = glob(["src/main/resources/**/*"]),
+    deps = [
+        "@tika-core//jar",
+    ],
 )
 
 TEST_SRCS = "src/test/java/**/*Test.java"
 
 TEST_DEPS = PLUGIN_DEPS + PLUGIN_TEST_DEPS + [
-    "@commons_io//jar",
+    "@commons-io//jar",
+    "@tika-core//jar",
     ":uploadvalidator__plugin",
 ]
 
diff --git a/external_plugin_deps.bzl b/external_plugin_deps.bzl
index 93746e1..65cd5e6 100644
--- a/external_plugin_deps.bzl
+++ b/external_plugin_deps.bzl
@@ -2,7 +2,12 @@
 
 def external_plugin_deps():
     maven_jar(
-        name = "commons_io",
+        name = "commons-io",
         artifact = "commons-io:commons-io:1.4",
         sha1 = "a8762d07e76cfde2395257a5da47ba7c1dbd3dce",
     )
+    maven_jar(
+        name = "tika-core",
+        artifact = "org.apache.tika:tika-core:1.12",
+        sha1 = "5ab95580d22fe1dee79cffbcd98bb509a32da09b",
+    )
diff --git a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
index ddb18db..67ae289 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtil.java
@@ -22,7 +22,6 @@
 import com.google.gerrit.extensions.api.projects.ProjectConfigEntryType;
 import com.google.gerrit.server.config.PluginConfig;
 import com.google.gerrit.server.config.ProjectConfigEntry;
-import com.google.gerrit.server.mime.FileTypeRegistry;
 import com.google.inject.AbstractModule;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
@@ -30,6 +29,10 @@
 import java.io.InputStream;
 import java.util.concurrent.ExecutionException;
 import java.util.regex.Pattern;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
 import org.eclipse.jgit.lib.ObjectLoader;
 
 public class ContentTypeUtil {
@@ -65,24 +68,24 @@
   }
 
   private final LoadingCache<String, Pattern> patternCache;
-  private final FileTypeRegistry mimeUtil;
+  private final Tika tika = new Tika(TikaConfig.getDefaultConfig());
 
   @Inject
-  ContentTypeUtil(
-      @Named(CACHE_NAME) LoadingCache<String, Pattern> patternCache, FileTypeRegistry mimeUtil) {
+  ContentTypeUtil(@Named(CACHE_NAME) LoadingCache<String, Pattern> patternCache) {
     this.patternCache = patternCache;
-    this.mimeUtil = mimeUtil;
   }
 
   public boolean isBlacklistedBinaryContentType(ObjectLoader ol, String pathname, PluginConfig cfg)
       throws IOException, ExecutionException {
-    return matchesAny(getContentType(ol, pathname), getBinaryTypes(cfg));
+    try (InputStream is = ol.openStream()) {
+      return matchesAny(getContentType(is, pathname), getBinaryTypes(cfg));
+    }
   }
 
-  public String getContentType(ObjectLoader ol, String pathname) throws IOException {
-    try (InputStream is = ol.openStream()) {
-      return mimeUtil.getMimeType(pathname, is).toString();
-    }
+  public String getContentType(InputStream is, String pathname) throws IOException {
+    Metadata metadata = new Metadata();
+    metadata.set(Metadata.RESOURCE_NAME_KEY, pathname);
+    return tika.detect(TikaInputStream.get(is), metadata);
   }
 
   @VisibleForTesting
diff --git a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeValidator.java b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeValidator.java
index 1cc5fd4..0d8f2e4 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeValidator.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeValidator.java
@@ -38,6 +38,7 @@
 import java.util.concurrent.ExecutionException;
 import org.eclipse.jgit.lib.ObjectId;
 import org.eclipse.jgit.lib.ObjectLoader;
+import org.eclipse.jgit.lib.ObjectStream;
 import org.eclipse.jgit.lib.Repository;
 import org.eclipse.jgit.revwalk.RevCommit;
 import org.eclipse.jgit.revwalk.RevWalk;
@@ -153,13 +154,15 @@
     List<CommitValidationMessage> messages = new LinkedList<>();
     Map<String, ObjectId> content = CommitUtils.getChangedContent(repo, c, revWalk);
     for (String path : content.keySet()) {
-      ObjectLoader ol = revWalk.getObjectReader().open(content.get(path));
-      String contentType = contentTypeUtil.getContentType(ol, path);
-      if ((contentTypeUtil.matchesAny(contentType, blockedTypes) && !whitelist)
-          || (!contentTypeUtil.matchesAny(contentType, blockedTypes) && whitelist)) {
-        messages.add(
-            new CommitValidationMessage(
-                "found blocked content type (" + contentType + ") in file: " + path, true));
+      ObjectLoader ol = repo.open(content.get(path));
+      try (ObjectStream os = ol.openStream()) {
+        String contentType = contentTypeUtil.getContentType(os, path);
+        if ((contentTypeUtil.matchesAny(contentType, blockedTypes) && !whitelist)
+            || (!contentTypeUtil.matchesAny(contentType, blockedTypes) && whitelist)) {
+          messages.add(
+              new CommitValidationMessage(
+                  "found blocked content type (" + contentType + ") in file: " + path, true));
+        }
       }
     }
     return messages;
diff --git a/src/main/resources/Documentation/build.md b/src/main/resources/Documentation/build.md
index 3aafe70..c41f14e 100644
--- a/src/main/resources/Documentation/build.md
+++ b/src/main/resources/Documentation/build.md
@@ -60,6 +60,15 @@
   ln -s ../../@PLUGIN@ .
 ```
 
+Put the external dependency Bazel build file into the Gerrit /plugins
+directory, replacing the existing empty one.
+
+```
+  cd gerrit/plugins
+  rm external_plugin_deps.bzl
+  ln -s @PLUGIN@/external_plugin_deps.bzl .
+```
+
 From Gerrit source tree issue the command:
 
 ```
diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md
index 95c6775..4402118 100644
--- a/src/main/resources/Documentation/config.md
+++ b/src/main/resources/Documentation/config.md
@@ -109,7 +109,7 @@
     If there is a NUL byte in the first 8k then the file will be considered
     binary regardless of this setting.
 
-    To detect content types, the [MimeUtil2 library][2] is used.
+    To detect content types, the [Apache Tika library][2] is used.
 
     Content type can be specified as a string, wildcard or a regular expression,
     for example:
@@ -154,8 +154,8 @@
     This check does not run on [binary files][4]
 
 [1]: https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html
-[2]: https://mvnrepository.com/artifact/eu.medsea.mimeutil/mime-util
-[3]: https://gerrit.googlesource.com/gerrit/+/refs/heads/master/gerrit-server/src/main/resources/com/google/gerrit/server/mime/mime-types.properties
+[2]: https://tika.apache.org/
+[3]: https://tika.apache.org/1.12/formats.html#Full_list_of_Supported_Formats
 [4]: #binary_type
 
 plugin.@PLUGIN@.blockedContentType
@@ -164,7 +164,7 @@
     This check looks for blocked content types. If the check finds a
     blocked content type the push will be rejected.
 
-    To detect content types the [MimeUtil2 library][2] is used.
+    To detect content types, the [Apache Tika library][2] is used.
 
     Content type can be specified as a string, wildcard or a regular expression,
     for example:
diff --git a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidatorTest.java b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidatorTest.java
index 20f2e83..a22baa0 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidatorTest.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/BlockedKeywordValidatorTest.java
@@ -76,12 +76,7 @@
       RevCommit c = makeCommit(rw);
       BlockedKeywordValidator validator =
           new BlockedKeywordValidator(
-              null,
-              new ContentTypeUtil(PATTERN_CACHE, new FakeMimeUtilFileTypeRegistry()),
-              PATTERN_CACHE,
-              null,
-              null,
-              null);
+              null, new ContentTypeUtil(PATTERN_CACHE), PATTERN_CACHE, null, null, null);
       List<CommitValidationMessage> m =
           validator.performValidation(repo, c, rw, getPatterns().values(), EMPTY_PLUGIN_CONFIG);
       Set<String> expected =
diff --git a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtilTest.java b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtilTest.java
index a8e2713..91df4c6 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtilTest.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeUtilTest.java
@@ -18,19 +18,16 @@
 import static com.googlesource.gerrit.plugins.uploadvalidator.TestUtils.EMPTY_PLUGIN_CONFIG;
 import static com.googlesource.gerrit.plugins.uploadvalidator.TestUtils.PATTERN_CACHE;
 
-import com.google.gerrit.server.mime.MimeUtilFileTypeRegistry;
-import com.google.inject.Inject;
 import java.util.concurrent.ExecutionException;
 import org.junit.Before;
 import org.junit.Test;
 
 public class ContentTypeUtilTest {
   private ContentTypeUtil ctu;
-  @Inject private MimeUtilFileTypeRegistry mimeUtil;
 
   @Before
   public void setUp() {
-    ctu = new ContentTypeUtil(PATTERN_CACHE, mimeUtil);
+    ctu = new ContentTypeUtil(PATTERN_CACHE);
   }
 
   @Test
diff --git a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeValidatorTest.java b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeValidatorTest.java
index b57ef49..6477189 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeValidatorTest.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/ContentTypeValidatorTest.java
@@ -60,12 +60,7 @@
   @Before
   public void setUp() {
     validator =
-        new ContentTypeValidator(
-            null,
-            new ContentTypeUtil(PATTERN_CACHE, new FakeMimeUtilFileTypeRegistry()),
-            null,
-            null,
-            null);
+        new ContentTypeValidator(null, new ContentTypeUtil(PATTERN_CACHE), null, null, null);
   }
 
   @Test
diff --git a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidatorTest.java b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidatorTest.java
index 17209d2..e0b2b68 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidatorTest.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/InvalidLineEndingValidatorTest.java
@@ -53,11 +53,7 @@
       RevCommit c = makeCommit(rw);
       InvalidLineEndingValidator validator =
           new InvalidLineEndingValidator(
-              null,
-              new ContentTypeUtil(PATTERN_CACHE, new FakeMimeUtilFileTypeRegistry()),
-              null,
-              null,
-              null);
+              null, new ContentTypeUtil(PATTERN_CACHE), null, null, null);
       List<CommitValidationMessage> m =
           validator.performValidation(repo, c, rw, EMPTY_PLUGIN_CONFIG);
       assertThat(TestUtils.transformMessages(m))
diff --git a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/MimeTypeDetection.java b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/MimeTypeDetection.java
index 9140681..212806a 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/MimeTypeDetection.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/uploadvalidator/MimeTypeDetection.java
@@ -13,28 +13,21 @@
 // limitations under the License.
 
 package com.googlesource.gerrit.plugins.uploadvalidator;
-
-import static java.util.Comparator.comparing;
-
-import eu.medsea.mimeutil.MimeType;
-import eu.medsea.mimeutil.MimeUtil2;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
 
 class MimeTypeDetection {
-  public String getMimeType(String path, byte[] content) {
-    MimeUtil2 mimeUtil = new MimeUtil2();
+  public String getMimeType(String path, byte[] content) throws IOException {
+    Tika tika = new Tika(TikaConfig.getDefaultConfig());
 
-    Set<MimeType> mimeTypes = new HashSet<>();
-    mimeTypes.addAll(mimeUtil.getMimeTypes(content));
-    mimeTypes.addAll(mimeUtil.getMimeTypes(path));
+    Metadata metadata = new Metadata();
+    metadata.set(Metadata.RESOURCE_NAME_KEY, path);
 
-    if (mimeTypes.isEmpty()
-        || (mimeTypes.size() == 1 && mimeTypes.contains(MimeUtil2.UNKNOWN_MIME_TYPE))) {
-      return MimeUtil2.UNKNOWN_MIME_TYPE.toString();
-    }
-
-    return Collections.max(mimeTypes, comparing(MimeType::getSpecificity)).toString();
+    ByteArrayInputStream bis = new ByteArrayInputStream(content);
+    return tika.detect(TikaInputStream.get(bis), metadata);
   }
 }