Include the mime-util library to guess file MIME types

The mime-util project is an open source, Apache License 2.0 library providing
support for MIME content type detection on files through both
the /etc/mime.types and /etc/magic.mime file formats.  We can use
it within CatServlet to determine what sort of file we are about
to serve to the client, so the client gets a proper Content-Type
header in the HTTP response.

Reviewed-by: Cedric Beust <cbeust@google.com>
Signed-off-by: Shawn O. Pearce <sop@google.com>
diff --git a/Documentation/licenses.txt b/Documentation/licenses.txt
index 4a4b197..71cca10 100644
--- a/Documentation/licenses.txt
+++ b/Documentation/licenses.txt
@@ -29,6 +29,7 @@
 OpenXRI                     <<apache2,Apache License 2.0>>
 Neko HTML                   <<apache2,Apache License 2.0>>
 Ehcache                     <<apache2,Apache License 2.0>>
+mime-util                   <<apache2,Apache License 2.0>>
 ICU4J                       <<icu4j,ICU4J License>>
 JGit                        <<jgit,New-Style BSD>>
 JSch                        <<sshd,New-Style BSD>>
diff --git a/pom.xml b/pom.xml
index 98c9841..c922c94 100644
--- a/pom.xml
+++ b/pom.xml
@@ -543,6 +543,12 @@
     </dependency>
 
     <dependency>
+      <groupId>eu.medsea.mimeutil</groupId>
+      <artifactId>mime-util</artifactId>
+      <version>2.1.2</version>
+    </dependency>
+
+    <dependency>
       <groupId>bouncycastle</groupId>
       <artifactId>bcpg-jdk15</artifactId>
       <version>140</version>
diff --git a/src/main/java/com/google/gerrit/server/CatServlet.java b/src/main/java/com/google/gerrit/server/CatServlet.java
index 024cb04..2f3839c 100644
--- a/src/main/java/com/google/gerrit/server/CatServlet.java
+++ b/src/main/java/com/google/gerrit/server/CatServlet.java
@@ -28,6 +28,8 @@
 import com.google.gwtjsonrpc.server.XsrfException;
 import com.google.gwtorm.client.OrmException;
 
+import eu.medsea.mimeutil.MimeType;
+
 import org.spearce.jgit.lib.Constants;
 import org.spearce.jgit.lib.ObjectId;
 import org.spearce.jgit.lib.Repository;
@@ -61,10 +63,10 @@
  */
 @SuppressWarnings("serial")
 public class CatServlet extends HttpServlet {
-  private static final String APPLICATION_OCTET_STREAM =
-      "application/octet-stream";
+  private static final MimeType ZIP = new MimeType("application/zip");
   private GerritServer server;
   private SecureRandom rng;
+  private FileTypeRegistry registry;
 
   @Override
   public void init(final ServletConfig config) throws ServletException {
@@ -77,6 +79,7 @@
       throw new ServletException("Cannot load GerritServer", e);
     }
     rng = new SecureRandom();
+    registry = FileTypeRegistry.getInstance();
   }
 
   @Override
@@ -219,11 +222,11 @@
     }
 
     final long when = fromCommit.getCommitTime() * 1000L;
-    String contentType = guessContentType(project, path, blobData);
+    MimeType contentType = registry.getMimeType(path, blobData);
     final String fn;
     final byte[] outData;
 
-    if (isSafeInline(contentType)) {
+    if (registry.isSafeInline(contentType)) {
       fn = safeFileName(path, suffix);
       outData = blobData;
 
@@ -248,11 +251,11 @@
       zo.close();
 
       outData = zip.toByteArray();
-      contentType = "application/zip";
+      contentType = ZIP;
       fn = safeFileName(path, suffix) + ".zip";
     }
 
-    rsp.setContentType(contentType);
+    rsp.setContentType(contentType.toString());
     rsp.setContentLength(outData.length);
     rsp.setDateHeader("Last-Modified", when);
     rsp.setHeader("Content-Disposition", "attachment; filename=\"" + fn + "\"");
@@ -262,27 +265,6 @@
     rsp.getOutputStream().write(outData);
   }
 
-  private String guessContentType(final Project project, final String path,
-      final byte[] content) {
-    // When in doubt, call it a generic binary stream.
-    //
-    return APPLICATION_OCTET_STREAM;
-  }
-
-  private boolean isSafeInline(final String contentType) {
-    if (APPLICATION_OCTET_STREAM.equals(contentType)) {
-      // Most browsers perform content type sniffing when they get told
-      // a generic content type. This is bad, so assume we cannot send
-      // the file inline.
-      //
-      return false;
-    }
-
-    // Assume we cannot send the content inline.
-    //
-    return false;
-  }
-
   private static String safeFileName(String fileName, final String suffix) {
     // Convert a file path (e.g. "src/Init.c") to a safe file name with
     // no meta-characters that might be unsafe on any given platform.
diff --git a/src/main/java/com/google/gerrit/server/FileTypeRegistry.java b/src/main/java/com/google/gerrit/server/FileTypeRegistry.java
new file mode 100644
index 0000000..2545aaa
--- /dev/null
+++ b/src/main/java/com/google/gerrit/server/FileTypeRegistry.java
@@ -0,0 +1,164 @@
+// Copyright (C) 2009 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.gerrit.server;
+
+import eu.medsea.mimeutil.MimeException;
+import eu.medsea.mimeutil.MimeType;
+import eu.medsea.mimeutil.MimeUtil2;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+/**
+ * Guesses the MIME type of a file from its name and content using mime-util.
+ * <p>
+ * A single shared instance is available through {@link #getInstance()}.
+ */
+public class FileTypeRegistry {
+  private static final Logger log =
+      LoggerFactory.getLogger(FileTypeRegistry.class);
+
+  /** Orders MIME types from most specific to least specific. */
+  private static final Comparator<MimeType> MOST_SPECIFIC_FIRST =
+      new Comparator<MimeType>() {
+        @Override
+        public int compare(MimeType a, MimeType b) {
+          // Compare explicitly instead of subtracting so the result can
+          // never be distorted by integer overflow.
+          final int sa = a.getSpecificity();
+          final int sb = b.getSpecificity();
+          return sa == sb ? 0 : (sb < sa ? -1 : 1);
+        }
+      };
+
+  private static final FileTypeRegistry INSTANCE = new FileTypeRegistry();
+
+  /** Get the global registry. */
+  public static FileTypeRegistry getInstance() {
+    return INSTANCE;
+  }
+
+  /** Detection engine; its detectors are registered by the constructor. */
+  private final MimeUtil2 mimeUtil;
+
+  private FileTypeRegistry() {
+    mimeUtil = new MimeUtil2();
+    register("eu.medsea.mimeutil.detector.ExtensionMimeDetector");
+    register("eu.medsea.mimeutil.detector.MagicMimeMimeDetector");
+    if (isWin32()) {
+      register("eu.medsea.mimeutil.detector.WindowsRegistryMimeDetector");
+    }
+  }
+
+  /** Register a mime-util detector by its fully qualified class name. */
+  private void register(String name) {
+    mimeUtil.registerMimeDetector(name);
+  }
+
+  /** @return true if the {@code os.name} system property mentions Windows. */
+  private static boolean isWin32() {
+    final String osDotName =
+        AccessController.doPrivileged(new PrivilegedAction<String>() {
+          public String run() {
+            return System.getProperty("os.name");
+          }
+        });
+    // Lower-case with a fixed locale so the match does not depend on the
+    // default locale of the JVM.
+    return osDotName != null
+        && osDotName.toLowerCase(Locale.ROOT).indexOf("windows") != -1;
+  }
+
+  /**
+   * Get the most specific MIME type available for a file.
+   *
+   * @param path name of the file. The base name (component after the last '/')
+   *        may be used to help determine the MIME type, such as by examining
+   *        the extension (portion after the last '.' if present).
+   * @param content the complete file content. If non-null the content may be
+   *        used to guess the MIME type by examining the beginning for common
+   *        file headers.
+   * @return the MIME type for this content. If the MIME type is not recognized
+   *         or cannot be determined, {@link MimeUtil2#UNKNOWN_MIME_TYPE} which
+   *         is an alias for {@code application/octet-stream}.
+   */
+  public MimeType getMimeType(final String path, final byte[] content) {
+    Set<MimeType> mimeTypes = new HashSet<MimeType>();
+    if (content != null && content.length > 0) {
+      try {
+        mimeTypes.addAll(mimeUtil.getMimeTypes(content));
+      } catch (MimeException e) {
+        log.warn("Unable to determine MIME type from content", e);
+      }
+    }
+    try {
+      mimeTypes.addAll(mimeUtil.getMimeTypes(path));
+    } catch (MimeException e) {
+      log.warn("Unable to determine MIME type from path", e);
+    }
+
+    if (isUnknownType(mimeTypes)) {
+      return MimeUtil2.UNKNOWN_MIME_TYPE;
+    }
+
+    // Prefer the most specific of the detected candidates.
+    final List<MimeType> types = new ArrayList<MimeType>(mimeTypes);
+    Collections.sort(types, MOST_SPECIFIC_FIRST);
+    return types.get(0);
+  }
+
+  /**
+   * Is this content type safe to transmit to a browser directly?
+   *
+   * @param contentType the MIME type of the file content.
+   * @return true if the Gerrit administrator wants to permit this content to be
+   *         served as-is; false if the administrator does not trust this
+   *         content type and wants it to be protected (typically by wrapping
+   *         the data in a ZIP archive).
+   */
+  public boolean isSafeInline(final MimeType contentType) {
+    if (MimeUtil2.UNKNOWN_MIME_TYPE.equals(contentType)) {
+      // Most browsers perform content type sniffing when they get told
+      // a generic content type. This is bad, so assume we cannot send
+      // the file inline.
+      //
+      return false;
+    }
+
+    // Assume we cannot send the content inline.
+    //
+    return false;
+  }
+
+  /** True if no type was detected, or only the generic unknown type was. */
+  private static boolean isUnknownType(Collection<MimeType> mimeTypes) {
+    if (mimeTypes.isEmpty()) {
+      return true;
+    }
+    return mimeTypes.size() == 1
+        && mimeTypes.contains(MimeUtil2.UNKNOWN_MIME_TYPE);
+  }
+}
diff --git a/src/main/java/log4j.properties b/src/main/java/log4j.properties
index 27d6547..a99be50 100644
--- a/src/main/java/log4j.properties
+++ b/src/main/java/log4j.properties
@@ -24,6 +24,10 @@
 log4j.logger.org.apache.sshd.server=WARN
 log4j.logger.org.apache.sshd.common.keyprovider.FileKeyPairProvider=INFO
 
+# Silence non-critical messages from mime-util.
+#
+log4j.logger.eu.medsea.mimeutil=WARN
+
 # Silence non-critical messages from openid4java
 #
 log4j.logger.org.apache.xml=WARN