ObjectChecker: Report .gitmodules files found in the pack

In order to validate .gitmodules files, we first need to find them
in the incoming pack.

Do it in the ObjectChecker stage. Check in the tree objects if they
point to a .gitmodules file and report the tree id and the .gitmodules
blob id.

This can be used later to check if the file is in the root of the
project and if the contents are good.

While we're here, make isMacHFSGit more accurate by also detecting
filenames that differ only in case.

[jn: tweaked NTFS and HFS+ checking; added more tests]

Change-Id: I70802e7d2c1374116149de4f89836b9498f39582
Signed-off-by: Ivan Frade <ifrade@google.com>
Signed-off-by: Jonathan Nieder <jrn@google.com>
Signed-off-by: Matthias Sohn <matthias.sohn@sap.com>
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/lib/ObjectCheckerTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/lib/ObjectCheckerTest.java
index 43160fb..7d298ed 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/lib/ObjectCheckerTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/lib/ObjectCheckerTest.java
@@ -768,6 +768,112 @@
 	}
 
 	@Test
+	public void testValidTreeWithGitmodules() throws CorruptObjectException {
+		ObjectId treeId = ObjectId
+				.fromString("0123012301230123012301230123012301230123");
+		StringBuilder b = new StringBuilder();
+		ObjectId blobId = entry(b, "100644 .gitmodules");
+
+		byte[] data = encodeASCII(b.toString());
+		checker.checkTree(treeId, data);
+		assertEquals(1, checker.getGitsubmodules().size());
+		assertEquals(treeId, checker.getGitsubmodules().get(0).getTreeId());
+		assertEquals(blobId, checker.getGitsubmodules().get(0).getBlobId());
+	}
+
+	/*
+	 * Windows case insensitivity and long file name handling
+	 * mean that .gitmodules has many synonyms.
+	 *
+	 * Examples inspired by git.git's t/t0060-path-utils.sh, by
+	 * Johannes Schindelin and Congyi Wu.
+	 */
+	@Test
+	public void testNTFSGitmodules() throws CorruptObjectException {
+		for (String gitmodules : new String[] {
+			".GITMODULES",
+			".gitmodules",
+			".Gitmodules",
+			".gitmoduleS",
+			"gitmod~1",
+			"GITMOD~1",
+			"gitmod~4",
+			"GI7EBA~1",
+			"gi7eba~9",
+			"GI7EB~10",
+			"GI7E~123",
+			"~1000000",
+			"~9999999"
+		}) {
+			checker = new ObjectChecker(); // Reset the ObjectChecker state.
+			checker.setSafeForWindows(true);
+			ObjectId treeId = ObjectId
+					.fromString("0123012301230123012301230123012301230123");
+			StringBuilder b = new StringBuilder();
+			ObjectId blobId = entry(b, "100644 " + gitmodules);
+
+			byte[] data = encodeASCII(b.toString());
+			checker.checkTree(treeId, data);
+			assertEquals(1, checker.getGitsubmodules().size());
+			assertEquals(treeId, checker.getGitsubmodules().get(0).getTreeId());
+			assertEquals(blobId, checker.getGitsubmodules().get(0).getBlobId());
+		}
+	}
+
+	@Test
+	public void testNotGitmodules() throws CorruptObjectException {
+		for (String notGitmodules : new String[] {
+			".gitmodu",
+			".gitmodules oh never mind",
+		}) {
+			checker = new ObjectChecker(); // Reset the ObjectChecker state.
+			checker.setSafeForWindows(true);
+			ObjectId treeId = ObjectId
+					.fromString("0123012301230123012301230123012301230123");
+			StringBuilder b = new StringBuilder();
+			entry(b, "100644 " + notGitmodules);
+
+			byte[] data = encodeASCII(b.toString());
+			checker.checkTree(treeId, data);
+			assertEquals(0, checker.getGitsubmodules().size());
+		}
+	}
+
+	/*
+	 * TODO HFS: match ".gitmodules" case-insensitively, after stripping out
+	 * certain ignorable (zero-width) Unicode code points that HFS+ strips out
+	 */
+
+	@Test
+	public void testValidTreeWithGitmodulesUppercase()
+			throws CorruptObjectException {
+		ObjectId treeId = ObjectId
+				.fromString("0123012301230123012301230123012301230123");
+		StringBuilder b = new StringBuilder();
+		ObjectId blobId = entry(b, "100644 .GITMODULES");
+
+		byte[] data = encodeASCII(b.toString());
+		checker.setSafeForWindows(true);
+		checker.checkTree(treeId, data);
+		assertEquals(1, checker.getGitsubmodules().size());
+		assertEquals(treeId, checker.getGitsubmodules().get(0).getTreeId());
+		assertEquals(blobId, checker.getGitsubmodules().get(0).getBlobId());
+	}
+
+	@Test
+	public void testTreeWithInvalidGitmodules() throws CorruptObjectException {
+		ObjectId treeId = ObjectId
+				.fromString("0123012301230123012301230123012301230123");
+		StringBuilder b = new StringBuilder();
+		entry(b, "100644 .gitmodulez");
+
+		byte[] data = encodeASCII(b.toString());
+		checker.setSafeForWindows(true); // Configure before checking, as in the other tests.
+		checker.checkTree(treeId, data);
+		assertEquals(0, checker.getGitsubmodules().size());
+	}
+
+	@Test
 	public void testNullSha1InTreeEntry() throws CorruptObjectException {
 		byte[] data = concat(
 				encodeASCII("100644 A"), new byte[] { '\0' },
@@ -1551,11 +1657,20 @@
 		checker.checkTree(encodeASCII(b.toString()));
 	}
 
-	private static void entry(StringBuilder b, final String modeName) {
+	/*
+	 * Returns the id generated for the entry
+	 */
+	private static ObjectId entry(StringBuilder b, String modeName) {
+		byte[] id = new byte[OBJECT_ID_LENGTH];
+
 		b.append(modeName);
 		b.append('\0');
-		for (int i = 0; i < OBJECT_ID_LENGTH; i++)
+		for (int i = 0; i < OBJECT_ID_LENGTH; i++) {
 			b.append((char) i);
+			id[i] = (byte) i;
+		}
+
+		return ObjectId.fromRaw(id);
 	}
 
 	private void assertCorrupt(String msg, int type, StringBuilder b) {
diff --git a/org.eclipse.jgit/.settings/.api_filters b/org.eclipse.jgit/.settings/.api_filters
index ed43015..81aa9c2 100644
--- a/org.eclipse.jgit/.settings/.api_filters
+++ b/org.eclipse.jgit/.settings/.api_filters
@@ -35,6 +35,22 @@
             </message_arguments>
         </filter>
     </resource>
+    <resource path="src/org/eclipse/jgit/lib/GitmoduleEntry.java" type="org.eclipse.jgit.lib.GitmoduleEntry">
+        <filter id="1109393411">
+            <message_arguments>
+                <message_argument value="4.7.5"/>
+                <message_argument value="org.eclipse.jgit.lib.GitmoduleEntry"/>
+            </message_arguments>
+        </filter>
+    </resource>
+    <resource path="src/org/eclipse/jgit/lib/ObjectChecker.java" type="org.eclipse.jgit.lib.ObjectChecker">
+        <filter id="1142947843">
+            <message_arguments>
+                <message_argument value="4.7.5"/>
+                <message_argument value="getGitsubmodules()"/>
+            </message_arguments>
+        </filter>
+    </resource>
     <resource path="src/org/eclipse/jgit/util/FS.java" type="org.eclipse.jgit.util.FS">
         <filter id="1141899266">
             <message_arguments>
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/lib/GitmoduleEntry.java b/org.eclipse.jgit/src/org/eclipse/jgit/lib/GitmoduleEntry.java
new file mode 100644
index 0000000..bded527
--- /dev/null
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/lib/GitmoduleEntry.java
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2018, Google LLC.
+ * and other copyright owners as documented in the project's IP log.
+ *
+ * This program and the accompanying materials are made available
+ * under the terms of the Eclipse Distribution License v1.0 which
+ * accompanies this distribution, is reproduced below, and is
+ * available at http://www.eclipse.org/org/documents/edl-v10.php
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials provided
+ *   with the distribution.
+ *
+ * - Neither the name of the Eclipse Foundation, Inc. nor the
+ *   names of its contributors may be used to endorse or promote
+ *   products derived from this software without specific prior
+ *   written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+package org.eclipse.jgit.lib;
+
+import org.eclipse.jgit.lib.AnyObjectId;
+
+/**
+ * A .gitmodules file found in the pack. Stores the blob of the file itself
+ * (e.g. to access its contents) and the tree where it was found (e.g. to check
+ * if it is in the root).
+ *
+ * @since 4.7.5
+ */
+public final class GitmoduleEntry {
+	private final AnyObjectId treeId;
+
+	private final AnyObjectId blobId;
+
+	/**
+	 * A record of (tree, blob) for a .gitmodules file in a pack
+	 *
+	 * @param treeId
+	 *            tree id containing a .gitmodules entry
+	 * @param blobId
+	 *            id of the blob of the .gitmodules file
+	 */
+	public GitmoduleEntry(AnyObjectId treeId, AnyObjectId blobId) {
+		// AnyObjectId instances are reused, so we must keep our own copies.
+		this.treeId = treeId.copy();
+		this.blobId = blobId.copy();
+	}
+
+	/**
+	 * @return Id of a .gitmodules file found in the pack
+	 */
+	public AnyObjectId getBlobId() {
+		return blobId;
+	}
+
+	/**
+	 * @return Id of a tree object where the .gitmodules file was found
+	 */
+	public AnyObjectId getTreeId() {
+		return treeId;
+	}
+}
\ No newline at end of file
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/lib/ObjectChecker.java b/org.eclipse.jgit/src/org/eclipse/jgit/lib/ObjectChecker.java
index 9d3aee1..6ae752c 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/lib/ObjectChecker.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/lib/ObjectChecker.java
@@ -44,6 +44,7 @@
 
 package org.eclipse.jgit.lib;
 
+import static org.eclipse.jgit.lib.Constants.DOT_GIT_MODULES;
 import static org.eclipse.jgit.lib.Constants.OBJECT_ID_LENGTH;
 import static org.eclipse.jgit.lib.Constants.OBJECT_ID_STRING_LENGTH;
 import static org.eclipse.jgit.lib.Constants.OBJ_BAD;
@@ -84,8 +85,10 @@
 
 import java.text.MessageFormat;
 import java.text.Normalizer;
+import java.util.ArrayList;
 import java.util.EnumSet;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Locale;
 import java.util.Set;
 
@@ -136,6 +139,9 @@
 	/** Header "tagger " */
 	public static final byte[] tagger = Constants.encodeASCII("tagger "); //$NON-NLS-1$
 
+	/** Path ".gitmodules" */
+	private static final byte[] dotGitmodules = Constants.encodeASCII(DOT_GIT_MODULES);
+
 	/**
 	 * Potential issues identified by the checker.
 	 *
@@ -199,6 +205,8 @@
 	private boolean windows;
 	private boolean macosx;
 
+	private final List<GitmoduleEntry> gitsubmodules = new ArrayList<>();
+
 	/**
 	 * Enable accepting specific malformed (but not horribly broken) objects.
 	 *
@@ -678,9 +686,15 @@
 				throw new CorruptObjectException(
 						JGitText.get().corruptObjectTruncatedInObjectId);
 			}
+
 			if (ObjectId.zeroId().compareTo(raw, ptr - OBJECT_ID_LENGTH) == 0) {
 				report(NULL_SHA1, id, JGitText.get().corruptObjectZeroId);
 			}
+
+			if (id != null && isGitmodules(raw, lastNameB, lastNameE, id)) {
+				ObjectId blob = ObjectId.fromRaw(raw, ptr - OBJECT_ID_LENGTH);
+				gitsubmodules.add(new GitmoduleEntry(id, blob));
+			}
 		}
 	}
 
@@ -845,10 +859,9 @@
 
 	// Mac's HFS+ folds permutations of ".git" and Unicode ignorable characters
 	// to ".git" therefore we should prevent such names
-	private boolean isMacHFSGit(byte[] raw, int ptr, int end,
+	private boolean isMacHFSPath(byte[] raw, int ptr, int end, byte[] path,
 			@Nullable AnyObjectId id) throws CorruptObjectException {
 		boolean ignorable = false;
-		byte[] git = new byte[] { '.', 'g', 'i', 't' };
 		int g = 0;
 		while (ptr < end) {
 			switch (raw[ptr]) {
@@ -904,17 +917,31 @@
 				}
 				return false;
 			default:
-				if (g == 4)
+				if (g == path.length) {
 					return false;
-				if (raw[ptr++] != git[g++])
+				}
+				if (toLower(raw[ptr++]) != path[g++]) {
 					return false;
+				}
 			}
 		}
-		if (g == 4 && ignorable)
+		if (g == path.length && ignorable) {
 			return true;
+		}
 		return false;
 	}
 
+	private boolean isMacHFSGit(byte[] raw, int ptr, int end,
+			@Nullable AnyObjectId id) throws CorruptObjectException {
+		byte[] git = new byte[] { '.', 'g', 'i', 't' };
+		return isMacHFSPath(raw, ptr, end, git, id);
+	}
+
+	private boolean isMacHFSGitmodules(byte[] raw, int ptr, int end,
+			@Nullable AnyObjectId id) throws CorruptObjectException {
+		return isMacHFSPath(raw, ptr, end, dotGitmodules, id);
+	}
+
 	private boolean checkTruncatedIgnorableUTF8(byte[] raw, int ptr, int end,
 			@Nullable AnyObjectId id) throws CorruptObjectException {
 		if ((ptr + 2) >= end) {
@@ -1021,6 +1048,104 @@
 				&& toLower(buf[p + 2]) == 't';
 	}
 
+	/**
+	 * Check if the filename contained in buf[start:end] could be read as a
+	 * .gitmodules file when checked out to the working directory.
+	 * <p>
+	 * This ought to be a simple comparison, but some filesystems have peculiar
+	 * rules for normalizing filenames:
+	 * <p>
+	 * NTFS has backward-compatibility support for 8.3 synonyms of long file
+	 * names (see
+	 * https://web.archive.org/web/20160318181041/https://usn.pw/blog/gen/2015/06/09/filenames/
+	 * for details). NTFS is also case-insensitive.
+	 * <p>
+	 * MacOS's HFS+ folds away ignorable Unicode characters in addition to case
+	 * folding.
+	 *
+	 * @param buf
+	 *            byte array holding the candidate filename
+	 * @param start
+	 *            offset of the first byte of the candidate filename
+	 * @param end
+	 *            offset just past the last byte of the candidate filename
+	 * @param id
+	 *            object id for error reporting, may be null
+	 *
+	 * @return true if the filename in buf could be a ".gitmodules" file
+	 * @throws CorruptObjectException may be raised by the HFS+ name check
+	 */
+	private boolean isGitmodules(byte[] buf, int start, int end, @Nullable AnyObjectId id)
+			throws CorruptObjectException {
+		// Nothing shorter than an 8.3 short name (8 bytes) can match.
+		if (end - start < 8) {
+			return false;
+		}
+		return (end - start == dotGitmodules.length
+				&& RawParseUtils.match(buf, start, dotGitmodules) != -1)
+			|| (macosx && isMacHFSGitmodules(buf, start, end, id))
+			|| (windows && isNTFSGitmodules(buf, start, end));
+	}
+
+	private boolean matchLowerCase(byte[] b, int ptr, byte[] src) {
+		if (ptr + src.length > b.length) {
+			return false;
+		}
+		for (int i = 0; i < src.length; i++, ptr++) {
+			if (toLower(b[ptr]) != src[i]) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	// .gitmodules, case-insensitive, or an 8.3 abbreviation of the same.
+	private boolean isNTFSGitmodules(byte[] buf, int start, int end) {
+		if (end - start == 11) {
+			return matchLowerCase(buf, start, dotGitmodules);
+		}
+
+		if (end - start != 8) {
+			return false;
+		}
+
+		// "gitmod" or a prefix of "gi7eba", followed by...
+		byte[] gitmod = new byte[]{'g', 'i', 't', 'm', 'o', 'd', '~'};
+		if (matchLowerCase(buf, start, gitmod)) {
+			start += 6;
+		} else {
+			byte[] gi7eba = new byte[]{'g', 'i', '7', 'e', 'b', 'a'};
+			for (int i = 0; i < gi7eba.length; i++, start++) {
+				byte c = (byte) toLower(buf[start]);
+				if (c == '~') {
+					break;
+				}
+				if (c != gi7eba[i]) {
+					return false;
+				}
+			}
+		}
+
+		// ... ~ and a number
+		if (end - start < 2) {
+			return false;
+		}
+		if (buf[start] != '~') {
+			return false;
+		}
+		start++;
+		if (buf[start] < '1' || buf[start] > '9') {
+			return false;
+		}
+		start++;
+		for (; start != end; start++) {
+			if (buf[start] < '0' || buf[start] > '9') {
+				return false;
+			}
+		}
+		return true;
+	}
+
 	private static boolean isGitTilde1(byte[] buf, int p, int end) {
 		if (end - p != 5)
 			return false;
@@ -1082,4 +1207,17 @@
 		String n = RawParseUtils.decode(raw, ptr, end).toLowerCase(Locale.US);
 		return macosx ? Normalizer.normalize(n, Normalizer.Form.NFC) : n;
 	}
+
+	/**
+	 * Get the list of ".gitmodules" files found in the pack. For each, report
+	 * its blob id (e.g. to validate its contents) and the tree where it was
+	 * found (e.g. to check if it is in the root).
+	 *
+	 * @return list of (tree, blob) id pairs; the checker's own list, not a copy
+	 *
+	 * @since 4.7.5
+	 */
+	public List<GitmoduleEntry> getGitsubmodules() {
+		return gitsubmodules;
+	}
 }