ObjectChecker: Disallow names potentially mapping to ".git" on HFS+

Mac's HFS+ folds concatenations of ".git" and ignorable Unicode
characters [1] to ".git" [2]. Hence we need to disallow all names which
HFS+ could potentially map to ".git". Example: in an empty directory
create a folder ".g\u200Cit". Now you can't create another folder
".git", because HFS+ treats both names as the same name.

The following ignorable Unicode characters are ignored by HFS+ (the
second column is the character's UTF-8 byte sequence in hex):

unicode   UTF-8 hex   name
-------------------------------------------------
U+200C    0xe2808c    ZERO WIDTH NON-JOINER
U+200D    0xe2808d    ZERO WIDTH JOINER
U+200E    0xe2808e    LEFT-TO-RIGHT MARK
U+200F    0xe2808f    RIGHT-TO-LEFT MARK
U+202A    0xe280aa    LEFT-TO-RIGHT EMBEDDING
U+202B    0xe280ab    RIGHT-TO-LEFT EMBEDDING
U+202C    0xe280ac    POP DIRECTIONAL FORMATTING
U+202D    0xe280ad    LEFT-TO-RIGHT OVERRIDE
U+202E    0xe280ae    RIGHT-TO-LEFT OVERRIDE
U+206A    0xe281aa    INHIBIT SYMMETRIC SWAPPING
U+206B    0xe281ab    ACTIVATE SYMMETRIC SWAPPING
U+206C    0xe281ac    INHIBIT ARABIC FORM SHAPING
U+206D    0xe281ad    ACTIVATE ARABIC FORM SHAPING
U+206E    0xe281ae    NATIONAL DIGIT SHAPES
U+206F    0xe281af    NOMINAL DIGIT SHAPES
U+FEFF    0xefbbbf    ZERO WIDTH NO-BREAK SPACE

[1] http://www.unicode.org/versions/Unicode7.0.0/ch05.pdf#G40025
    http://www.unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
[2] http://dubeiko.com/development/FileSystems/HFSPLUS/tn1150.html#UnicodeSubtleties

Change-Id: Ib6a1dd090b2649bdd8ec16387c994ed29de2860d
Signed-off-by: Matthias Sohn <matthias.sohn@sap.com>
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/lib/ObjectCheckerTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/lib/ObjectCheckerTest.java
index 8a782b7..c6578cc 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/lib/ObjectCheckerTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/lib/ObjectCheckerTest.java
@@ -1308,6 +1308,126 @@ public void testInvalidTreeNameIsMixedCaseGit() {
 	}
 
 	@Test
+	public void testInvalidTreeNameIsMacHFSGit() {
+		// ZERO WIDTH NON-JOINER (U+200C) placed inside ".git" must be rejected
+		StringBuilder tree = new StringBuilder();
+		entry(tree, "100644 .gi\u200Ct");
+		byte[] raw = Constants.encode(tree.toString());
+		try {
+			checker.setSafeForMacOS(true);
+			checker.checkTree(raw);
+			fail("incorrectly accepted an invalid tree");
+		} catch (CorruptObjectException err) {
+			String want = "invalid name '.gi\u200Ct' contains ignorable Unicode characters";
+			assertEquals(want, err.getMessage());
+		}
+	}
+
+	@Test
+	public void testInvalidTreeNameIsMacHFSGit2() {
+		// ACTIVATE SYMMETRIC SWAPPING (U+206B) in front of ".git" must be rejected
+		StringBuilder tree = new StringBuilder();
+		entry(tree, "100644 \u206B.git");
+		byte[] raw = Constants.encode(tree.toString());
+		try {
+			checker.setSafeForMacOS(true);
+			checker.checkTree(raw);
+			fail("incorrectly accepted an invalid tree");
+		} catch (CorruptObjectException err) {
+			String want = "invalid name '\u206B.git' contains ignorable Unicode characters";
+			assertEquals(want, err.getMessage());
+		}
+	}
+
+	@Test
+	public void testInvalidTreeNameIsMacHFSGit3() {
+		// ZERO WIDTH NO-BREAK SPACE (U+FEFF) appended to ".git" must be rejected
+		StringBuilder tree = new StringBuilder();
+		entry(tree, "100644 .git\uFEFF");
+		byte[] raw = Constants.encode(tree.toString());
+		try {
+			checker.setSafeForMacOS(true);
+			checker.checkTree(raw);
+			fail("incorrectly accepted an invalid tree");
+		} catch (CorruptObjectException err) {
+			String want = "invalid name '.git\uFEFF' contains ignorable Unicode characters";
+			assertEquals(want, err.getMessage());
+		}
+	}
+
+	private static byte[] concat(byte[] b1, byte[] b2) {
+		byte[] joined = new byte[b1.length + b2.length]; // b1 first, then b2
+		for (int i = 0; i < joined.length; i++)
+			joined[i] = i < b1.length ? b1[i] : b2[i - b1.length];
+		return joined;
+	}
+
+	@Test
+	public void testInvalidTreeNameIsMacHFSGitCorruptUTF8AtEnd() {
+		// ".git" followed by a lone 0xef byte, i.e. a truncated UTF-8 sequence
+		byte[] name = concat(Constants.encode("100644 .git"),
+				new byte[] { (byte) 0xef });
+		StringBuilder rest = new StringBuilder();
+		entry(rest, "");
+		byte[] raw = concat(name, Constants.encode(rest.toString()));
+		try {
+			checker.setSafeForMacOS(true);
+			checker.checkTree(raw);
+			fail("incorrectly accepted an invalid tree");
+		} catch (CorruptObjectException err) {
+			String want = "invalid name contains byte sequence '0xef' which is not a valid UTF-8 character";
+			assertEquals(want, err.getMessage());
+		}
+	}
+
+	@Test
+	public void testInvalidTreeNameIsMacHFSGitCorruptUTF8AtEnd2() {
+		// ".git" followed by only two bytes (0xe2 0xab) of a 3-byte sequence
+		byte[] name = concat(Constants.encode("100644 .git"), new byte[] {
+				(byte) 0xe2, (byte) 0xab });
+		StringBuilder rest = new StringBuilder();
+		entry(rest, "");
+		byte[] raw = concat(name, Constants.encode(rest.toString()));
+		try {
+			checker.setSafeForMacOS(true);
+			checker.checkTree(raw);
+			fail("incorrectly accepted an invalid tree");
+		} catch (CorruptObjectException err) {
+			String want = "invalid name contains byte sequence '0xe2ab' which is not a valid UTF-8 character";
+			assertEquals(want, err.getMessage());
+		}
+	}
+
+	@Test
+	public void testInvalidTreeNameIsNotMacHFSGit()
+			throws CorruptObjectException {
+		// a regular character after ".git" + U+200C: the name must be accepted
+		StringBuilder tree = new StringBuilder();
+		entry(tree, "100644 .git\u200Cx");
+		checker.setSafeForMacOS(true);
+		checker.checkTree(Constants.encode(tree.toString()));
+	}
+
+	@Test
+	public void testInvalidTreeNameIsNotMacHFSGit2()
+			throws CorruptObjectException {
+		// ".kit" is not ".git", so the trailing U+200C must not cause rejection
+		StringBuilder tree = new StringBuilder();
+		entry(tree, "100644 .kit\u200C");
+		checker.setSafeForMacOS(true);
+		checker.checkTree(Constants.encode(tree.toString()));
+	}
+
+	@Test
+	public void testInvalidTreeNameIsNotMacHFSGitOtherPlatform()
+			throws CorruptObjectException {
+		// setSafeForMacOS(true) is not called here: the name must be accepted
+		StringBuilder tree = new StringBuilder();
+		entry(tree, "100644 .git\u200C");
+		checker.checkTree(Constants.encode(tree.toString()));
+	}
+
+	@Test
 	public void testInvalidTreeNameIsDotGitDot() {
 		StringBuilder b = new StringBuilder();
 		entry(b, "100644 .git.");
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/lib/ObjectChecker.java b/org.eclipse.jgit/src/org/eclipse/jgit/lib/ObjectChecker.java
index 4913c44..281bccd 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/lib/ObjectChecker.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/lib/ObjectChecker.java
@@ -469,6 +469,11 @@ private void checkPathSegment2(byte[] raw, int ptr, int end)
 					RawParseUtils.decode(raw, ptr, end)));
 		}
 
+		if (macosx && isMacHFSGit(raw, ptr, end))
+			throw new CorruptObjectException(String.format(
+					"invalid name '%s' contains ignorable Unicode characters",
+					RawParseUtils.decode(raw, ptr, end)));
+
 		if (windows) {
 			// Windows ignores space and dot at end of file name.
 			if (raw[end - 1] == ' ' || raw[end - 1] == '.')
@@ -479,6 +484,88 @@ private void checkPathSegment2(byte[] raw, int ptr, int end)
 		}
 	}
 
+	// Mac's HFS+ ignores certain Unicode characters when comparing names, so
+	// ".git" interleaved with them resolves to ".git". Returns true for such a
+	// name in raw[ptr, end) so the caller can reject it. Throws if a 0xe2 or
+	// 0xef lead byte has fewer than two bytes after it before end.
+	private static boolean isMacHFSGit(byte[] raw, int ptr, int end)
+			throws CorruptObjectException {
+		boolean ignorable = false;
+		byte[] git = new byte[] { '.', 'g', 'i', 't' };
+		int g = 0;
+		while (ptr < end) {
+			switch (raw[ptr]) {
+			case (byte) 0xe2: // http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192
+				checkTruncatedIgnorableUTF8(raw, ptr, end);
+				switch (raw[ptr + 1]) {
+				case (byte) 0x80:
+					switch (raw[ptr + 2]) {
+					case (byte) 0x8c:	// U+200C 0xe2808c ZERO WIDTH NON-JOINER
+					case (byte) 0x8d:	// U+200D 0xe2808d ZERO WIDTH JOINER
+					case (byte) 0x8e:	// U+200E 0xe2808e LEFT-TO-RIGHT MARK
+					case (byte) 0x8f:	// U+200F 0xe2808f RIGHT-TO-LEFT MARK
+					case (byte) 0xaa:	// U+202A 0xe280aa LEFT-TO-RIGHT EMBEDDING
+					case (byte) 0xab:	// U+202B 0xe280ab RIGHT-TO-LEFT EMBEDDING
+					case (byte) 0xac:	// U+202C 0xe280ac POP DIRECTIONAL FORMATTING
+					case (byte) 0xad:	// U+202D 0xe280ad LEFT-TO-RIGHT OVERRIDE
+					case (byte) 0xae:	// U+202E 0xe280ae RIGHT-TO-LEFT OVERRIDE
+						ignorable = true;
+						ptr += 3;
+						continue;
+					default:
+						return false;
+					}
+				case (byte) 0x81:
+					switch (raw[ptr + 2]) {
+					case (byte) 0xaa:	// U+206A 0xe281aa INHIBIT SYMMETRIC SWAPPING
+					case (byte) 0xab:	// U+206B 0xe281ab ACTIVATE SYMMETRIC SWAPPING
+					case (byte) 0xac:	// U+206C 0xe281ac INHIBIT ARABIC FORM SHAPING
+					case (byte) 0xad:	// U+206D 0xe281ad ACTIVATE ARABIC FORM SHAPING
+					case (byte) 0xae:	// U+206E 0xe281ae NATIONAL DIGIT SHAPES
+					case (byte) 0xaf:	// U+206F 0xe281af NOMINAL DIGIT SHAPES
+						ignorable = true;
+						ptr += 3;
+						continue;
+					default:
+						return false;
+					}
+				default: // any other 0xe2 sequence; returning avoids looping with ptr unchanged
+					return false;
+				}
+			case (byte) 0xef: // http://www.utf8-chartable.de/unicode-utf8-table.pl?start=65024
+				checkTruncatedIgnorableUTF8(raw, ptr, end);
+				// U+FEFF 0xefbbbf ZERO WIDTH NO-BREAK SPACE
+				if (raw[ptr + 1] == (byte) 0xbb && raw[ptr + 2] == (byte) 0xbf) {
+					ignorable = true;
+					ptr += 3;
+					continue;
+				}
+				return false;
+			default: // ordinary byte: has to be the next expected byte of ".git"
+				if (g == 4) // extra byte after a complete ".git"
+					return false;
+				if (raw[ptr++] != git[g++])
+					return false;
+			}
+		}
+		return g == 4 && ignorable; // plain ".git" without ignorables is not reported here
+	}
+
+	private static void checkTruncatedIgnorableUTF8(byte[] raw, int ptr, int end)
+			throws CorruptObjectException {
+		if (end - ptr < 3) // need the lead byte plus two trailing bytes before end
+			throw new CorruptObjectException(MessageFormat.format(
+					"invalid name contains byte sequence ''{0}'' which is not a valid UTF-8 character",
+					toHexString(raw, ptr, end)));
+	}
+
+	private static String toHexString(byte[] raw, int ptr, int end) {
+		StringBuilder hex = new StringBuilder("0x"); //$NON-NLS-1$
+		while (ptr < end) // two lowercase hex digits for each byte in [ptr, end)
+			hex.append(String.format("%02x", Byte.valueOf(raw[ptr++]))); //$NON-NLS-1$
+		return hex.toString();
+	}
+
 	private static void checkNotWindowsDevice(byte[] raw, int ptr, int end)
 			throws CorruptObjectException {
 		switch (toLower(raw[ptr])) {