Implement safe find/replace for user-provided links

Extract an interface out of RegexFindReplace for replacement with
arbitrary types of sanitization or other munging. Implement a
LinkFindReplace that knows how to sanitize links with SafeHtmlBuilder.

Push the replacement into the FindReplace interface so implementations
can do sanitization _after_ regular expression replacement.

Change-Id: I46383abd1921a11b4c90b80b2022051fc0c8e13c
diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs
index 82eb859..f9fe345 100644
--- a/.settings/org.eclipse.core.resources.prefs
+++ b/.settings/org.eclipse.core.resources.prefs
@@ -1,3 +1,4 @@
-#Tue Sep 02 16:59:24 PDT 2008
 eclipse.preferences.version=1
+encoding//src/main/java=UTF-8
+encoding//src/test/java=UTF-8
 encoding/<project>=UTF-8
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
index b0b2fe2..3473ea8 100644
--- a/.settings/org.eclipse.jdt.core.prefs
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -1,4 +1,3 @@
-#Wed May 13 08:49:53 PDT 2009
 eclipse.preferences.version=1
 org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
 org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
@@ -9,6 +8,8 @@
 org.eclipse.jdt.core.compiler.debug.sourceFile=generate
 org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
 org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.processAnnotations=disabled
 org.eclipse.jdt.core.compiler.source=1.6
 org.eclipse.jdt.core.formatter.align_type_members_on_columns=false
 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression=16
diff --git a/src/main/java/com/google/gwtexpui/safehtml/client/FindReplace.java b/src/main/java/com/google/gwtexpui/safehtml/client/FindReplace.java
new file mode 100644
index 0000000..f7bc907
--- /dev/null
+++ b/src/main/java/com/google/gwtexpui/safehtml/client/FindReplace.java
@@ -0,0 +1,40 @@
+// Copyright (C) 2013 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.gwtexpui.safehtml.client;
+
+import com.google.gwt.regexp.shared.RegExp;
+
+/** A Find/Replace pair used against the {@link SafeHtml} block of text. */
+public interface FindReplace {
+  /**
+   * @return regular expression to match substrings with; should be treated as
+   *     immutable.
+   */
+  public RegExp pattern();
+
+  /**
+   * Find and replace a single instance of this pattern in an input.
+   * <p>
+   * <b>WARNING:</b> No XSS sanitization is done on the return value of this
+   * method, e.g. this value may be passed directly to
+   * {@link SafeHtml#replaceAll(String, String)}. Implementations must sanitize output
+   * appropriately.
+   *
+   * @param input input string.
+   * @return result of regular expression replacement.
+   * @throws IllegalArgumentException if the input could not be safely sanitized.
+   */
+  public String replace(String input);
+}
diff --git a/src/main/java/com/google/gwtexpui/safehtml/client/LinkFindReplace.java b/src/main/java/com/google/gwtexpui/safehtml/client/LinkFindReplace.java
new file mode 100644
index 0000000..eaa4f23
--- /dev/null
+++ b/src/main/java/com/google/gwtexpui/safehtml/client/LinkFindReplace.java
@@ -0,0 +1,84 @@
+// Copyright (C) 2013 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.gwtexpui.safehtml.client;
+
+import com.google.gwt.regexp.shared.RegExp;
+
+/**
+ * A Find/Replace pair whose replacement string is a link.
+ * <p>
+ * It is safe to pass arbitrary user-provided links to this class. Links are
+ * sanitized as follows:
+ * <ul>
+ * <li>Only http(s) and mailto links are supported; any other scheme results in
+ * an {@link IllegalArgumentException} from {@link #replace(String)}.
+ * <li>Special characters in the link after regex replacement are escaped with
+ * {@link SafeHtmlBuilder}.</li>
+ * </ul>
+ */
+public class LinkFindReplace implements FindReplace {
+  public static boolean hasValidScheme(String link) {
+    int colon = link.indexOf(':');
+    if (colon < 0) {
+      return true;
+    }
+    String scheme = link.substring(0, colon);
+    return "http".equalsIgnoreCase(scheme)
+        || "https".equalsIgnoreCase(scheme)
+        || "mailto".equalsIgnoreCase(scheme);
+  }
+
+  private RegExp pat;
+  private String link;
+
+  protected LinkFindReplace() {
+  }
+
+  /**
+   * @param regex regular expression pattern to match substrings with.
+   * @param repl replacement link href. Capture groups within
+   *        <code>regex</code> can be referenced with <code>$<i>n</i></code>.
+   */
+  public LinkFindReplace(String find, String link) {
+    this.pat = RegExp.compile(find);
+    this.link = link;
+  }
+
+  @Override
+  public RegExp pattern() {
+    return pat;
+  }
+
+  @Override
+  public String replace(String input) {
+    String href = pat.replace(input, link);
+    if (!hasValidScheme(href)) {
+      throw new IllegalArgumentException(
+          "Invalid scheme (" + toString() + "): " + href);
+    }
+    String result = new SafeHtmlBuilder()
+        .openAnchor()
+        .setAttribute("href", href)
+        .append(SafeHtml.asis(input))
+        .closeAnchor()
+        .asString();
+    return result;
+  }
+
+  @Override
+  public String toString() {
+    return "find = " + pat.getSource() + ", link = " + link;
+  }
+}
diff --git a/src/main/java/com/google/gwtexpui/safehtml/client/RawFindReplace.java b/src/main/java/com/google/gwtexpui/safehtml/client/RawFindReplace.java
new file mode 100644
index 0000000..d22fef6
--- /dev/null
+++ b/src/main/java/com/google/gwtexpui/safehtml/client/RawFindReplace.java
@@ -0,0 +1,55 @@
+// Copyright (C) 2009 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.gwtexpui.safehtml.client;
+
+import com.google.gwt.regexp.shared.RegExp;
+
+/**
+ * A Find/Replace pair whose replacement string is arbitrary HTML.
+ * <p>
+ * <b>WARNING:</b> This class is not safe used with user-provided patterns.
+ */
+public class RawFindReplace implements FindReplace {
+  private RegExp pat;
+  private String replace;
+
+  protected RawFindReplace() {
+  }
+
+  /**
+   * @param regex regular expression pattern to match substrings with.
+   * @param repl replacement expression. Capture groups within
+   *        <code>regex</code> can be referenced with <code>$<i>n</i></code>.
+   */
+  public RawFindReplace(String find, String replace) {
+    this.pat = RegExp.compile(find);
+    this.replace = replace;
+  }
+
+  @Override
+  public RegExp pattern() {
+    return pat;
+  }
+
+  @Override
+  public String replace(String input) {
+    return pat.replace(input, replace);
+  }
+
+  @Override
+  public String toString() {
+    return "find = " + pat.getSource() + ", replace = " + replace;
+  }
+}
diff --git a/src/main/java/com/google/gwtexpui/safehtml/client/RegexFindReplace.java b/src/main/java/com/google/gwtexpui/safehtml/client/RegexFindReplace.java
deleted file mode 100644
index fe272d4..0000000
--- a/src/main/java/com/google/gwtexpui/safehtml/client/RegexFindReplace.java
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (C) 2009 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package com.google.gwtexpui.safehtml.client;
-
-/** A Find/Replace pair used against the SafeHtml block of text */
-public class RegexFindReplace {
-  private String find;
-  private String replace;
-
-  protected RegexFindReplace() {
-  }
-
-  public RegexFindReplace(final String find, final String replace) {
-    this.find = find;
-    this.replace = replace;
-  }
-
-  public String find() {
-    return find;
-  }
-
-  public String replace() {
-    return replace;
-  }
-
-  @Override
-  public String toString() {
-    return "find = " + find + ", replace = " + replace;
-  }
-}
diff --git a/src/main/java/com/google/gwtexpui/safehtml/client/SafeHtml.java b/src/main/java/com/google/gwtexpui/safehtml/client/SafeHtml.java
index 34ce6f4..5794386 100644
--- a/src/main/java/com/google/gwtexpui/safehtml/client/SafeHtml.java
+++ b/src/main/java/com/google/gwtexpui/safehtml/client/SafeHtml.java
@@ -25,7 +25,6 @@
 import com.google.gwt.user.client.ui.InlineHTML;
 import com.google.gwt.user.client.ui.Widget;
 
-import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
@@ -115,9 +114,9 @@
   /** Convert bare http:// and https:// URLs into &lt;a href&gt; tags. */
   public SafeHtml linkify() {
     final String part = "(?:" +
-		"[a-zA-Z0-9$_.+!*',%;:@=?#/~-]" +
-		"|&(?!lt;|gt;)" +
-		")";
+    "[a-zA-Z0-9$_.+!*',%;:@=?#/~-]" +
+    "|&(?!lt;|gt;)" +
+    ")";
     return replaceAll(
         "(https?://" +
           part + "{2,}" +
@@ -241,35 +240,22 @@
     return new SafeHtmlString(asString().replaceAll(regex, repl));
   }
 
-  private static class RegExpReplacement {
-    private final RegExp re;
-    private final String repl;
-
-    private RegExpReplacement(String pat, String repl) {
-      this.re = RegExp.compile(pat);
-      this.repl = repl;
-    }
-  }
-
   /**
    * Replace all find/replace pairs in the list in a single pass.
    *
    * @param findReplaceList find/replace pairs to use.
    * @return a new string, after the replacements have been made.
    */
-  public SafeHtml replaceAll(final List<RegexFindReplace> findReplaceList) {
+  public <T> SafeHtml replaceAll(List<? extends FindReplace> findReplaceList) {
     if (findReplaceList == null) {
       return this;
     }
-    List<RegExpReplacement> repls =
-        new ArrayList<RegExpReplacement>(findReplaceList.size());
 
     StringBuilder pat = new StringBuilder();
-    Iterator<RegexFindReplace> it = findReplaceList.iterator();
+    Iterator<? extends FindReplace> it = findReplaceList.iterator();
     while (it.hasNext()) {
-      RegexFindReplace fr = it.next();
-      repls.add(new RegExpReplacement(fr.find(), fr.replace()));
-      pat.append(fr.find());
+      FindReplace fr = it.next();
+      pat.append(fr.pattern().getSource());
       if (it.hasNext()) {
         pat.append('|');
       }
@@ -283,10 +269,15 @@
     while ((mat = re.exec(orig)) != null) {
       String g = mat.getGroup(0);
       // Re-run each candidate to find which one matched.
-      for (RegExpReplacement repl : repls) {
-        if (repl.re.exec(g) != null) {
-          result.append(orig.substring(index, mat.getIndex()));
-          result.append(repl.re.replace(g, repl.repl));
+      for (FindReplace fr : findReplaceList) {
+        if (fr.pattern().test(g)) {
+          try {
+            String repl = fr.replace(g);
+            result.append(orig.substring(index, mat.getIndex()));
+            result.append(repl);
+          } catch (IllegalArgumentException e) {
+            continue;
+          }
           index = mat.getIndex() + g.length();
           break;
         }
diff --git a/src/test/java/com/google/gwtexpui/safehtml/client/LinkFindReplaceTest.java b/src/test/java/com/google/gwtexpui/safehtml/client/LinkFindReplaceTest.java
new file mode 100644
index 0000000..97f816f
--- /dev/null
+++ b/src/test/java/com/google/gwtexpui/safehtml/client/LinkFindReplaceTest.java
@@ -0,0 +1,77 @@
+// Copyright (C) 2013 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.gwtexpui.safehtml.client;
+
+import static com.google.gwtexpui.safehtml.client.LinkFindReplace.hasValidScheme;
+
+import junit.framework.TestCase;
+
+public class LinkFindReplaceTest extends TestCase {
+  public void testNoEscaping() {
+    String find = "find";
+    String link = "link";
+    LinkFindReplace a = new LinkFindReplace(find, link);
+    assertEquals(find, a.pattern().getSource());
+    assertEquals("<a href=\"link\">find</a>", a.replace(find));
+    assertEquals("find = " + find + ", link = " + link, a.toString());
+  }
+
+  public void testBackreference() {
+    assertEquals("<a href=\"/bug?id=123\">issue 123</a>",
+        new LinkFindReplace("(bug|issue)\\s*([0-9]+)", "/bug?id=$2")
+            .replace("issue 123"));
+  }
+
+  public void testHasValidScheme() {
+    assertTrue(hasValidScheme("/absolute/path"));
+    assertTrue(hasValidScheme("relative/path"));
+    assertTrue(hasValidScheme("http://url/"));
+    assertTrue(hasValidScheme("HTTP://url/"));
+    assertTrue(hasValidScheme("https://url/"));
+    assertTrue(hasValidScheme("mailto://url/"));
+    assertFalse(hasValidScheme("ftp://url/"));
+    assertFalse(hasValidScheme("data:evil"));
+    assertFalse(hasValidScheme("javascript:alert(1)"));
+  }
+
+  public void testInvalidSchemeInReplace() {
+    try {
+      new LinkFindReplace("find", "javascript:alert(1)").replace("find");
+      fail("Expected IllegalStateException");
+    } catch (IllegalArgumentException expected) {
+    }
+  }
+
+  public void testInvalidSchemeWithBackreference() {
+    try {
+      new LinkFindReplace(".*(script:[^;]*)", "java$1")
+          .replace("Look at this script: alert(1);");
+      fail("Expected IllegalStateException");
+    } catch (IllegalArgumentException expected) {
+    }
+  }
+
+  public void testReplaceEscaping() {
+    assertEquals("<a href=\"a&quot;&amp;&#39;&lt;&gt;b\">find</a>",
+        new LinkFindReplace("find", "a\"&'<>b").replace("find"));
+  }
+
+  public void testHtmlInFind() {
+    String rawFind = "<b>&quot;bold&quot;</b>";
+    LinkFindReplace a = new LinkFindReplace(rawFind, "/bold");
+    assertEquals(rawFind, a.pattern().getSource());
+    assertEquals("<a href=\"/bold\">" + rawFind + "</a>", a.replace(rawFind));
+  }
+}
diff --git a/src/test/java/com/google/gwtexpui/safehtml/client/RegexFindReplaceTest.java b/src/test/java/com/google/gwtexpui/safehtml/client/RawFindReplaceTest.java
similarity index 77%
rename from src/test/java/com/google/gwtexpui/safehtml/client/RegexFindReplaceTest.java
rename to src/test/java/com/google/gwtexpui/safehtml/client/RawFindReplaceTest.java
index a11a7ec..9c450bd 100644
--- a/src/test/java/com/google/gwtexpui/safehtml/client/RegexFindReplaceTest.java
+++ b/src/test/java/com/google/gwtexpui/safehtml/client/RawFindReplaceTest.java
@@ -16,13 +16,13 @@
 
 import junit.framework.TestCase;
 
-public class RegexFindReplaceTest extends TestCase {
-  public void testCreate() {
+public class RawFindReplaceTest extends TestCase {
+  public void testFindReplace() {
     final String find = "find";
     final String replace = "replace";
-    final RegexFindReplace a = new RegexFindReplace(find, replace);
-    assertSame(find, a.find());
-    assertSame(replace, a.replace());
+    final RawFindReplace a = new RawFindReplace(find, replace);
+    assertEquals(find, a.pattern().getSource());
+    assertEquals(replace, a.replace(find));
     assertEquals("find = " + find + ", replace = " + replace, a.toString());
   }
 }
diff --git a/src/test/java/com/google/gwtexpui/safehtml/client/SafeHtml_ReplaceTest.java b/src/test/java/com/google/gwtexpui/safehtml/client/SafeHtml_ReplaceTest.java
index ec9d06f..e8c14d7 100644
--- a/src/test/java/com/google/gwtexpui/safehtml/client/SafeHtml_ReplaceTest.java
+++ b/src/test/java/com/google/gwtexpui/safehtml/client/SafeHtml_ReplaceTest.java
@@ -23,7 +23,7 @@
   public void testReplaceOneLink() {
     SafeHtml o = html("A\nissue 42\nB");
     SafeHtml n = o.replaceAll(repls(
-        new RegexFindReplace("(issue\\s(\\d+))", "<a href=\"?$2\">$1</a>")));
+        new RawFindReplace("(issue\\s(\\d+))", "<a href=\"?$2\">$1</a>")));
     assertNotSame(o, n);
     assertEquals("A\n<a href=\"?42\">issue 42</a>\nB", n.asString());
   }
@@ -31,7 +31,7 @@
   public void testReplaceNoLeadingOrTrailingText() {
     SafeHtml o = html("issue 42");
     SafeHtml n = o.replaceAll(repls(
-        new RegexFindReplace("(issue\\s(\\d+))", "<a href=\"?$2\">$1</a>")));
+        new RawFindReplace("(issue\\s(\\d+))", "<a href=\"?$2\">$1</a>")));
     assertNotSame(o, n);
     assertEquals("<a href=\"?42\">issue 42</a>", n.asString());
   }
@@ -39,7 +39,7 @@
   public void testReplaceTwoLinks() {
     SafeHtml o = html("A\nissue 42\nissue 9918\nB");
     SafeHtml n = o.replaceAll(repls(
-        new RegexFindReplace("(issue\\s(\\d+))", "<a href=\"?$2\">$1</a>")));
+        new RawFindReplace("(issue\\s(\\d+))", "<a href=\"?$2\">$1</a>")));
     assertNotSame(o, n);
     assertEquals("A\n"
         + "<a href=\"?42\">issue 42</a>\n"
@@ -51,9 +51,9 @@
   public void testReplaceInOrder() {
     SafeHtml o = html("A\nissue 42\nReally GWTEXPUI-9918 is better\nB");
     SafeHtml n = o.replaceAll(repls(
-        new RegexFindReplace("(GWTEXPUI-(\\d+))",
+        new RawFindReplace("(GWTEXPUI-(\\d+))",
             "<a href=\"gwtexpui-bug?$2\">$1</a>"),
-        new RegexFindReplace("(issue\\s+(\\d+))",
+        new RawFindReplace("(issue\\s+(\\d+))",
             "<a href=\"generic-bug?$2\">$1</a>")));
     assertNotSame(o, n);
     assertEquals("A\n"
@@ -65,9 +65,9 @@
 
   public void testReplaceOverlappingAfterFirstChar() {
     SafeHtml o = html("abcd");
-    RegexFindReplace ab = new RegexFindReplace("ab", "AB");
-    RegexFindReplace bc = new RegexFindReplace("bc", "23");
-    RegexFindReplace cd = new RegexFindReplace("cd", "YZ");
+    RawFindReplace ab = new RawFindReplace("ab", "AB");
+    RawFindReplace bc = new RawFindReplace("bc", "23");
+    RawFindReplace cd = new RawFindReplace("cd", "YZ");
 
     assertEquals("ABcd", o.replaceAll(repls(ab, bc)).asString());
     assertEquals("ABcd", o.replaceAll(repls(bc, ab)).asString());
@@ -76,8 +76,8 @@
 
   public void testReplaceOverlappingAtFirstCharLongestMatch() {
     SafeHtml o = html("abcd");
-    RegexFindReplace ab = new RegexFindReplace("ab", "AB");
-    RegexFindReplace abc = new RegexFindReplace("[^d][^d][^d]", "234");
+    RawFindReplace ab = new RawFindReplace("ab", "AB");
+    RawFindReplace abc = new RawFindReplace("[^d][^d][^d]", "234");
 
     assertEquals("ABcd", o.replaceAll(repls(ab, abc)).asString());
     assertEquals("234d", o.replaceAll(repls(abc, ab)).asString());
@@ -85,18 +85,28 @@
 
   public void testReplaceOverlappingAtFirstCharFirstMatch() {
     SafeHtml o = html("abcd");
-    RegexFindReplace ab1 = new RegexFindReplace("ab", "AB");
-    RegexFindReplace ab2 = new RegexFindReplace("[^cd][^cd]", "12");
+    RawFindReplace ab1 = new RawFindReplace("ab", "AB");
+    RawFindReplace ab2 = new RawFindReplace("[^cd][^cd]", "12");
 
     assertEquals("ABcd", o.replaceAll(repls(ab1, ab2)).asString());
     assertEquals("12cd", o.replaceAll(repls(ab2, ab1)).asString());
   }
 
+  public void testFailedSanitization() {
+    SafeHtml o = html("abcd");
+    LinkFindReplace evil = new LinkFindReplace("(b)", "javascript:alert('$1')");
+    LinkFindReplace ok = new LinkFindReplace("(b)", "/$1");
+    assertEquals("abcd", o.replaceAll(repls(evil)).asString());
+    String linked = "a<a href=\"/b\">b</a>cd";
+    assertEquals(linked, o.replaceAll(repls(ok)).asString());
+    assertEquals(linked, o.replaceAll(repls(evil, ok)).asString());
+  }
+
   private static SafeHtml html(String text) {
     return new SafeHtmlBuilder().append(text).toSafeHtml();
   }
 
-  private static List<RegexFindReplace> repls(RegexFindReplace... repls) {
+  private static List<FindReplace> repls(FindReplace... repls) {
     return Arrays.asList(repls);
   }
 }