src/prettify/parser/CombinePrefixPattern.java - java-prettify - Git at Google

 // Copyright (C) 2006 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 package prettify.parser;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
  * This is similar to the combinePrefixPattern.js in JavaScript Prettify.
  *
  * All comments are adapted from the JavaScript Prettify.
  *
  * @author mikesamuel@gmail.com
  */
 public class CombinePrefixPattern {

   protected int capturedGroupIndex = 0;
   protected boolean needToFoldCase = false;

   public CombinePrefixPattern() {
   }

   /**
    * Given a group of {@link java.util.regex.Pattern}s, returns a {@code RegExp} that globally
    * matches the union of the sets of strings matched by the input RegExp.
    * Since it matches globally, if the input strings have a start-of-input
    * anchor (/^.../), it is ignored for the purposes of unioning.
    * @param regexs non multiline, non-global regexs.
    * @return Pattern a global regex.
    */
   public Pattern combinePrefixPattern(List<Pattern> regexs) throws Exception {
     boolean ignoreCase = false;

     for (int i = 0, n = regexs.size(); i < n; ++i) {
       Pattern regex = regexs.get(i);
       if ((regex.flags() & Pattern.CASE_INSENSITIVE) != 0) {
         ignoreCase = true;
       } else if (Util.test(Pattern.compile("[a-z]", Pattern.CASE_INSENSITIVE), regex.pattern().replaceAll("\\\\[Uu][0-9A-Fa-f]{4}|\\\\[Xx][0-9A-Fa-f]{2}|\\\\[^UuXx]", ""))) {
         needToFoldCase = true;
         ignoreCase = false;
         break;
       }
     }

     List<String> rewritten = new ArrayList<String>();
     for (int i = 0, n = regexs.size(); i < n; ++i) {
       Pattern regex = regexs.get(i);
       if ((regex.flags() & Pattern.MULTILINE) != 0) {
         throw new Exception(regex.pattern());
       }
       rewritten.add("(?:" + allowAnywhereFoldCaseAndRenumberGroups(regex) + ")");
     }

     return ignoreCase ? Pattern.compile(Util.join(rewritten, "|"), Pattern.CASE_INSENSITIVE) : Pattern.compile(Util.join(rewritten, "|"));
   }
   protected static final Map<Character, Integer> escapeCharToCodeUnit = new HashMap<Character, Integer>();

   static {
     escapeCharToCodeUnit.put('b', 8);
     escapeCharToCodeUnit.put('t', 9);
     escapeCharToCodeUnit.put('n', 0xa);
     escapeCharToCodeUnit.put('v', 0xb);
     escapeCharToCodeUnit.put('f', 0xc);
     escapeCharToCodeUnit.put('r', 0xf);
   }

   protected static int decodeEscape(String charsetPart) {
     Integer cc0 = charsetPart.codePointAt(0);
     if (cc0 != 92 /* \\ */) {
       return cc0;
     }
     char c1 = charsetPart.charAt(1);
     cc0 = escapeCharToCodeUnit.get(c1);
     if (cc0 != null) {
       return cc0;
     } else if ('0' <= c1 && c1 <= '7') {
       return Integer.parseInt(charsetPart.substring(1), 8);
     } else if (c1 == 'u' || c1 == 'x') {
       return Integer.parseInt(charsetPart.substring(2), 16);
     } else {
       return charsetPart.codePointAt(1);
     }
   }

   protected static String encodeEscape(int charCode) {
     if (charCode < 0x20) {
       return (charCode < 0x10 ? "\\x0" : "\\x") + Integer.toString(charCode, 16);
     }

     String ch = new String(Character.toChars(charCode));
     return (charCode == '\\' || charCode == '-' || charCode == ']' || charCode == '^')
             ? "\\" + ch : ch;
   }

   protected static String caseFoldCharset(String charSet) {
     String[] charsetParts = Util.match(Pattern.compile("\\\\u[0-9A-Fa-f]{4}"
             + "|\\\\x[0-9A-Fa-f]{2}"
             + "|\\\\[0-3][0-7]{0,2}"
             + "|\\\\[0-7]{1,2}"
             + "|\\\\[\\s\\S]"
             + "|-"
             + "|[^-\\\\]"), charSet.substring(1, charSet.length() - 1), true);
     List<List<Integer>> ranges = new ArrayList<List<Integer>>();
     boolean inverse = charsetParts[0] != null && charsetParts[0].equals("^");

     List<String> out = new ArrayList<String>(Arrays.asList(new String[]{"["}));
     if (inverse) {
       out.add("^");
     }

     for (int i = inverse ? 1 : 0, n = charsetParts.length; i < n; ++i) {
       String p = charsetParts[i];
       if (Util.test(Pattern.compile("\\\\[bdsw]", Pattern.CASE_INSENSITIVE), p)) {  // Don't muck with named groups.
         out.add(p);
       } else {
         int start = decodeEscape(p);
         int end;
         if (i + 2 < n && "-".equals(charsetParts[i + 1])) {
           end = decodeEscape(charsetParts[i + 2]);
           i += 2;
         } else {
           end = start;
         }
         ranges.add(Arrays.asList(new Integer[]{start, end}));
         // If the range might intersect letters, then expand it.
         // This case handling is too simplistic.
         // It does not deal with non-latin case folding.
         // It works for latin source code identifiers though.
         if (!(end < 65 || start > 122)) {
           if (!(end < 65 || start > 90)) {
             ranges.add(Arrays.asList(new Integer[]{Math.max(65, start) | 32, Math.min(end, 90) | 32}));
           }
           if (!(end < 97 || start > 122)) {
             ranges.add(Arrays.asList(new Integer[]{Math.max(97, start) & ~32, Math.min(end, 122) & ~32}));
           }
         }
       }
     }

     // [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
     // -> [[1, 12], [14, 14], [16, 17]]
     Collections.sort(ranges, new Comparator<List<Integer>>() {

       @Override
       public int compare(List<Integer> a, List<Integer> b) {
         return a.get(0) != b.get(0) ? (a.get(0) - b.get(0)) : (b.get(1) - a.get(1));
       }
     });
     List<List<Integer>> consolidatedRanges = new ArrayList<List<Integer>>();
 //        List<Integer> lastRange = Arrays.asList(new Integer[]{0, 0});
     List<Integer> lastRange = new ArrayList<Integer>(Arrays.asList(new Integer[]{0, 0}));
     for (int i = 0; i < ranges.size(); ++i) {
       List<Integer> range = ranges.get(i);
       if (lastRange.get(1) != null && range.get(0) <= lastRange.get(1) + 1) {
         lastRange.set(1, Math.max(lastRange.get(1), range.get(1)));
       } else {
         // reference of lastRange is added
         consolidatedRanges.add(lastRange = range);
       }
     }

     for (int i = 0; i < consolidatedRanges.size(); ++i) {
       List<Integer> range = consolidatedRanges.get(i);
       out.add(encodeEscape(range.get(0)));
       if (range.get(1) > range.get(0)) {
         if (range.get(1) + 1 > range.get(0)) {
           out.add("-");
         }
         out.add(encodeEscape(range.get(1)));
       }
     }
     out.add("]");

     return Util.join(out);
   }

   protected String allowAnywhereFoldCaseAndRenumberGroups(Pattern regex) {
     // Split into character sets, escape sequences, punctuation strings
     // like ('(', '(?:', ')', '^'), and runs of characters that do not
     // include any of the above.
     String[] parts = Util.match(Pattern.compile("(?:"
             + "\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]" // a character set
             + "|\\\\u[A-Fa-f0-9]{4}" // a unicode escape
             + "|\\\\x[A-Fa-f0-9]{2}" // a hex escape
             + "|\\\\[0-9]+" // a back-reference or octal escape
             + "|\\\\[^ux0-9]" // other escape sequence
             + "|\\(\\?[:!=]" // start of a non-capturing group
             + "|[\\(\\)\\^]" // start/end of a group, or line start
             + "|[^\\x5B\\x5C\\(\\)\\^]+" // run of other characters
             + ")"), regex.pattern(), true);
     int n = parts.length;

     // Maps captured group numbers to the number they will occupy in
     // the output or to -1 if that has not been determined, or to
     // undefined if they need not be capturing in the output.
     Map<Integer, Integer> capturedGroups = new HashMap<Integer, Integer>();

     // Walk over and identify back references to build the capturedGroups
     // mapping.
     for (int i = 0, groupIndex = 0; i < n; ++i) {
       String p = parts[i];
       if (p.equals("(")) {
         // groups are 1-indexed, so max group index is count of '('
         ++groupIndex;
       } else if ('\\' == p.charAt(0)) {
         try {
           int decimalValue = Math.abs(Integer.parseInt(p.substring(1)));
           if (decimalValue <= groupIndex) {
             capturedGroups.put(decimalValue, -1);
           } else {
             // Replace with an unambiguous escape sequence so that
             // an octal escape sequence does not turn into a backreference
             // to a capturing group from an earlier regex.
             parts[i] = encodeEscape(decimalValue);
           }
         } catch (NumberFormatException ex) {
         }
       }
     }

     // Renumber groups and reduce capturing groups to non-capturing groups
     // where possible.
     for (int i : capturedGroups.keySet()) {
       if (-1 == capturedGroups.get(i)) {
         capturedGroups.put(i, ++capturedGroupIndex);
       }
     }
     for (int i = 0, groupIndex = 0; i < n; ++i) {
       String p = parts[i];
       if (p.equals("(")) {
         ++groupIndex;
         if (capturedGroups.get(groupIndex) == null) {
           parts[i] = "(?:";
         }
       } else if ('\\' == p.charAt(0)) {
         try {
           int decimalValue = Math.abs(Integer.parseInt(p.substring(1)));
           if (decimalValue <= groupIndex) {
             parts[i] = "\\" + capturedGroups.get(decimalValue);
           }
         } catch (NumberFormatException ex) {
         }
       }
     }

     // Remove any prefix anchors so that the output will match anywhere.
     // ^^ really does mean an anchored match though.
     for (int i = 0; i < n; ++i) {
       if ("^".equals(parts[i]) && !"^".equals(parts[i + 1])) {
         parts[i] = "";
       }
     }

     // Expand letters to groups to handle mixing of case-sensitive and
     // case-insensitive patterns if necessary.
     if ((regex.flags() & Pattern.CASE_INSENSITIVE) != 0 && needToFoldCase) {
       for (int i = 0; i < n; ++i) {
         String p = parts[i];
         char ch0 = p.length() > 0 ? p.charAt(0) : 0;
         if (p.length() >= 2 && ch0 == '[') {
           parts[i] = caseFoldCharset(p);
         } else if (ch0 != '\\') {
           // TODO: handle letters in numeric escapes.
           StringBuffer sb = new StringBuffer();
           Matcher _matcher = Pattern.compile("[a-zA-Z]").matcher(p);
           while (_matcher.find()) {
             int cc = _matcher.group(0).codePointAt(0);
             _matcher.appendReplacement(sb, "");
             sb.append("[").append(Character.toString((char) (cc & ~32))).append(Character.toString((char) (cc | 32))).append("]");
           }
           _matcher.appendTail(sb);
           parts[i] = sb.toString();
         }
       }
     }

     return Util.join(parts);
   }
 }
	// Copyright (C) 2006 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	package prettify.parser;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	/**
	* This is similar to the combinePrefixPattern.js in JavaScript Prettify.
	*
	* All comments are adapted from the JavaScript Prettify.
	*
	* @author mikesamuel@gmail.com
	*/
	public class CombinePrefixPattern {

	protected int capturedGroupIndex = 0;
	protected boolean needToFoldCase = false;

	public CombinePrefixPattern() {
	}

	/**
	* Given a group of {@link java.util.regex.Pattern}s, returns a {@code RegExp} that globally
	* matches the union of the sets of strings matched by the input RegExp.
	* Since it matches globally, if the input strings have a start-of-input
	* anchor (/^.../), it is ignored for the purposes of unioning.
	* @param regexs non multiline, non-global regexs.
	* @return Pattern a global regex.
	*/
	public Pattern combinePrefixPattern(List<Pattern> regexs) throws Exception {
	boolean ignoreCase = false;

	for (int i = 0, n = regexs.size(); i < n; ++i) {
	Pattern regex = regexs.get(i);
	if ((regex.flags() & Pattern.CASE_INSENSITIVE) != 0) {
	ignoreCase = true;
	} else if (Util.test(Pattern.compile("[a-z]", Pattern.CASE_INSENSITIVE), regex.pattern().replaceAll("\\\\[Uu][0-9A-Fa-f]{4}\|\\\\[Xx][0-9A-Fa-f]{2}\|\\\\[^UuXx]", ""))) {
	needToFoldCase = true;
	ignoreCase = false;
	break;
	}
	}

	List<String> rewritten = new ArrayList<String>();
	for (int i = 0, n = regexs.size(); i < n; ++i) {
	Pattern regex = regexs.get(i);
	if ((regex.flags() & Pattern.MULTILINE) != 0) {
	throw new Exception(regex.pattern());
	}
	rewritten.add("(?:" + allowAnywhereFoldCaseAndRenumberGroups(regex) + ")");
	}

	return ignoreCase ? Pattern.compile(Util.join(rewritten, "\|"), Pattern.CASE_INSENSITIVE) : Pattern.compile(Util.join(rewritten, "\|"));
	}
	protected static final Map<Character, Integer> escapeCharToCodeUnit = new HashMap<Character, Integer>();

	static {
	escapeCharToCodeUnit.put('b', 8);
	escapeCharToCodeUnit.put('t', 9);
	escapeCharToCodeUnit.put('n', 0xa);
	escapeCharToCodeUnit.put('v', 0xb);
	escapeCharToCodeUnit.put('f', 0xc);
	escapeCharToCodeUnit.put('r', 0xf);
	}

	protected static int decodeEscape(String charsetPart) {
	Integer cc0 = charsetPart.codePointAt(0);
	if (cc0 != 92 /* \\ */) {
	return cc0;
	}
	char c1 = charsetPart.charAt(1);
	cc0 = escapeCharToCodeUnit.get(c1);
	if (cc0 != null) {
	return cc0;
	} else if ('0' <= c1 && c1 <= '7') {
	return Integer.parseInt(charsetPart.substring(1), 8);
	} else if (c1 == 'u' \|\| c1 == 'x') {
	return Integer.parseInt(charsetPart.substring(2), 16);
	} else {
	return charsetPart.codePointAt(1);
	}
	}

	protected static String encodeEscape(int charCode) {
	if (charCode < 0x20) {
	return (charCode < 0x10 ? "\\x0" : "\\x") + Integer.toString(charCode, 16);
	}

	String ch = new String(Character.toChars(charCode));
	return (charCode == '\\' \|\| charCode == '-' \|\| charCode == ']' \|\| charCode == '^')
	? "\\" + ch : ch;
	}

	protected static String caseFoldCharset(String charSet) {
	String[] charsetParts = Util.match(Pattern.compile("\\\\u[0-9A-Fa-f]{4}"
	+ "\|\\\\x[0-9A-Fa-f]{2}"
	+ "\|\\\\[0-3][0-7]{0,2}"
	+ "\|\\\\[0-7]{1,2}"
	+ "\|\\\\[\\s\\S]"
	+ "\|-"
	+ "\|[^-\\\\]"), charSet.substring(1, charSet.length() - 1), true);
	List<List<Integer>> ranges = new ArrayList<List<Integer>>();
	boolean inverse = charsetParts[0] != null && charsetParts[0].equals("^");

	List<String> out = new ArrayList<String>(Arrays.asList(new String[]{"["}));
	if (inverse) {
	out.add("^");
	}

	for (int i = inverse ? 1 : 0, n = charsetParts.length; i < n; ++i) {
	String p = charsetParts[i];
	if (Util.test(Pattern.compile("\\\\[bdsw]", Pattern.CASE_INSENSITIVE), p)) { // Don't muck with named groups.
	out.add(p);
	} else {
	int start = decodeEscape(p);
	int end;
	if (i + 2 < n && "-".equals(charsetParts[i + 1])) {
	end = decodeEscape(charsetParts[i + 2]);
	i += 2;
	} else {
	end = start;
	}
	ranges.add(Arrays.asList(new Integer[]{start, end}));
	// If the range might intersect letters, then expand it.
	// This case handling is too simplistic.
	// It does not deal with non-latin case folding.
	// It works for latin source code identifiers though.
	if (!(end < 65 \|\| start > 122)) {
	if (!(end < 65 \|\| start > 90)) {
	ranges.add(Arrays.asList(new Integer[]{Math.max(65, start) \| 32, Math.min(end, 90) \| 32}));
	}
	if (!(end < 97 \|\| start > 122)) {
	ranges.add(Arrays.asList(new Integer[]{Math.max(97, start) & ~32, Math.min(end, 122) & ~32}));
	}
	}
	}
	}

	// [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
	// -> [[1, 12], [14, 14], [16, 17]]
	Collections.sort(ranges, new Comparator<List<Integer>>() {

	@Override
	public int compare(List<Integer> a, List<Integer> b) {
	return a.get(0) != b.get(0) ? (a.get(0) - b.get(0)) : (b.get(1) - a.get(1));
	}
	});
	List<List<Integer>> consolidatedRanges = new ArrayList<List<Integer>>();
	// List<Integer> lastRange = Arrays.asList(new Integer[]{0, 0});
	List<Integer> lastRange = new ArrayList<Integer>(Arrays.asList(new Integer[]{0, 0}));
	for (int i = 0; i < ranges.size(); ++i) {
	List<Integer> range = ranges.get(i);
	if (lastRange.get(1) != null && range.get(0) <= lastRange.get(1) + 1) {
	lastRange.set(1, Math.max(lastRange.get(1), range.get(1)));
	} else {
	// reference of lastRange is added
	consolidatedRanges.add(lastRange = range);
	}
	}

	for (int i = 0; i < consolidatedRanges.size(); ++i) {
	List<Integer> range = consolidatedRanges.get(i);
	out.add(encodeEscape(range.get(0)));
	if (range.get(1) > range.get(0)) {
	if (range.get(1) + 1 > range.get(0)) {
	out.add("-");
	}
	out.add(encodeEscape(range.get(1)));
	}
	}
	out.add("]");

	return Util.join(out);
	}

	protected String allowAnywhereFoldCaseAndRenumberGroups(Pattern regex) {
	// Split into character sets, escape sequences, punctuation strings
	// like ('(', '(?:', ')', '^'), and runs of characters that do not
	// include any of the above.
	String[] parts = Util.match(Pattern.compile("(?:"
	+ "\\[(?:[^\\x5C\\x5D]\|\\\\[\\s\\S])*\\]" // a character set
	+ "\|\\\\u[A-Fa-f0-9]{4}" // a unicode escape
	+ "\|\\\\x[A-Fa-f0-9]{2}" // a hex escape
	+ "\|\\\\[0-9]+" // a back-reference or octal escape
	+ "\|\\\\[^ux0-9]" // other escape sequence
	+ "\|\\(\\?[:!=]" // start of a non-capturing group
	+ "\|[\\(\\)\\^]" // start/end of a group, or line start
	+ "\|[^\\x5B\\x5C\\(\\)\\^]+" // run of other characters
	+ ")"), regex.pattern(), true);
	int n = parts.length;

	// Maps captured group numbers to the number they will occupy in
	// the output or to -1 if that has not been determined, or to
	// undefined if they need not be capturing in the output.
	Map<Integer, Integer> capturedGroups = new HashMap<Integer, Integer>();

	// Walk over and identify back references to build the capturedGroups
	// mapping.
	for (int i = 0, groupIndex = 0; i < n; ++i) {
	String p = parts[i];
	if (p.equals("(")) {
	// groups are 1-indexed, so max group index is count of '('
	++groupIndex;
	} else if ('\\' == p.charAt(0)) {
	try {
	int decimalValue = Math.abs(Integer.parseInt(p.substring(1)));
	if (decimalValue <= groupIndex) {
	capturedGroups.put(decimalValue, -1);
	} else {
	// Replace with an unambiguous escape sequence so that
	// an octal escape sequence does not turn into a backreference
	// to a capturing group from an earlier regex.
	parts[i] = encodeEscape(decimalValue);
	}
	} catch (NumberFormatException ex) {
	}
	}
	}

	// Renumber groups and reduce capturing groups to non-capturing groups
	// where possible.
	for (int i : capturedGroups.keySet()) {
	if (-1 == capturedGroups.get(i)) {
	capturedGroups.put(i, ++capturedGroupIndex);
	}
	}
	for (int i = 0, groupIndex = 0; i < n; ++i) {
	String p = parts[i];
	if (p.equals("(")) {
	++groupIndex;
	if (capturedGroups.get(groupIndex) == null) {
	parts[i] = "(?:";
	}
	} else if ('\\' == p.charAt(0)) {
	try {
	int decimalValue = Math.abs(Integer.parseInt(p.substring(1)));
	if (decimalValue <= groupIndex) {
	parts[i] = "\\" + capturedGroups.get(decimalValue);
	}
	} catch (NumberFormatException ex) {
	}
	}
	}

	// Remove any prefix anchors so that the output will match anywhere.
	// ^^ really does mean an anchored match though.
	for (int i = 0; i < n; ++i) {
	if ("^".equals(parts[i]) && !"^".equals(parts[i + 1])) {
	parts[i] = "";
	}
	}

	// Expand letters to groups to handle mixing of case-sensitive and
	// case-insensitive patterns if necessary.
	if ((regex.flags() & Pattern.CASE_INSENSITIVE) != 0 && needToFoldCase) {
	for (int i = 0; i < n; ++i) {
	String p = parts[i];
	char ch0 = p.length() > 0 ? p.charAt(0) : 0;
	if (p.length() >= 2 && ch0 == '[') {
	parts[i] = caseFoldCharset(p);
	} else if (ch0 != '\\') {
	// TODO: handle letters in numeric escapes.
	StringBuffer sb = new StringBuffer();
	Matcher _matcher = Pattern.compile("[a-zA-Z]").matcher(p);
	while (_matcher.find()) {
	int cc = _matcher.group(0).codePointAt(0);
	_matcher.appendReplacement(sb, "");
	sb.append("[").append(Character.toString((char) (cc & ~32))).append(Character.toString((char) (cc \| 32))).append("]");
	}
	_matcher.appendTail(sb);
	parts[i] = sb.toString();
	}
	}
	}

	return Util.join(parts);
	}
	}