blob: 9b4da884bbda706f5f9aa734e877ee2875b40395 [file] [log] [blame]
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package prettify.parser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This is similar to the combinePrefixPattern.js in JavaScript Prettify.
*
* All comments are adapted from the JavaScript Prettify.
*
* @author mikesamuel@gmail.com
*/
public class CombinePrefixPattern {
protected int capturedGroupIndex = 0;
protected boolean needToFoldCase = false;
public CombinePrefixPattern() {
}
/**
* Given a group of {@link java.util.regex.Pattern}s, returns a {@code RegExp} that globally
* matches the union of the sets of strings matched by the input RegExp.
* Since it matches globally, if the input strings have a start-of-input
* anchor (/^.../), it is ignored for the purposes of unioning.
* @param regexs non multiline, non-global regexs.
* @return Pattern a global regex.
*/
public Pattern combinePrefixPattern(List<Pattern> regexs) throws Exception {
boolean ignoreCase = false;
for (int i = 0, n = regexs.size(); i < n; ++i) {
Pattern regex = regexs.get(i);
if ((regex.flags() & Pattern.CASE_INSENSITIVE) != 0) {
ignoreCase = true;
} else if (Util.test(Pattern.compile("[a-z]", Pattern.CASE_INSENSITIVE), regex.pattern().replaceAll("\\\\[Uu][0-9A-Fa-f]{4}|\\\\[Xx][0-9A-Fa-f]{2}|\\\\[^UuXx]", ""))) {
needToFoldCase = true;
ignoreCase = false;
break;
}
}
List<String> rewritten = new ArrayList<String>();
for (int i = 0, n = regexs.size(); i < n; ++i) {
Pattern regex = regexs.get(i);
if ((regex.flags() & Pattern.MULTILINE) != 0) {
throw new Exception(regex.pattern());
}
rewritten.add("(?:" + allowAnywhereFoldCaseAndRenumberGroups(regex) + ")");
}
return ignoreCase ? Pattern.compile(Util.join(rewritten, "|"), Pattern.CASE_INSENSITIVE) : Pattern.compile(Util.join(rewritten, "|"));
}
protected static final Map<Character, Integer> escapeCharToCodeUnit = new HashMap<Character, Integer>();
static {
escapeCharToCodeUnit.put('b', 8);
escapeCharToCodeUnit.put('t', 9);
escapeCharToCodeUnit.put('n', 0xa);
escapeCharToCodeUnit.put('v', 0xb);
escapeCharToCodeUnit.put('f', 0xc);
escapeCharToCodeUnit.put('r', 0xf);
}
protected static int decodeEscape(String charsetPart) {
Integer cc0 = charsetPart.codePointAt(0);
if (cc0 != 92 /* \\ */) {
return cc0;
}
char c1 = charsetPart.charAt(1);
cc0 = escapeCharToCodeUnit.get(c1);
if (cc0 != null) {
return cc0;
} else if ('0' <= c1 && c1 <= '7') {
return Integer.parseInt(charsetPart.substring(1), 8);
} else if (c1 == 'u' || c1 == 'x') {
return Integer.parseInt(charsetPart.substring(2), 16);
} else {
return charsetPart.codePointAt(1);
}
}
protected static String encodeEscape(int charCode) {
if (charCode < 0x20) {
return (charCode < 0x10 ? "\\x0" : "\\x") + Integer.toString(charCode, 16);
}
String ch = new String(Character.toChars(charCode));
return (charCode == '\\' || charCode == '-' || charCode == ']' || charCode == '^')
? "\\" + ch : ch;
}
protected static String caseFoldCharset(String charSet) {
String[] charsetParts = Util.match(Pattern.compile("\\\\u[0-9A-Fa-f]{4}"
+ "|\\\\x[0-9A-Fa-f]{2}"
+ "|\\\\[0-3][0-7]{0,2}"
+ "|\\\\[0-7]{1,2}"
+ "|\\\\[\\s\\S]"
+ "|-"
+ "|[^-\\\\]"), charSet.substring(1, charSet.length() - 1), true);
List<List<Integer>> ranges = new ArrayList<List<Integer>>();
boolean inverse = charsetParts[0] != null && charsetParts[0].equals("^");
List<String> out = new ArrayList<String>(Arrays.asList(new String[]{"["}));
if (inverse) {
out.add("^");
}
for (int i = inverse ? 1 : 0, n = charsetParts.length; i < n; ++i) {
String p = charsetParts[i];
if (Util.test(Pattern.compile("\\\\[bdsw]", Pattern.CASE_INSENSITIVE), p)) { // Don't muck with named groups.
out.add(p);
} else {
int start = decodeEscape(p);
int end;
if (i + 2 < n && "-".equals(charsetParts[i + 1])) {
end = decodeEscape(charsetParts[i + 2]);
i += 2;
} else {
end = start;
}
ranges.add(Arrays.asList(new Integer[]{start, end}));
// If the range might intersect letters, then expand it.
// This case handling is too simplistic.
// It does not deal with non-latin case folding.
// It works for latin source code identifiers though.
if (!(end < 65 || start > 122)) {
if (!(end < 65 || start > 90)) {
ranges.add(Arrays.asList(new Integer[]{Math.max(65, start) | 32, Math.min(end, 90) | 32}));
}
if (!(end < 97 || start > 122)) {
ranges.add(Arrays.asList(new Integer[]{Math.max(97, start) & ~32, Math.min(end, 122) & ~32}));
}
}
}
}
// [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
// -> [[1, 12], [14, 14], [16, 17]]
Collections.sort(ranges, new Comparator<List<Integer>>() {
@Override
public int compare(List<Integer> a, List<Integer> b) {
return a.get(0) != b.get(0) ? (a.get(0) - b.get(0)) : (b.get(1) - a.get(1));
}
});
List<List<Integer>> consolidatedRanges = new ArrayList<List<Integer>>();
// List<Integer> lastRange = Arrays.asList(new Integer[]{0, 0});
List<Integer> lastRange = new ArrayList<Integer>(Arrays.asList(new Integer[]{0, 0}));
for (int i = 0; i < ranges.size(); ++i) {
List<Integer> range = ranges.get(i);
if (lastRange.get(1) != null && range.get(0) <= lastRange.get(1) + 1) {
lastRange.set(1, Math.max(lastRange.get(1), range.get(1)));
} else {
// reference of lastRange is added
consolidatedRanges.add(lastRange = range);
}
}
for (int i = 0; i < consolidatedRanges.size(); ++i) {
List<Integer> range = consolidatedRanges.get(i);
out.add(encodeEscape(range.get(0)));
if (range.get(1) > range.get(0)) {
if (range.get(1) + 1 > range.get(0)) {
out.add("-");
}
out.add(encodeEscape(range.get(1)));
}
}
out.add("]");
return Util.join(out);
}
protected String allowAnywhereFoldCaseAndRenumberGroups(Pattern regex) {
// Split into character sets, escape sequences, punctuation strings
// like ('(', '(?:', ')', '^'), and runs of characters that do not
// include any of the above.
String[] parts = Util.match(Pattern.compile("(?:"
+ "\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]" // a character set
+ "|\\\\u[A-Fa-f0-9]{4}" // a unicode escape
+ "|\\\\x[A-Fa-f0-9]{2}" // a hex escape
+ "|\\\\[0-9]+" // a back-reference or octal escape
+ "|\\\\[^ux0-9]" // other escape sequence
+ "|\\(\\?[:!=]" // start of a non-capturing group
+ "|[\\(\\)\\^]" // start/end of a group, or line start
+ "|[^\\x5B\\x5C\\(\\)\\^]+" // run of other characters
+ ")"), regex.pattern(), true);
int n = parts.length;
// Maps captured group numbers to the number they will occupy in
// the output or to -1 if that has not been determined, or to
// undefined if they need not be capturing in the output.
Map<Integer, Integer> capturedGroups = new HashMap<Integer, Integer>();
// Walk over and identify back references to build the capturedGroups
// mapping.
for (int i = 0, groupIndex = 0; i < n; ++i) {
String p = parts[i];
if (p.equals("(")) {
// groups are 1-indexed, so max group index is count of '('
++groupIndex;
} else if ('\\' == p.charAt(0)) {
try {
int decimalValue = Math.abs(Integer.parseInt(p.substring(1)));
if (decimalValue <= groupIndex) {
capturedGroups.put(decimalValue, -1);
} else {
// Replace with an unambiguous escape sequence so that
// an octal escape sequence does not turn into a backreference
// to a capturing group from an earlier regex.
parts[i] = encodeEscape(decimalValue);
}
} catch (NumberFormatException ex) {
}
}
}
// Renumber groups and reduce capturing groups to non-capturing groups
// where possible.
for (int i : capturedGroups.keySet()) {
if (-1 == capturedGroups.get(i)) {
capturedGroups.put(i, ++capturedGroupIndex);
}
}
for (int i = 0, groupIndex = 0; i < n; ++i) {
String p = parts[i];
if (p.equals("(")) {
++groupIndex;
if (capturedGroups.get(groupIndex) == null) {
parts[i] = "(?:";
}
} else if ('\\' == p.charAt(0)) {
try {
int decimalValue = Math.abs(Integer.parseInt(p.substring(1)));
if (decimalValue <= groupIndex) {
parts[i] = "\\" + capturedGroups.get(decimalValue);
}
} catch (NumberFormatException ex) {
}
}
}
// Remove any prefix anchors so that the output will match anywhere.
// ^^ really does mean an anchored match though.
for (int i = 0; i < n; ++i) {
if ("^".equals(parts[i]) && !"^".equals(parts[i + 1])) {
parts[i] = "";
}
}
// Expand letters to groups to handle mixing of case-sensitive and
// case-insensitive patterns if necessary.
if ((regex.flags() & Pattern.CASE_INSENSITIVE) != 0 && needToFoldCase) {
for (int i = 0; i < n; ++i) {
String p = parts[i];
char ch0 = p.length() > 0 ? p.charAt(0) : 0;
if (p.length() >= 2 && ch0 == '[') {
parts[i] = caseFoldCharset(p);
} else if (ch0 != '\\') {
// TODO: handle letters in numeric escapes.
StringBuffer sb = new StringBuffer();
Matcher _matcher = Pattern.compile("[a-zA-Z]").matcher(p);
while (_matcher.find()) {
int cc = _matcher.group(0).codePointAt(0);
_matcher.appendReplacement(sb, "");
sb.append("[").append(Character.toString((char) (cc & ~32))).append(Character.toString((char) (cc | 32))).append("]");
}
_matcher.appendTail(sb);
parts[i] = sb.toString();
}
}
}
return Util.join(parts);
}
}