blob: 0e457741aaf75a73a58c4b1e002ba66ec82a9975 [file] [log] [blame]
// Copyright (C) 2006 Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
package prettify.parser;
import prettify.lang.Lang;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import prettify.lang.LangAppollo;
import prettify.lang.LangClj;
import prettify.lang.LangCss;
import prettify.lang.LangDart;
import prettify.lang.LangGo;
import prettify.lang.LangHs;
import prettify.lang.LangLisp;
import prettify.lang.LangLua;
import prettify.lang.LangMl;
import prettify.lang.LangN;
import prettify.lang.LangR;
import prettify.lang.LangRd;
import prettify.lang.LangScala;
import prettify.lang.LangSql;
import prettify.lang.LangTcl;
import prettify.lang.LangTex;
import prettify.lang.LangVb;
import prettify.lang.LangVhdl;
import prettify.lang.LangWiki;
import prettify.lang.LangXq;
import prettify.lang.LangYaml;
/**
 * This is similar to the prettify.js in JavaScript Prettify.
 * All comments are adapted from the JavaScript Prettify.
 * <p>
 * Some functions for browser-side pretty printing of code contained in html.
 * </p>
 * <p>
 * For a fairly comprehensive set of languages see the README
 * file that came with this source. At a minimum, the lexer should work on a
 * number of languages including C and friends, Java, Python, Bash, SQL, HTML,
 * XML, CSS, Javascript, and Makefiles. It works passably on Ruby, PHP and Awk
 * and a subset of Perl, but, because of commenting conventions, doesn't work on
 * Smalltalk, Lisp-like, or CAML-like languages without an explicit lang class.
 * <p>
 * Usage: <ol>
 * <li> include this source file in an html page via
 * {@code <script type="text/javascript" src="/path/to/prettify.js"></script>}
 * <li> define style rules. See the example page for examples.
 * <li> mark the {@code <pre>} and {@code <code>} tags in your source with
 * {@code class=prettyprint.}
 * You can also use the (html deprecated) {@code <xmp>} tag, but the pretty
 * printer needs to do more substantial DOM manipulations to support that, so
 * some css styles may not be preserved.
 * </ol>
 * That's it. I wanted to keep the API as simple as possible, so there's no
 * need to specify which language the code is in, but if you wish, you can add
 * another class to the {@code <pre>} or {@code <code>} element to specify the
 * language, as in {@code <pre class="prettyprint lang-java">}. Any class that
 * starts with "lang-" followed by a file extension, specifies the file type.
 * See the "lang-*.js" files in this directory for code that implements
 * per-language file handlers.
 * <p>
 * Change log:<br>
 * cbeust, 2006/08/22
 * <blockquote>
 * Java annotations (start with "@") are now captured as literals ("lit")
 * </blockquote>
 */
public class Prettify {
// Logger used by the catch blocks in this class to report handler registration/instantiation failures.
private static final Logger LOG = Logger.getLogger(Prettify.class.getName());
// Keyword lists for various languages.
// Each list is a comma-separated string; language lists are built by
// prepending a shared base list and a "," separator.
public static final String FLOW_CONTROL_KEYWORDS = "break,continue,do,else,for,if,return,while";
public static final String C_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "auto,case,char,const,default,"
+ "double,enum,extern,float,goto,int,long,register,short,signed,sizeof,"
+ "static,struct,switch,typedef,union,unsigned,void,volatile";
public static final String COMMON_KEYWORDS = C_KEYWORDS + "," + "catch,class,delete,false,import,"
+ "new,operator,private,protected,public,this,throw,true,try,typeof";
public static final String CPP_KEYWORDS = COMMON_KEYWORDS + "," + "alignof,align_union,asm,axiom,bool,"
+ "concept,concept_map,const_cast,constexpr,decltype,"
+ "dynamic_cast,explicit,export,friend,inline,late_check,"
+ "mutable,namespace,nullptr,reinterpret_cast,static_assert,static_cast,"
+ "template,typeid,typename,using,virtual,where";
public static final String JAVA_KEYWORDS = COMMON_KEYWORDS + ","
+ "abstract,boolean,byte,extends,final,finally,implements,import,"
+ "instanceof,null,native,package,strictfp,super,synchronized,throws,"
+ "transient";
public static final String CSHARP_KEYWORDS = JAVA_KEYWORDS + ","
+ "as,base,by,checked,decimal,delegate,descending,dynamic,event,"
+ "fixed,foreach,from,group,implicit,in,interface,internal,into,is,let,"
+ "lock,object,out,override,orderby,params,partial,readonly,ref,sbyte,"
+ "sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,"
+ "var,virtual,where";
// Stand-alone list: not derived from the C-family base lists.
public static final String COFFEE_KEYWORDS = "all,and,by,catch,class,else,extends,false,finally,"
+ "for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,"
+ "throw,true,try,unless,until,when,while,yes";
public static final String JSCRIPT_KEYWORDS = COMMON_KEYWORDS + ","
+ "debugger,eval,export,function,get,null,set,undefined,var,with,"
+ "Infinity,NaN";
// Stand-alone list: not derived from the C-family base lists.
public static final String PERL_KEYWORDS = "caller,delete,die,do,dump,elsif,eval,exit,foreach,for,"
+ "goto,if,import,last,local,my,next,no,our,print,package,redo,require,"
+ "sub,undef,unless,until,use,wantarray,while,BEGIN,END";
public static final String PYTHON_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "and,as,assert,class,def,del,"
+ "elif,except,exec,finally,from,global,import,in,is,lambda,"
+ "nonlocal,not,or,pass,print,raise,try,with,yield,"
+ "False,True,None";
public static final String RUBY_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "alias,and,begin,case,class,"
+ "def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,"
+ "rescue,retry,self,super,then,true,undef,unless,until,when,yield,"
// NOTE(review): the RUBY_KEYWORDS expression above ends on a trailing comma
// inside the literal and has no terminating ';' -- its last continuation
// line appears to be missing here; restore it from the upstream source.
public static final String SH_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "case,done,elif,esac,eval,fi,"
+ "function,in,local,set,then,until";
public static final String ALL_KEYWORDS = CPP_KEYWORDS + "," + CSHARP_KEYWORDS + "," + JSCRIPT_KEYWORDS + "," + PERL_KEYWORDS + ","
// NOTE(review): the ALL_KEYWORDS expression above ends with a dangling
// '+ ","' and no ';' -- the remaining operands are missing here; restore
// them from the upstream source.
// Matches common C/C++ library type names at the start of the input; passed
// as the "types" option for the C-family and C# handlers.
public static final Pattern C_TYPES = Pattern.compile("^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\\d*)\\b");
// token style names. correspond to css classes
/** token style for a string literal */
public static final String PR_STRING = "str";
/** token style for a keyword */
public static final String PR_KEYWORD = "kwd";
/** token style for a comment */
public static final String PR_COMMENT = "com";
/** token style for a type */
public static final String PR_TYPE = "typ";
/** token style for a literal value. e.g. 1, null, true. */
public static final String PR_LITERAL = "lit";
/** token style for a punctuation string. */
public static final String PR_PUNCTUATION = "pun";
/** token style for a plain text. */
public static final String PR_PLAIN = "pln";
/** token style for an sgml tag. */
public static final String PR_TAG = "tag";
/** token style for a markup declaration such as a DOCTYPE. */
public static final String PR_DECLARATION = "dec";
/** token style for embedded source. */
public static final String PR_SOURCE = "src";
/** token style for an sgml attribute name. */
public static final String PR_ATTRIB_NAME = "atn";
/** token style for an sgml attribute value. */
public static final String PR_ATTRIB_VALUE = "atv";
/**
 * A class that indicates a section of markup that is not code, e.g. to allow
 * embedding of line numbers within code listings.
 */
public static final String PR_NOCODE = "nocode";
/**
 * A set of tokens that can precede a regular expression literal in
 * javascript.
 * The reference list this comment was adapted from (its link is not
 * preserved here) has the full list, but I've removed ones that might be
 * problematic when seen in languages that don't support regular expression
 * literals.
 * <p>Specifically, I've removed any keywords that can't precede a regexp
 * literal in a syntactically legal javascript program, and I've removed the
 * "in" keyword since it's not a keyword in many languages, and might be used
 * as a count of inches.
 * <p>The referenced list does not accurately describe EcmaScript rules since
 * it fails to distinguish between (a=++/b/i) and (a++/b/i) but it works
 * very well in practice.
 */
// The value is a non-capturing alternation followed by optional whitespace;
// sourceDecorator prepends it to the regex-literal pattern.
private static final String REGEXP_PRECEDER_PATTERN = "(?:^^\\.?|[+-]|[!=]=?=?|\\#|%=?|&&?=?|\\(|\\*=?|[+\\-]=|->|\\/=?|::?|<<?=?|>>?>?=?|,|;|\\?|@|\\[|~|\\{|\\^\\^?=?|\\|\\|?=?|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*";
// CAVEAT: this does not properly handle the case where a regular
// expression immediately follows another since a regular expression may
// have flags for case-sensitivity and the like. Having regexp tokens
// adjacent is not valid in any language I'm aware of, so I'm punting.
// TODO: maybe style special characters inside a regexp as punctuation.
/**
 * Creates a parser and registers the built-in language handlers: the default
 * code and markup handlers, the helper handlers used inside markup
 * ("in.tag", "uq.val", "regex"), and the C-family, scripting and Protocol
 * Buffers handlers configured below.
 * <p>
 * Any exception raised while building or registering a handler is logged at
 * SEVERE level and not rethrown, so handlers that come after the failing one
 * are not registered.
 */
public Prettify() {
    try {
        Map<String, Object> decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", ALL_KEYWORDS);
        decorateSourceMap.put("hashComments", true);
        decorateSourceMap.put("cStyleComments", true);
        decorateSourceMap.put("multiLineStrings", true);
        decorateSourceMap.put("regexLiterals", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("default-code"));

        // Markup: splits a document into text, declarations, comments,
        // embedded-language sections and tags.
        List<List<Object>> shortcutStylePatterns, fallthroughStylePatterns;
        shortcutStylePatterns = new ArrayList<List<Object>>();
        fallthroughStylePatterns = new ArrayList<List<Object>>();
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN, Pattern.compile("^[^<?]+")}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_DECLARATION, Pattern.compile("^<!\\w[^>]*(?:>|$)")}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT, Pattern.compile("^<\\!--[\\s\\S]*?(?:-\\->|$)")}));
        // Unescaped content in an unknown language
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^<\\?([\\s\\S]+?)(?:\\?>|$)")}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^<%([\\s\\S]+?)(?:%>|$)")}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION, Pattern.compile("^(?:<[%?]|[%?]>)")}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^<xmp\\b[^>]*>([\\s\\S]+?)<\\/xmp\\b[^>]*>", Pattern.CASE_INSENSITIVE)}));
        // Unescaped content in javascript. (Or possibly vbscript).
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^<script\\b[^>]*>([\\s\\S]*?)(<\\/script\\b[^>]*>)", Pattern.CASE_INSENSITIVE)}));
        // Contains unescaped stylesheet content
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-css", Pattern.compile("^<style\\b[^>]*>([\\s\\S]*?)(<\\/style\\b[^>]*>)", Pattern.CASE_INSENSITIVE)}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-in.tag", Pattern.compile("^(<\\/?[a-z][^<>]*>)", Pattern.CASE_INSENSITIVE)}));
        registerLangHandler(new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns), Arrays.asList("default-markup", "htm", "html", "mxml", "xhtml", "xml", "xsl"));

        // Inside a tag: attribute names/values plus inline script and style.
        shortcutStylePatterns = new ArrayList<List<Object>>();
        fallthroughStylePatterns = new ArrayList<List<Object>>();
        shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN, Pattern.compile("^[\\s]+"), null, " \t\r\n"}));
        shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_ATTRIB_VALUE, Pattern.compile("^(?:\\\"[^\\\"]*\\\"?|\\'[^\\']*\\'?)"), null, "\"'"}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TAG, Pattern.compile("^^<\\/?[a-z](?:[\\w.:-]*\\w)?|\\/?>$", Pattern.CASE_INSENSITIVE)}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_ATTRIB_NAME, Pattern.compile("^(?!style[\\s=]|on)[a-z](?:[\\w:-]*\\w)?", Pattern.CASE_INSENSITIVE)}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-uq.val", Pattern.compile("^=\\s*([^>\\'\\\"\\s]*(?:[^>\\'\\\"\\s\\/]|\\/(?=\\s)))", Pattern.CASE_INSENSITIVE)}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION, Pattern.compile("^[=<>\\/]+")}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^on\\w+\\s*=\\s*\\\"([^\\\"]+)\\\"", Pattern.CASE_INSENSITIVE)}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^on\\w+\\s*=\\s*\\'([^\\']+)\\'", Pattern.CASE_INSENSITIVE)}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^on\\w+\\s*=\\s*([^\\\"\\'>\\s]+)", Pattern.CASE_INSENSITIVE)}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-css", Pattern.compile("^style\\s*=\\s*\\\"([^\\\"]+)\\\"", Pattern.CASE_INSENSITIVE)}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-css", Pattern.compile("^style\\s*=\\s*\\'([^\\']+)\\'", Pattern.CASE_INSENSITIVE)}));
        // Unquoted style value. The whitespace after '=' is optional ("\\s*"),
        // mirroring the unquoted on* rule above; the previous "\\s\\*"
        // demanded one whitespace character followed by a literal '*'.
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-css", Pattern.compile("^style\\s*=\\s*([^\\\"\\'>\\s]+)", Pattern.CASE_INSENSITIVE)}));
        registerLangHandler(new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns), Arrays.asList("in.tag"));

        // Unquoted attribute value: the whole text is one attribute value.
        shortcutStylePatterns = new ArrayList<List<Object>>();
        fallthroughStylePatterns = new ArrayList<List<Object>>();
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_ATTRIB_VALUE, Pattern.compile("^[\\s\\S]+")}));
        registerLangHandler(new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns), Arrays.asList("uq.val"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", CPP_KEYWORDS);
        decorateSourceMap.put("hashComments", true);
        decorateSourceMap.put("cStyleComments", true);
        decorateSourceMap.put("types", C_TYPES);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("c", "cc", "cpp", "cxx", "cyc", "m"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", "null,true,false");
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("json"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", CSHARP_KEYWORDS);
        decorateSourceMap.put("hashComments", true);
        decorateSourceMap.put("cStyleComments", true);
        decorateSourceMap.put("verbatimStrings", true);
        decorateSourceMap.put("types", C_TYPES);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("cs"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", JAVA_KEYWORDS);
        decorateSourceMap.put("cStyleComments", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("java"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", SH_KEYWORDS);
        decorateSourceMap.put("hashComments", true);
        decorateSourceMap.put("multiLineStrings", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("bsh", "csh", "sh"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", PYTHON_KEYWORDS);
        decorateSourceMap.put("hashComments", true);
        decorateSourceMap.put("multiLineStrings", true);
        decorateSourceMap.put("tripleQuotedStrings", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("cv", "py"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", PERL_KEYWORDS);
        decorateSourceMap.put("hashComments", true);
        decorateSourceMap.put("multiLineStrings", true);
        decorateSourceMap.put("regexLiterals", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("perl", "pl", "pm"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", RUBY_KEYWORDS);
        decorateSourceMap.put("hashComments", true);
        decorateSourceMap.put("multiLineStrings", true);
        decorateSourceMap.put("regexLiterals", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("rb"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", JSCRIPT_KEYWORDS);
        decorateSourceMap.put("cStyleComments", true);
        decorateSourceMap.put("regexLiterals", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("js"));

        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", COFFEE_KEYWORDS);
        decorateSourceMap.put("hashComments", 3); // ### style block comments
        decorateSourceMap.put("cStyleComments", true);
        // Key spelled as sourceDecorator reads it ("multiLineStrings"); the
        // previous "multilineStrings" was never looked up.
        decorateSourceMap.put("multiLineStrings", true);
        decorateSourceMap.put("tripleQuotedStrings", true);
        decorateSourceMap.put("regexLiterals", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("coffee"));

        // Regular-expression literal: the whole text is styled as a string.
        shortcutStylePatterns = new ArrayList<List<Object>>();
        fallthroughStylePatterns = new ArrayList<List<Object>>();
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING, Pattern.compile("^[\\s\\S]+")}));
        registerLangHandler(new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns), Arrays.asList("regex"));

        // Language handler for Protocol Buffers.
        decorateSourceMap = new HashMap<String, Object>();
        decorateSourceMap.put("keywords", "bytes,default,double,enum,extend,extensions,false,"
                + "group,import,max,message,option,"
                + "optional,package,repeated,required,returns,rpc,service,"
                + "syntax,to,true");
        decorateSourceMap.put("types", Pattern.compile("^(bool|(double|s?fixed|[su]?int)(32|64)|float|string)\\b"));
        decorateSourceMap.put("cStyleComments", true);
        registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList("proto"));
    } catch (Exception ex) {
        LOG.log(Level.SEVERE, null, ex);
    }
}
/**
 * Apply the given language handler to sourceCode and add the resulting
 * decorations to out.
 * @param basePos the index of sourceCode within the chunk of source
 * whose decorations are already present on out.
 */
protected static void appendDecorations(int basePos, String sourceCode, CreateSimpleLexer langHandler, List<Object> out) {
// Reject a null source up front with a message naming the argument.
if (sourceCode == null) {
throw new NullPointerException("argument 'sourceCode' cannot be null");
// NOTE(review): the closing brace of the guard above and everything after
// the Job allocation below are not present in this view. Presumably the
// job is populated, handed to langHandler and its decorations copied into
// out, as the Javadoc describes -- confirm against the upstream source.
Job job = new Job();
public class CreateSimpleLexer {
// Patterns tried one after another in decorate() when no shortcut entry exists for a token's first character.
protected List<List<Object>> fallthroughStylePatterns;
// Maps a character to the pattern entry whose shortcut string contains it; looked up by a token's first character.
protected Map<Character, List<Object>> shortcuts = new HashMap<Character, List<Object>>();
// Combined pattern produced by CombinePrefixPattern; decorate() uses it to split the source into tokens.
protected Pattern tokenizer;
// Size of fallthroughStylePatterns, captured in the constructor and used as the loop bound in decorate().
protected int nPatterns;
/** Given triples of [style, pattern, context] returns a lexing function,
* The lexing function interprets the patterns to find token boundaries and
* returns a decoration list of the form
* [index_0, style_0, index_1, style_1, ..., index_n, style_n]
* where index_n is an index into the sourceCode, and style_n is a style
* constant like PR_PLAIN. index_n-1 <= index_n, and style_n-1 applies to
* all characters in sourceCode[index_n-1:index_n].
* The stylePatterns is a list whose elements have the form
* [style : string, pattern : RegExp, DEPRECATED, shortcut : string].
* Style is a style constant like PR_PLAIN, or can be a string of the
* form 'lang-FOO', where FOO is a language extension describing the
* language of the portion of the token in $1 after pattern executes.
* E.g., if style is 'lang-lisp', and group 1 contains the text
* '(hello (world))', then that portion of the token will be passed to the
* registered lisp handler for formatting.
* The text before and after group 1 will be restyled using this decorator
* so decorators should take care that this doesn't result in infinite
* recursion. For example, the HTML lexer rule for SCRIPT elements looks
* something like ['lang-js', /<[s]cript>(.+?)<\/script>/]. This may match
* '<script>foo()<\/script>', which would cause the current decorator to
* be called with '<script>' which would not match the same rule since
* group 1 must not be empty, so it would be instead styled as PR_TAG by
* the generic tag rule. The handler registered for the 'js' extension would
* then be called with 'foo()', and finally, the current decorator would
* be called with '<\/script>' which would not match the original rule and
* so the generic tag rule would identify it as a tag.
* Pattern must only match prefixes, and if it matches a prefix, then that
* match is considered a token with the same style.
* Context is applied to the last non-whitespace, non-comment token
* recognized.
* Shortcut is an optional string of characters, any of which, if the first
* character, gurantee that this pattern and only this pattern matches.
* @param shortcutStylePatterns patterns that always start with
* a known character. Must have a shortcut string.
* @param fallthroughStylePatterns patterns that will be tried in
* order if the shortcut ones fail. May have shortcuts.
protected CreateSimpleLexer(List<List<Object>> shortcutStylePatterns, List<List<Object>> fallthroughStylePatterns) throws Exception {
this.fallthroughStylePatterns = fallthroughStylePatterns;
List<List<Object>> allPatterns = new ArrayList<List<Object>>(shortcutStylePatterns);
List<Pattern> allRegexs = new ArrayList<Pattern>();
Map<String, Object> regexKeys = new HashMap<String, Object>();
for (int i = 0, n = allPatterns.size(); i < n; ++i) {
List<Object> patternParts = allPatterns.get(i);
String shortcutChars = patternParts.size() > 3 ? (String) patternParts.get(3) : null;
if (shortcutChars != null) {
for (int c = shortcutChars.length(); --c >= 0;) {
shortcuts.put(shortcutChars.charAt(c), patternParts);
Pattern regex = (Pattern) patternParts.get(1);
String k = regex.pattern();
if (regexKeys.get(k) == null) {
regexKeys.put(k, new Object());
tokenizer = new CombinePrefixPattern().combinePrefixPattern(allRegexs);
nPatterns = fallthroughStylePatterns.size();
* Lexes job.sourceCode and produces an output array job.decorations of
* style classes preceded by the position at which they start in
* job.sourceCode in order.
* @param job an object like <pre>{
* sourceCode: {string} sourceText plain text,
* basePos: {int} position of job.sourceCode in the larger chunk of
* sourceCode.
* }</pre>
public void decorate(Job job) {
String sourceCode = job.getSourceCode();
int basePos = job.getBasePos();
/** Even entries are positions in source in ascending order. Odd enties
* are style markers (e.g., PR_COMMENT) that run from that position until
* the end.
* @type {Array.<number|string>}
List<Object> decorations = new ArrayList<Object>(Arrays.asList(new Object[]{basePos, PR_PLAIN}));
int pos = 0; // index into sourceCode
String[] tokens = Util.match(tokenizer, sourceCode, true);
Map<String, String> styleCache = new HashMap<String, String>();
for (int ti = 0, nTokens = tokens.length; ti < nTokens; ++ti) {
String token = tokens[ti];
String style = styleCache.get(token);
String[] match = null;
boolean isEmbedded;
if (style != null) {
isEmbedded = false;
} else {
List<Object> patternParts = shortcuts.get(token.charAt(0));
if (patternParts != null) {
match = Util.match((Pattern) patternParts.get(1), token, false);
style = (String) patternParts.get(0);
} else {
for (int i = 0; i < nPatterns; ++i) {
patternParts = fallthroughStylePatterns.get(i);
match = Util.match((Pattern) patternParts.get(1), token, false);
if (match.length != 0) {
style = (String) patternParts.get(0);
if (match.length == 0) { // make sure that we make progress
style = PR_PLAIN;
isEmbedded = style != null && style.length() >= 5 && style.startsWith("lang-");
if (isEmbedded && !(match.length > 1 && match[1] != null)) {
isEmbedded = false;
style = PR_SOURCE;
if (!isEmbedded) {
styleCache.put(token, style);
int tokenStart = pos;
pos += token.length();
if (!isEmbedded) {
decorations.add(basePos + tokenStart);
} else { // Treat group 1 as an embedded block of source code.
String embeddedSource = match[1];
int embeddedSourceStart = token.indexOf(embeddedSource);
int embeddedSourceEnd = embeddedSourceStart + embeddedSource.length();
if (match.length > 2 && match[2] != null) {
// If embeddedSource can be blank, then it would match at the
// beginning which would cause us to infinitely recurse on the
// entire token, so we catch the right context in match[2].
embeddedSourceEnd = token.length() - match[2].length();
embeddedSourceStart = embeddedSourceEnd - embeddedSource.length();
String lang = style.substring(5);
// Decorate the left of the embedded source
appendDecorations(basePos + tokenStart,
token.substring(0, embeddedSourceStart),
this, decorations);
// Decorate the embedded source
appendDecorations(basePos + tokenStart + embeddedSourceStart,
langHandlerForExtension(lang, embeddedSource),
// Decorate the right of the embedded section
appendDecorations(basePos + tokenStart + embeddedSourceEnd,
this, decorations);
job.setDecorations(Util.removeDuplicates(decorations, job.getSourceCode()));
/** returns a function that produces a list of decorations from source text.
* This code treats ", ', and ` as string delimiters, and \ as a string
* escape. It does not recognize perl's qq() style strings.
* It has no special handling for double delimiter escapes as in basic, or
* the tripled delimiters used in python, but should work on those regardless
* although in those cases a single string literal may be broken up into
* multiple adjacent string literals.
* It recognizes C, C++, and shell style comments.
* @param options a set of optional parameters.
* @return a function that examines the source code
* in the input job and builds the decoration list.
*/
// NOTE(review): many statements in this method are cut off mid-expression in
// this view (for example every "...add(Arrays.asList(new Object[]{PR_STRING,"
// lacks its pattern argument and closing brackets, and closing braces are
// absent). The code is left exactly as found; restore the missing parts from
// the upstream source before compiling.
// Options read below: "tripleQuotedStrings", "multiLineStrings",
// "verbatimStrings", "hashComments", "cStyleComments", "regexLiterals",
// "types" (a Pattern) and "keywords" (a String).
protected CreateSimpleLexer sourceDecorator(Map<String, Object> options) throws Exception {
List<List<Object>> shortcutStylePatterns = new ArrayList<List<Object>>();
List<List<Object>> fallthroughStylePatterns = new ArrayList<List<Object>>();
// String literals: the three options are mutually exclusive and checked in
// this order, so "tripleQuotedStrings" takes precedence over "multiLineStrings".
if (options.get("tripleQuotedStrings") != null) {
// '''multi-line-string''', 'single-line-string', and double-quoted
shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
} else if (options.get("multiLineStrings") != null) {
// 'multi-line-string', "multi-line-string"
shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
} else {
// 'single-line-string', "single-line-string"
shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
if (options.get("verbatimStrings") != null) {
// verbatim-string-literal production from the C# grammar. See issue 93.
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
// "hashComments" may be any non-null value; an Integer greater than 1
// selects the multi-line hash comment rule when C-style comments are on too.
Object hc = options.get("hashComments");
if (hc != null) {
if (options.get("cStyleComments") != null) {
if ((hc instanceof Integer) && (Integer) hc > 1) { // multiline hash comments
shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
} else {
// Stop C preprocessor declarations at an unclosed open comment
shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
// #include <stdio.h>
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
} else {
shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
if (options.get("cStyleComments") != null) {
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
if (options.get("regexLiterals") != null) {
/** @const */
// A regular expression literal starts with a slash that is
// not followed by * or / so that it is not confused with
// comments.
// and then contains any number of raw characters,
+ "(?:[^/\\x5B\\x5C]"
// escape sequences (\x5C),
+ "|\\x5C[\\s\\S]"
// or non-nesting character sets (\x5B\x5D);
+ "|\\x5B(?:[^\\x5C\\x5D]|\\x5C[\\s\\S])*(?:\\x5D|$))+"
// finally closed by a /.
+ "/";
// The literal itself is captured as group 1 so it is handed to the handler
// named after the "lang-" prefix ("regex").
fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-regex",
Pattern.compile("^" + REGEXP_PRECEDER_PATTERN + "(" + REGEX_LITERAL + ")")}));
Pattern types = (Pattern) options.get("types");
if (types != null) {
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TYPE, types}));
if (options.get("keywords") != null) {
// Strip one leading/trailing space, then turn the comma/whitespace
// separated list into a regex alternation anchored at the token start.
String keywords = ((String) options.get("keywords")).replaceAll("^ | $", "");
if (keywords.length() != 0) {
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_KEYWORD,
Pattern.compile("^(?:" + keywords.replaceAll("[\\s,]+", "|") + ")\\b"),
shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
" \r\n\t" + Character.toString((char) 0xA0)
// TODO(mikesamuel): recognize non-latin letters and numerals in idents
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_LITERAL,
Pattern.compile("^@[a-z_$][a-z_$@0-9]*", Pattern.CASE_INSENSITIVE),
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TYPE,
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
Pattern.compile("^[a-z_$][a-z_$@0-9]*", Pattern.CASE_INSENSITIVE),
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_LITERAL,
// A hex number
+ "0x[a-f0-9]+"
// or an octal or decimal number,
// NOTE(review): "\\.\\d\\+" below matches '.', one digit and a literal '+';
// "\\.\\d+" (one or more digits) looks intended for ".5"-style numbers -- confirm.
+ "|(?:\\d(?:_\\d+)*\\d*(?:\\.\\d*)?|\\.\\d\\+)"
// possibly in scientific notation
+ "(?:e[+\\-]?\\d+)?"
+ ')'
// with an optional modifier like UL for unsigned long
+ "[a-z]*", Pattern.CASE_INSENSITIVE),
// Don't treat escaped quotes in bash as starting strings. See issue 144.
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
// The Bash man page says
// A word is a sequence of characters considered as a single
// unit by the shell. Words are separated by metacharacters,
// which are the following plus space, tab, and newline: { }
// | & $ ; < >
// ...
// A word beginning with # causes that word and all remaining
// characters on that line to be ignored.
// which means that only a '#' after /(?:^|[{}|&$;<>\s])/ starts a
// comment but empirically
// $ echo {#}
// {#}
// $ echo \$#
// $#
// $ echo }#
// }#
// so /(?:^|[|&;<>\s])/ is more appropriate.
// The original reference (its link is not preserved here)
// suggests that this definition is compatible with a
// default mode that tries to use a single token definition
// to recognize both bash/python style comments and C
// preprocessor directives.
// This definition of punctuation does not include # in the list of
// follow-on exclusions, so # will not be broken before if preceded
// by a punctuation character. We could try to exclude # after
// [|&;<>] but that doesn't seem to cause many major problems.
// If that does turn out to be a problem, we should change the below
// when hc is truthy to include # in the run of punctuation characters
// only when not following [|&;<>].
final String punctation = "^.[^\\s\\w\\.$@\\'\\\"\\`\\/\\\\]*";
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION,
return new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns);
/**
 * Maps language-specific file extensions to handlers. A value is either a
 * ready-built CreateSimpleLexer (stored by registerLangHandler) or a Lang
 * class (stored by register) that langHandlerForExtension instantiates when
 * the extension is first requested.
 */
protected Map<String, Object> langHandlerRegistry = new HashMap<String, Object>();
/** Register a language handler for the given file extensions.
* @param handler a function from source code to a list
* of decorations. Takes a single argument job which describes the
* state of the computation. The single parameter has the form
* {@code {
* sourceCode: {string} as plain text.
* decorations: {Array.<number|string>} an array of style classes
* preceded by the position at which they start in
* job.sourceCode in order.
* The language handler should assigned this field.
* basePos: {int} the position of source in the larger source chunk.
* All positions in the output decorations array are relative
* to the larger source chunk.
* } }
* @param fileExtensions
protected void registerLangHandler(CreateSimpleLexer handler, List<String> fileExtensions) throws Exception {
for (int i = fileExtensions.size(); --i >= 0;) {
String ext = fileExtensions.get(i);
if (langHandlerRegistry.get(ext) == null) {
langHandlerRegistry.put(ext, handler);
} else {
throw new Exception("cannot override language handler " + ext);
* Register language handler. The clazz will not be instantiated
* @param clazz the class of the language
* @throws Exception cannot instantiate the object using the class,
* or language handler with specified extension exist already
public void register(Class<? extends Lang> clazz) throws Exception {
if (clazz == null) {
throw new NullPointerException("argument 'clazz' cannot be null");
List<String> fileExtensions = getFileExtensionsFromClass(clazz);
for (int i = fileExtensions.size(); --i >= 0;) {
String ext = fileExtensions.get(i);
if (langHandlerRegistry.get(ext) == null) {
langHandlerRegistry.put(ext, clazz);
} else {
throw new Exception("cannot override language handler " + ext);
protected List<String> getFileExtensionsFromClass(Class<? extends Lang> clazz) throws Exception {
Method getExtensionsMethod = clazz.getMethod("getFileExtensions", (Class<?>[]) null);
return (List<String>) getExtensionsMethod.invoke(null, null);
* Get the parser for the extension specified.
* @param extension the file extension, if null, default parser will be returned
* @param source the source code
* @return the parser
public CreateSimpleLexer langHandlerForExtension(String extension, String source) {
if (!(extension != null && langHandlerRegistry.get(extension) != null)) {
// Treat it as markup if the first non whitespace character is a < and
// the last non-whitespace character is a >.
extension = Util.test(Pattern.compile("^\\s*<"), source)
? "default-markup"
: "default-code";
Object handler = langHandlerRegistry.get(extension);
if (handler instanceof CreateSimpleLexer) {
return (CreateSimpleLexer) handler;
} else {
CreateSimpleLexer _simpleLexer;
try {
Lang _lang = ((Class<Lang>) handler).newInstance();
_simpleLexer = new CreateSimpleLexer(_lang.getShortcutStylePatterns(), _lang.getFallthroughStylePatterns());
List<Lang> extendedLangs = _lang.getExtendedLangs();
for (Lang _extendedLang : extendedLangs) {
List<String> fileExtensions = getFileExtensionsFromClass((Class<Lang>) handler);
for (String _extension : fileExtensions) {
langHandlerRegistry.put(_extension, _simpleLexer);
} catch (Exception ex) {
LOG.log(Level.SEVERE, null, ex);
return null;
return _simpleLexer;