Add a utility for processing the output of HighlightJS
This change prepares for syntax highlighting to be done in one go instead
of line by line. This is required for upgrading to the latest version
of HighlightJS, which no longer supports line-by-line highlighting.
The change also prepares us for doing this post-processing in a worker.
See follow-up changes.
Change-Id: I4f20cf463600bdc1ab8510e781d0c14fd4e3f9aa
diff --git a/polygerrit-ui/app/utils/hljs-util.ts b/polygerrit-ui/app/utils/hljs-util.ts
new file mode 100644
index 0000000..1bd2072
--- /dev/null
+++ b/polygerrit-ui/app/utils/hljs-util.ts
@@ -0,0 +1,145 @@
+/**
+ * @license
+ * Copyright 2022 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Utilities related to working with the HighlightJS syntax highlighting lib.
+ *
+ * Note that this utility is mostly used by the hljs-worker, which is a Web
+ * Worker and can thus not depend on document, the DOM or any related
+ * functionality.
+ */
+
+/**
+ * With these expressions you can match exactly what HighlightJS produces. It
+ * is really that simple:
+ * https://github.com/highlightjs/highlight.js/blob/main/src/lib/html_renderer.js
+ */
+const openingSpan = new RegExp('<span class="(.*?)">'); // group 1: class value
+const closingSpan = new RegExp('</span>');
+
+/** Marker for `length` in SyntaxLayerRange while the span is not closed yet. */
+const UNCLOSED = -1;
+
+/** Range of characters in a line to be syntax highlighted. */
+export interface SyntaxLayerRange {
+ /** 0-based index of the first highlighted character in the line. */
+ start: number;
+ /** Number of characters. Can only be UNCLOSED during processing. */
+ length: number;
+ /** HighlightJS specific names, e.g. 'literal'. */
+ className: string;
+}
+
+/**
+ * Converts the HTML string produced by HighlightJS into highlight ranges,
+ * one array per line of code. Spans that are still open at the end of a line
+ * are closed there and re-opened at position 0 of the next line.
+ *
+ * Positions are measured in characters of the plain source text: each of the
+ * HTML entities that HighlightJS emits for & < > " ' counts as one character.
+ * Throws an Error if the spans in `highlightedCode` are not balanced.
+ */
+export function highlightedStringToRanges(
+  highlightedCode: string
+): SyntaxLayerRange[][] {
+  // Length of `html` with every escaped entity counted as one character.
+  const textLength = (html: string) =>
+    html.replace(/&(?:amp|lt|gt|quot|#x27);/g, '_').length;
+  const rangesPerLine: SyntaxLayerRange[][] = [];
+  // The unclosed ranges that are carried over from one line to the next.
+  let carryOverRanges: SyntaxLayerRange[] = [];
+
+  for (let line of highlightedCode.split('\n')) {
+    const ranges: SyntaxLayerRange[] = [...carryOverRanges];
+    carryOverRanges = [];
+    rangesPerLine.push(ranges);
+
+    // Strip the span tags left to right; each one opens or closes a range.
+    let removal: SpanRemoval | undefined;
+    while ((removal = removeFirstSpan(line)) !== undefined) {
+      // removal.offset indexes into the escaped `line`; convert to text chars.
+      const offset = textLength(line.slice(0, removal.offset));
+      if (removal.type === SpanType.OPENING) {
+        ranges.push({
+          start: offset,
+          length: UNCLOSED,
+          className: removal.class ?? '',
+        });
+      } else {
+        const unclosed = lastUnclosed(ranges);
+        unclosed.length = offset - unclosed.start;
+      }
+      line = removal.lineAfter;
+    }
+
+    // Unclosed spans end with this line and are carried over with start:0.
+    const lineLength = textLength(line);
+    for (const range of ranges) {
+      if (isUnclosed(range)) {
+        carryOverRanges.push({...range, start: 0});
+        range.length = lineLength - range.start;
+      }
+    }
+  }
+  if (carryOverRanges.length > 0) {
+    throw new Error('unclosed <span>s in highlighted code');
+  }
+  return rangesPerLine;
+}
+
+function isUnclosed(range: SyntaxLayerRange) {
+ return range.length === UNCLOSED; // no closing </span> processed yet
+}
+
+function lastUnclosed(ranges: SyntaxLayerRange[]) {
+ const unclosed = [...ranges].reverse().find(isUnclosed); // reverse a copy
+ if (!unclosed) throw new Error('no unclosed range found'); // nothing open
+ return unclosed;
+}
+
+/** Used for `type` in SpanRemoval. */
+export enum SpanType {
+ OPENING, // <span class="...">
+ CLOSING, // </span>
+}
+
+/** Return type for removeFirstSpan(). */
+export interface SpanRemoval {
+ type: SpanType; // which kind of tag was removed
+ /** The line string after removing the matched span tag. */
+ lineAfter: string;
+ /** The matched css class for OPENING spans. undefined for CLOSING. */
+ class?: string;
+ /** 0-based index in the passed-in line at which the removed tag started. */
+ offset: number;
+}
+
+/**
+ * Finds the first <span class="..."> or </span> in `line`, removes it and
+ * returns details about the removal. Returns `undefined`, if neither is found.
+ */
+export function removeFirstSpan(line: string): SpanRemoval | undefined {
+ const openingMatch = openingSpan.exec(line);
+ const openingIndex = openingMatch?.index ?? Number.MAX_VALUE; // MAX: no match
+ const closingMatch = closingSpan.exec(line);
+ const closingIndex = closingMatch?.index ?? Number.MAX_VALUE;
+ if (openingIndex === Number.MAX_VALUE && closingIndex === Number.MAX_VALUE) {
+ return undefined;
+ }
+ const type = // whichever tag comes first in the line
+ openingIndex < closingIndex ? SpanType.OPENING : SpanType.CLOSING;
+ const offset = type === SpanType.OPENING ? openingIndex : closingIndex;
+ const match = type === SpanType.OPENING ? openingMatch : closingMatch;
+ if (match === null) return undefined; // not expected here; narrows the type
+ const length = match[0].length; // length of the tag itself
+ const removal: SpanRemoval = {
+ type,
+ lineAfter: line.slice(0, offset) + line.slice(offset + length),
+ offset,
+ class: type === SpanType.OPENING ? match[1] : undefined, // capture group 1
+ };
+ return removal;
+}
diff --git a/polygerrit-ui/app/utils/hljs-util_test.ts b/polygerrit-ui/app/utils/hljs-util_test.ts
new file mode 100644
index 0000000..3c577ca
--- /dev/null
+++ b/polygerrit-ui/app/utils/hljs-util_test.ts
@@ -0,0 +1,162 @@
+/**
+ * @license
+ * Copyright 2022 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+import '../test/common-test-setup-karma';
+import './hljs-util';
+import {
+ highlightedStringToRanges,
+ removeFirstSpan,
+ SpanType,
+} from './hljs-util';
+
+suite('file hljs-util', () => {
+ suite('function removeFirstSpan()', () => {
+ test('no matches', async () => {
+ assert.isUndefined(removeFirstSpan(''));
+ assert.isUndefined(removeFirstSpan('span'));
+ assert.isUndefined(removeFirstSpan('<span>')); // no class attribute
+ assert.isUndefined(removeFirstSpan('</span')); // missing closing '>'
+ assert.isUndefined(removeFirstSpan('asdf'));
+ });
+
+ test('simple opening match', async () => {
+ const removal = removeFirstSpan('asdf<span class="c">asdf');
+ assert.deepEqual(removal, {
+ type: SpanType.OPENING,
+ lineAfter: 'asdfasdf',
+ class: 'c',
+ offset: 4, // index at which the removed tag started
+ });
+ });
+
+ test('simple closing match', async () => {
+ const removal = removeFirstSpan('asdf</span>asdf');
+ assert.deepEqual(removal, {
+ type: SpanType.CLOSING,
+ lineAfter: 'asdfasdf',
+ class: undefined, // no class for CLOSING
+ offset: 4,
+ });
+ });
+ });
+
+ suite('function highlightedStringToRanges()', () => {
+ test('no ranges', async () => {
+ assert.deepEqual(highlightedStringToRanges(''), [[]]);
+ assert.deepEqual(highlightedStringToRanges('\n'), [[], []]); // one per line
+ assert.deepEqual(highlightedStringToRanges('asdf\nasdf\nasdf'), [
+ [],
+ [],
+ [],
+ ]);
+ });
+
+ test('one line, one span', async () => {
+ assert.deepEqual(
+ highlightedStringToRanges('asdf<span class="c">qwer</span>asdf'),
+ [[{start: 4, length: 4, className: 'c'}]]
+ );
+ assert.deepEqual(
+ highlightedStringToRanges('<span class="d">asdfqwer</span>'),
+ [[{start: 0, length: 8, className: 'd'}]]
+ );
+ });
+
+ test('one line, two spans one after another', async () => {
+ assert.deepEqual(
+ highlightedStringToRanges(
+ 'asdf<span class="c">qwer</span>zxcv<span class="d">qwer</span>asdf'
+ ),
+ [
+ [
+ {start: 4, length: 4, className: 'c'},
+ {start: 12, length: 4, className: 'd'},
+ ],
+ ]
+ );
+ });
+
+ test('one line, two nested spans', async () => {
+ assert.deepEqual(
+ highlightedStringToRanges(
+ 'asdf<span class="c">qwer<span class="d">zxcv</span>qwer</span>asdf'
+ ),
+ [
+ [
+ {start: 4, length: 12, className: 'c'}, // outer span is listed first
+ {start: 8, length: 4, className: 'd'},
+ ],
+ ]
+ );
+ });
+
+ test('two lines, one span each', async () => {
+ assert.deepEqual(
+ highlightedStringToRanges(
+ 'asdf<span class="c">qwer</span>asdf\n' +
+ 'asd<span class="d">qwe</span>asd'
+ ),
+ [
+ [{start: 4, length: 4, className: 'c'}],
+ [{start: 3, length: 3, className: 'd'}],
+ ]
+ );
+ });
+
+ test('one span over two lines', async () => {
+ assert.deepEqual(
+ highlightedStringToRanges(
+ 'asdf<span class="c">qwer\n' + 'asdf</span>qwer'
+ ),
+ [
+ [{start: 4, length: 4, className: 'c'}], // closed at end of line
+ [{start: 0, length: 4, className: 'c'}], // carried over from line 1
+ ]
+ );
+ });
+
+ test('two spans over two lines', async () => {
+ assert.deepEqual(
+ highlightedStringToRanges(
+ 'asdf<span class="c">qwer<span class="d">zxcv\n' +
+ 'asdf</span>qwer</span>zxcv'
+ ),
+ [
+ [
+ {start: 4, length: 8, className: 'c'},
+ {start: 8, length: 4, className: 'd'},
+ ],
+ [
+ {start: 0, length: 8, className: 'c'}, // carried over
+ {start: 0, length: 4, className: 'd'}, // carried over
+ ],
+ ]
+ );
+ });
+
+ test('two spans over four lines', async () => {
+ assert.deepEqual(
+ highlightedStringToRanges(
+ 'asdf<span class="c">qwer\n' +
+ 'asdf<span class="d">qwer\n' +
+ 'asdf</span>qwer\n' +
+ 'asdf</span>qwer'
+ ),
+ [
+ [{start: 4, length: 4, className: 'c'}],
+ [
+ {start: 0, length: 8, className: 'c'}, // carried over, spans whole line
+ {start: 4, length: 4, className: 'd'},
+ ],
+ [
+ {start: 0, length: 8, className: 'c'},
+ {start: 0, length: 4, className: 'd'},
+ ],
+ [{start: 0, length: 4, className: 'c'}],
+ ]
+ );
+ });
+ });
+});