Fix token highlighting after surrogate pairs

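Regex match indices and String.length count UTF-16 code units, while
token annotations are positioned by code points. After a surrogate pair
(e.g. an emoji) the two diverge, so tokens later on the line were
annotated at the wrong offset. Convert the match index and token length
with GrAnnotation.getStringLength() before annotating and before
reporting the highlighted range.

This is the same bug that
https://gerrit-review.git.corp.google.com/c/gerrit/+/352754
fixed for intraline diff highlights.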

Release-Notes: Fix token highlighting after surrogate pairs
Change-Id: Ia1f80d5bf2c65d4c3a35d6031ccb49f7a619a8ba
diff --git a/polygerrit-ui/app/embed/diff/gr-diff-builder/token-highlight-layer.ts b/polygerrit-ui/app/embed/diff/gr-diff-builder/token-highlight-layer.ts
index 1e5dd65..e9076aa 100644
--- a/polygerrit-ui/app/embed/diff/gr-diff-builder/token-highlight-layer.ts
+++ b/polygerrit-ui/app/embed/diff/gr-diff-builder/token-highlight-layer.ts
@@ -139,11 +139,17 @@
     let atLeastOneTokenMatched = false;
     while ((match = tokenMatcher.exec(text))) {
       const token = match[0];
-      const index = match.index;
-      const length = token.length;
+
       // Binary files encoded as text for example can have super long lines
       // with super long tokens. Let's guard against this scenario.
-      if (length > TOKEN_LENGTH_LIMIT) continue;
+      if (token.length > TOKEN_LENGTH_LIMIT) continue;
+
+      // Convert index and length from UTF-16 code units to code points, so
+      // that surrogate pairs count once. If this becomes a hotspot, precompute
+      // a code-unit-to-code-point index map for text before the loop.
+      const index = GrAnnotation.getStringLength(text.slice(0, match.index));
+      const length = GrAnnotation.getStringLength(token);
+
       atLeastOneTokenMatched = true;
       const highlightTypeClass =
         token === this.currentHighlight ? CSS_HIGHLIGHT : '';
@@ -339,7 +345,7 @@
       start_line: line,
       start_column: index + 1, // 1-based inclusive
       end_line: line,
-      end_column: index + token.length, // 1-based inclusive
+      end_column: index + GrAnnotation.getStringLength(token), // 1-based inclusive
     };
     this.tokenHighlightListener({token, element, side, range});
   }
diff --git a/polygerrit-ui/app/embed/diff/gr-diff-builder/token-highlight-layer_test.ts b/polygerrit-ui/app/embed/diff/gr-diff-builder/token-highlight-layer_test.ts
index 75c7908..8fd03bb 100644
--- a/polygerrit-ui/app/embed/diff/gr-diff-builder/token-highlight-layer_test.ts
+++ b/polygerrit-ui/app/embed/diff/gr-diff-builder/token-highlight-layer_test.ts
@@ -143,6 +143,33 @@
       });
     });
 
+    test('annotate adds css tokens w/ emojis', () => {
+      const annotateElementStub = sinon.stub(GrAnnotation, 'annotateElement');
+      const el = createLine('these 💩 are 👨‍👩‍👧‍👦 words');
+
+      annotate(el);
+
+      assert.isTrue(annotateElementStub.calledThrice);
+      assertAnnotation(annotateElementStub.args[0], {
+        parent: el,
+        offset: 0,
+        length: 5,
+        cssClass: 'tk-text-these tk-index-0 token ',
+      });
+      assertAnnotation(annotateElementStub.args[1], {
+        parent: el,
+        offset: 8,
+        length: 3,
+        cssClass: 'tk-text-are tk-index-8 token ',
+      });
+      assertAnnotation(annotateElementStub.args[2], {
+        parent: el,
+        offset: 20,
+        length: 5,
+        cssClass: 'tk-text-words tk-index-20 token ',
+      });
+    });
+
     test('annotate adds mouse handlers', () => {
       const el = createLine('these are words');
       const addEventListenerStub = sinon.stub(el, 'addEventListener');