Create an encoder for NoSQL secondary index keys

In a NoSQL system we usually don't have a secondary index available
for columns and instead must fake that ourselves by creating a new
row that is a composite of the various columns.  To ensure sorting
occurs properly in the NoSQL key range space, we need to encode the
column values to build the composite key.

Change-Id: Ieefe9757f0005bb55e30c3dc0786769bf2d402fb
Signed-off-by: Shawn O. Pearce <sop@google.com>
diff --git a/src/main/java/com/google/gwtorm/nosql/IndexKeyBuilder.java b/src/main/java/com/google/gwtorm/nosql/IndexKeyBuilder.java
new file mode 100644
index 0000000..a670840
--- /dev/null
+++ b/src/main/java/com/google/gwtorm/nosql/IndexKeyBuilder.java
@@ -0,0 +1,166 @@
+// Copyright 2010 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.gwtorm.nosql;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Encoder support for {@link IndexFunction} computed strings.
+ * <p>
+ * This class provides a string that may contain multiple values, using
+ * delimiters between fields and big-endian encoded numerics. Sorting the
+ * resulting strings using unsigned byte orderings produces a stable sorting.
+ * <p>
+ * The encoding used by this class relies on having 258 tokens. To get the extra
+ * 2 tokens within a 256 byte range, escapes are used according to the following
+ * simple table:
+ * <ul>
+ * <li>delimiter = \x00\x01
+ * <li>byte \x00 = \x00\xff
+ * <li>byte \xff = \xff\x00
+ * <li>infinity = \xff\xff
+ * </ul>
+ * <p>
+ * Integers are encoded as variable length big-endian values, skipping leading
+ * zero bytes, prefixed by the number of bytes used to encode them. Therefore 0
+ * is encoded as "\x00", and 256 is encoded as "\x02\x01\x00". Negative values
+ * are encoded in their twos complement encoding and therefore sort after the
+ * maximum positive value.
+ * <p>
+ * Strings and byte arrays supplied by the caller have their \x00 and \xff
+ * values escaped according to the table above, but are otherwise written as-is
+ * without a length prefix.
+ * <p>
+ * Callers are responsible for inserting {@link #delimiter()} markers at the
+ * appropriate positions in the sequence.
+ */
+public class IndexKeyBuilder {
+  private final ByteArrayOutputStream buf = new ByteArrayOutputStream();
+
+  /**
+   * Add a delimiter marker to the string.
+   */
+  public void delimiter() {
+    buf.write(0x00);
+    buf.write(0x01);
+  }
+
+  /**
+   * Add the special infinity symbol to the string.
+   * <p>
+   * The infinity symbol sorts after all other values in the same position.
+   */
+  public void infinity() {
+    buf.write(0xff);
+    buf.write(0xff);
+  }
+
+  /**
+   * Add a raw sequence of bytes.
+   * <p>
+   * The bytes 0x00 and 0xff are escaped by this method according to the
+   * encoding table described in the class documentation.
+   *
+   * @param bin array to copy from.
+   * @param pos first index to copy.
+   * @param cnt number of bytes to copy.
+   */
+  public void add(byte[] bin, int pos, int cnt) {
+    while (0 < cnt--) {
+      byte b = bin[pos++];
+      if (b == 0x00) {
+        buf.write(0x00);
+        buf.write(0xff);
+
+      } else if (b == -1) {
+        buf.write(0xff);
+        buf.write(0x00);
+
+      } else {
+        buf.write(b);
+      }
+    }
+  }
+
+  /**
+   * Add a raw sequence of bytes.
+   * <p>
+   * The bytes 0x00 and 0xff are escaped by this method according to the
+   * encoding table described in the class documentation.
+   *
+   * @param bin the complete array to copy.
+   */
+  public void add(byte[] bin) {
+    add(bin, 0, bin.length);
+  }
+
+  /**
+   * Encode a string into UTF-8 and append as a sequence of bytes.
+   *
+   * @param str the string to encode and append.
+   */
+  public void add(String str) {
+    try {
+      add(str.getBytes("UTF-8"));
+    } catch (UnsupportedEncodingException e) {
+      throw new RuntimeException("JVM does not support UTF-8", e);
+    }
+  }
+
+  /**
+   * Add a single character as though it were part of a UTF-8 string.
+   *
+   * @param ch the character to encode and append.
+   */
+  public void add(char ch) {
+    if (ch == 0x00) {
+      buf.write(0x00);
+      buf.write(0xff);
+
+    } else if (ch <= 255) {
+      buf.write(ch);
+
+    } else {
+      add(Character.toString(ch));
+    }
+  }
+
+  /**
+   * Add an integer value as a big-endian variable length integer.
+   *
+   * @param val the value to add.
+   */
+  public void add(long val) {
+    final byte[] t = new byte[9];
+    int i = t.length;
+    while (val != 0) {
+      t[--i] = (byte) (val & 0xff);
+      val >>>= 8;
+    }
+    t[i - 1] = (byte) (t.length - i);
+    buf.write(t, i - 1, t.length - i + 1);
+  }
+
+  /**
+   * Obtain a copy of the internal storage array.
+   *
+   * @return the current state of this, converted into a flat byte array.
+   */
+  public byte[] toByteArray() {
+    return buf.toByteArray();
+  }
+}
diff --git a/src/test/java/com/google/gwtorm/nosql/IndexKeyBuilderTest.java b/src/test/java/com/google/gwtorm/nosql/IndexKeyBuilderTest.java
new file mode 100644
index 0000000..e0f8028
--- /dev/null
+++ b/src/test/java/com/google/gwtorm/nosql/IndexKeyBuilderTest.java
@@ -0,0 +1,82 @@
+// Copyright 2010 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.gwtorm.nosql;
+
+import junit.framework.TestCase;
+
+public class IndexKeyBuilderTest extends TestCase {
+  public void testInt() {
+    IndexKeyBuilder ib;
+
+    ib = new IndexKeyBuilder();
+    ib.add(0);
+    assertEquals(new byte[] {0x00}, ib);
+
+    ib = new IndexKeyBuilder();
+    ib.add(1);
+    assertEquals(new byte[] {0x01, 0x01}, ib);
+
+    ib = new IndexKeyBuilder();
+    ib.add(256);
+    assertEquals(new byte[] {0x02, 0x01, 0x00}, ib);
+  }
+
+  public void testDelimiter() {
+    IndexKeyBuilder ib = new IndexKeyBuilder();
+    ib.delimiter();
+    assertEquals(new byte[] {0x00, 0x01}, ib);
+  }
+
+  public void testStringASCII() {
+    IndexKeyBuilder ib = new IndexKeyBuilder();
+    ib.add("hi");
+    assertEquals(new byte[] {'h', 'i'}, ib);
+  }
+
+  public void testStringNUL() {
+    IndexKeyBuilder ib = new IndexKeyBuilder();
+    ib.add("\0");
+    assertEquals(new byte[] {0x00, (byte) 0xff}, ib);
+  }
+
+  public void testStringFF() {
+    IndexKeyBuilder ib = new IndexKeyBuilder();
+    ib.add(new byte[] {(byte) 0xff});
+    assertEquals(new byte[] {(byte) 0xff, 0x00}, ib);
+  }
+
+  public void testInfinity() {
+    IndexKeyBuilder ib = new IndexKeyBuilder();
+    ib.infinity();
+    assertEquals(new byte[] {(byte) 0xff, (byte) 0xff}, ib);
+  }
+
+  private static void assertEquals(byte[] exp, IndexKeyBuilder ic) {
+    assertEquals(toString(exp), toString(ic.toByteArray()));
+  }
+
+  private static String toString(byte[] bin) {
+    StringBuilder dst = new StringBuilder(bin.length * 2);
+    for (byte b : bin) {
+      dst.append(hexchar[(b >>> 4) & 0x0f]);
+      dst.append(hexchar[b & 0x0f]);
+    }
+    return dst.toString();
+  }
+
+  private static final char[] hexchar =
+      {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', //
+          'a', 'b', 'c', 'd', 'e', 'f'};
+}