| /* |
| * Copyright (C) 2022, Matthias Sohn <matthias.sohn@sap.com> and others |
| * |
| * This program and the accompanying materials are made available under the |
| * terms of the Eclipse Distribution License v. 1.0 which is available at |
| * https://www.eclipse.org/org/documents/edl-v10.php. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause |
| */ |
| package org.eclipse.jgit.benchmarks; |
| |
| import org.openjdk.jmh.annotations.Benchmark; |
| import org.openjdk.jmh.annotations.BenchmarkMode; |
| import org.openjdk.jmh.annotations.Fork; |
| import org.openjdk.jmh.annotations.Measurement; |
| import org.openjdk.jmh.annotations.Mode; |
| import org.openjdk.jmh.annotations.OutputTimeUnit; |
| import org.openjdk.jmh.annotations.Param; |
| import org.openjdk.jmh.annotations.Scope; |
| import org.openjdk.jmh.annotations.Setup; |
| import org.openjdk.jmh.annotations.State; |
| import org.openjdk.jmh.annotations.TearDown; |
| import org.openjdk.jmh.annotations.Warmup; |
| import org.openjdk.jmh.infra.Blackhole; |
| import org.openjdk.jmh.runner.Runner; |
| import org.openjdk.jmh.runner.RunnerException; |
| import org.openjdk.jmh.runner.options.Options; |
| import org.openjdk.jmh.runner.options.OptionsBuilder; |
| |
| import java.util.concurrent.TimeUnit; |
| |
| import static org.eclipse.jgit.diff.RawText.getBufferSize; |
| import static org.eclipse.jgit.diff.RawText.isBinary; |
| import static org.eclipse.jgit.diff.RawText.isCrLfText; |
| |
| @State(Scope.Thread) |
| public class RawTextBenchmark { |
| |
| @State(Scope.Benchmark) |
| public static class BenchmarkState { |
| |
| @Param({"1", "2", "3", "4", "5", "6"}) |
| int testIndex; |
| |
| @Param({"false", "true"}) |
| boolean complete; |
| |
| byte[] bytes; |
| |
| @Setup |
| public void setupBenchmark() { |
| switch (testIndex) { |
| case 1: { |
| byte[] tmpBytes = "a".repeat(102400).getBytes(); |
| bytes = tmpBytes; |
| break; |
| } |
| case 2: { |
| byte[] tmpBytes = "a".repeat(102400).getBytes(); |
| byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; |
| System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); |
| tmpBytes2[500] = '\0'; |
| tmpBytes2[tmpBytes.length] = '\0'; |
| bytes = tmpBytes2; |
| break; |
| } |
| case 3: { |
| byte[] tmpBytes = "a".repeat(102400).getBytes(); |
| byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; |
| System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); |
| tmpBytes2[500] = '\r'; |
| tmpBytes2[tmpBytes.length] = '\r'; |
| bytes = tmpBytes2; |
| break; |
| } |
| case 4: { |
| byte[] tmpBytes = "a".repeat(102400).getBytes(); |
| byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; |
| System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); |
| tmpBytes2[499] = '\r'; |
| tmpBytes2[500] = '\n'; |
| tmpBytes2[tmpBytes.length - 1] = '\r'; |
| tmpBytes2[tmpBytes.length] = '\n'; |
| bytes = tmpBytes2; |
| break; |
| } |
| case 5: { |
| byte[] tmpBytes = "a".repeat(102400).getBytes(); |
| tmpBytes[0] = '\0'; |
| bytes = tmpBytes; |
| break; |
| } |
| case 6: { |
| byte[] tmpBytes = "a".repeat(102400).getBytes(); |
| tmpBytes[0] = '\r'; |
| bytes = tmpBytes; |
| break; |
| } |
| default: |
| } |
| } |
| |
| @TearDown |
| public void teardown() { |
| } |
| } |
| |
| @Benchmark |
| @BenchmarkMode({Mode.AverageTime}) |
| @OutputTimeUnit(TimeUnit.NANOSECONDS) |
| @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Fork(1) |
| public void testIsCrLfTextOld(Blackhole blackhole, BenchmarkState state) { |
| blackhole.consume( |
| isCrLfTextOld( |
| state.bytes, |
| state.bytes.length, |
| state.complete |
| ) |
| ); |
| } |
| |
| @Benchmark |
| @BenchmarkMode({Mode.AverageTime}) |
| @OutputTimeUnit(TimeUnit.NANOSECONDS) |
| @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Fork(1) |
| public void testIsCrLfTextNewCandidate1(Blackhole blackhole, BenchmarkState state) { |
| blackhole.consume( |
| isCrLfTextNewCandidate1( |
| state.bytes, |
| state.bytes.length, |
| state.complete |
| ) |
| ); |
| } |
| |
| @Benchmark |
| @BenchmarkMode({Mode.AverageTime}) |
| @OutputTimeUnit(TimeUnit.NANOSECONDS) |
| @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Fork(1) |
| public void testIsCrLfTextNewCandidate2(Blackhole blackhole, BenchmarkState state) { |
| blackhole.consume( |
| isCrLfTextNewCandidate2( |
| state.bytes, |
| state.bytes.length, |
| state.complete |
| ) |
| ); |
| } |
| |
| @Benchmark |
| @BenchmarkMode({Mode.AverageTime}) |
| @OutputTimeUnit(TimeUnit.NANOSECONDS) |
| @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Fork(1) |
| public void testIsCrLfTextNewCandidate3(Blackhole blackhole, BenchmarkState state) { |
| blackhole.consume( |
| isCrLfTextNewCandidate3( |
| state.bytes, |
| state.bytes.length, |
| state.complete |
| ) |
| ); |
| } |
| |
| @Benchmark |
| @BenchmarkMode({Mode.AverageTime}) |
| @OutputTimeUnit(TimeUnit.NANOSECONDS) |
| @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Fork(1) |
| public void testIsCrLfTextNew(Blackhole blackhole, BenchmarkState state) { |
| blackhole.consume( |
| isCrLfText( |
| state.bytes, |
| state.bytes.length, |
| state.complete |
| ) |
| ); |
| } |
| |
| @Benchmark |
| @BenchmarkMode({Mode.AverageTime}) |
| @OutputTimeUnit(TimeUnit.NANOSECONDS) |
| @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Fork(1) |
| public void testIsBinaryOld(Blackhole blackhole, BenchmarkState state) { |
| blackhole.consume( |
| isBinaryOld( |
| state.bytes, |
| state.bytes.length, |
| state.complete |
| ) |
| ); |
| } |
| |
| |
| @Benchmark |
| @BenchmarkMode({Mode.AverageTime}) |
| @OutputTimeUnit(TimeUnit.NANOSECONDS) |
| @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) |
| @Fork(1) |
| public void testIsBinaryNew(Blackhole blackhole, BenchmarkState state) { |
| blackhole.consume( |
| isBinary( |
| state.bytes, |
| state.bytes.length, |
| state.complete |
| ) |
| ); |
| } |
| |
| |
| /** |
| * Determine heuristically whether a byte array represents binary (as |
| * opposed to text) content. |
| * |
| * @param raw |
| * the raw file content. |
| * @param length |
| * number of bytes in {@code raw} to evaluate. This should be |
| * {@code raw.length} unless {@code raw} was over-allocated by |
| * the caller. |
| * @param complete |
| * whether {@code raw} contains the whole data |
| * @return true if raw is likely to be a binary file, false otherwise |
| * @since 6.0 |
| */ |
| public static boolean isBinaryOld(byte[] raw, int length, boolean complete) { |
| // Similar heuristic as C Git. Differences: |
| // - limited buffer size; may be only the beginning of a large blob |
| // - no counting of printable vs. non-printable bytes < 0x20 and 0x7F |
| int maxLength = getBufferSize(); |
| boolean isComplete = complete; |
| if (length > maxLength) { |
| // We restrict the length in all cases to getBufferSize() to get |
| // predictable behavior. Sometimes we load streams, and sometimes we |
| // have the full data in memory. With streams, we never look at more |
| // than the first getBufferSize() bytes. If we looked at more when |
| // we have the full data, different code paths in JGit might come to |
| // different conclusions. |
| length = maxLength; |
| isComplete = false; |
| } |
| byte last = 'x'; // Just something inconspicuous. |
| for (int ptr = 0; ptr < length; ptr++) { |
| byte curr = raw[ptr]; |
| if (isBinary(curr, last)) { |
| return true; |
| } |
| last = curr; |
| } |
| if (isComplete) { |
| // Buffer contains everything... |
| return last == '\r'; // ... so this must be a lone CR |
| } |
| return false; |
| } |
| |
| /** |
| * Determine heuristically whether a byte array represents text content |
| * using CR-LF as line separator. |
| * |
| * @param raw the raw file content. |
| * @param length number of bytes in {@code raw} to evaluate. |
| * @param complete whether {@code raw} contains the whole data |
| * @return {@code true} if raw is likely to be CR-LF delimited text, |
| * {@code false} otherwise |
| * @since 6.0 |
| */ |
| public static boolean isCrLfTextOld(byte[] raw, int length, boolean complete) { |
| boolean has_crlf = false; |
| byte last = 'x'; // Just something inconspicuous |
| for (int ptr = 0; ptr < length; ptr++) { |
| byte curr = raw[ptr]; |
| if (isBinary(curr, last)) { |
| return false; |
| } |
| if (curr == '\n' && last == '\r') { |
| has_crlf = true; |
| } |
| last = curr; |
| } |
| if (last == '\r') { |
| if (complete) { |
| // Lone CR: it's binary after all. |
| return false; |
| } |
| // Tough call. If the next byte, which we don't have, would be a |
| // '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide |
| // based on what we already scanned; it wasn't binary until now. |
| } |
| return has_crlf; |
| } |
| |
| /** |
| * Determine heuristically whether a byte array represents text content |
| * using CR-LF as line separator. |
| * |
| * @param raw |
| * the raw file content. |
| * @param length |
| * number of bytes in {@code raw} to evaluate. |
| * @return {@code true} if raw is likely to be CR-LF delimited text, |
| * {@code false} otherwise |
| * @param complete |
| * whether {@code raw} contains the whole data |
| * @since 6.0 |
| */ |
| public static boolean isCrLfTextNewCandidate1(byte[] raw, int length, boolean complete) { |
| boolean has_crlf = false; |
| |
| // first detect empty |
| if (length <= 0) { |
| return false; |
| } |
| |
| // next detect '\0' |
| for (int reversePtr = length - 1; reversePtr >= 0; --reversePtr) { |
| if (raw[reversePtr] == '\0') { |
| return false; |
| } |
| } |
| |
| // if '\r' be last, then if complete then return non-crlf |
| if (raw[length - 1] == '\r' && complete) { |
| return false; |
| } |
| |
| for (int ptr = 0; ptr < length - 1; ptr++) { |
| byte curr = raw[ptr]; |
| if (curr == '\r') { |
| byte next = raw[ptr + 1]; |
| if (next != '\n') { |
| return false; |
| } |
| // else |
| // we have crlf here |
| has_crlf = true; |
| // as next is '\n', it can never be '\r', just skip it from next check |
| ++ptr; |
| } |
| } |
| |
| return has_crlf; |
| } |
| |
| /** |
| * Determine heuristically whether a byte array represents text content |
| * using CR-LF as line separator. |
| * |
| * @param raw |
| * the raw file content. |
| * @param length |
| * number of bytes in {@code raw} to evaluate. |
| * @return {@code true} if raw is likely to be CR-LF delimited text, |
| * {@code false} otherwise |
| * @param complete |
| * whether {@code raw} contains the whole data |
| * @since 6.0 |
| */ |
| public static boolean isCrLfTextNewCandidate2(byte[] raw, int length, boolean complete) { |
| boolean has_crlf = false; |
| |
| // first detect empty |
| if (length <= 0) { |
| return false; |
| } |
| |
| // if '\r' be last, then if complete then return non-crlf |
| byte last = raw[length - 1]; |
| if (last == '\0' || last == '\r' && complete) { |
| return false; |
| } |
| |
| for (int ptr = 0; ptr < length - 1; ptr++) { |
| byte b = raw[ptr]; |
| switch (b) { |
| case '\0': |
| return false; |
| case '\r': { |
| ++ptr; |
| b = raw[ptr]; |
| if (b != '\n') { |
| return false; |
| } |
| // else |
| // we have crlf here |
| has_crlf = true; |
| // as next is '\n', it can never be '\r', just skip it from next check |
| break; |
| } |
| default: |
| // do nothing; |
| break; |
| } |
| } |
| |
| return has_crlf; |
| } |
| |
| /** |
| * Determine heuristically whether a byte array represents text content |
| * using CR-LF as line separator. |
| * |
| * @param raw |
| * the raw file content. |
| * @param length |
| * number of bytes in {@code raw} to evaluate. |
| * @return {@code true} if raw is likely to be CR-LF delimited text, |
| * {@code false} otherwise |
| * @param complete |
| * whether {@code raw} contains the whole data |
| * @since 6.0 |
| */ |
| public static boolean isCrLfTextNewCandidate3(byte[] raw, int length, boolean complete) { |
| boolean has_crlf = false; |
| |
| int ptr = -1; |
| byte current; |
| while (ptr < length - 2) { |
| current = raw[++ptr]; |
| if ('\0' == current || '\r' == current && (raw[++ptr] != '\n' || !(has_crlf = true))) { |
| return false; |
| } |
| } |
| |
| if (ptr == length - 2) { |
| // if '\r' be last, then if isComplete then return binary |
| current = raw[++ptr]; |
| if('\0' == current || '\r' == current && complete){ |
| return false; |
| } |
| } |
| |
| return has_crlf; |
| } |
| |
| |
| public static void main(String[] args) throws RunnerException { |
| Options opt = new OptionsBuilder() |
| .include(RawTextBenchmark.class.getSimpleName()) |
| .forks(1).jvmArgs("-ea").build(); |
| new Runner(opt).run(); |
| } |
| } |